diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 391fb30d95e2a..2e6568225b908 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3944,6 +3944,56 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     }
   }
 
+  /// Handle intrinsics by applying the intrinsic to the shadows.
+  ///
+  /// The trailing arguments are passed verbatim to the intrinsic, though any
+  /// uninitialized trailing arguments can also taint the shadow, e.g., for an
+  /// intrinsic with one trailing verbatim argument:
+  ///     out = intrinsic(var1, var2, opType)
+  /// we compute:
+  ///     shadow[out] =
+  ///         intrinsic(shadow[var1], shadow[var2], opType) | shadow[opType]
+  ///
+  /// For example, this can be applied to the Arm NEON vector table intrinsics
+  /// (tbl{1,2,3,4}).
+  ///
+  /// The origin is approximated using setOriginForNaryOp.
+  void handleIntrinsicByApplyingToShadow(IntrinsicInst &I,
+                                         unsigned int trailingVerbatimArgs) {
+    IRBuilder<> IRB(&I);
+
+    assert(trailingVerbatimArgs < I.arg_size());
+
+    SmallVector<Value *, 8> ShadowArgs;
+    // Don't use getNumOperands() because it includes the callee.
+    for (unsigned int i = 0; i < I.arg_size() - trailingVerbatimArgs; i++) {
+      Value *Shadow = getShadow(&I, i);
+      ShadowArgs.push_back(Shadow);
+    }
+
+    for (unsigned int i = I.arg_size() - trailingVerbatimArgs; i < I.arg_size();
+         i++) {
+      Value *Arg = I.getArgOperand(i);
+      ShadowArgs.push_back(Arg);
+    }
+
+    CallInst *CI =
+        IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), ShadowArgs);
+    Value *CombinedShadow = CI;
+
+    // Combine the computed shadow with the shadows of the trailing args.
+    for (unsigned int i = I.arg_size() - trailingVerbatimArgs; i < I.arg_size();
+         i++) {
+      Value *Shadow =
+          CreateShadowCast(IRB, getShadow(&I, i), CombinedShadow->getType());
+      CombinedShadow = IRB.CreateOr(Shadow, CombinedShadow, "_msprop");
+    }
+
+    setShadow(&I, CombinedShadow);
+
+    setOriginForNaryOp(I);
+  }
+
   void visitIntrinsicInst(IntrinsicInst &I) {
     switch (I.getIntrinsicID()) {
     case Intrinsic::uadd_with_overflow:
@@ -4319,6 +4369,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
    }
 
+    // Arm NEON vector table intrinsics have the source/table register(s) as
+    // arguments, followed by the index register. They return the output.
+    //
+    // 'TBL writes a zero if an index is out-of-range, while TBX leaves the
+    // original value unchanged in the destination register.'
+    // Conveniently, zero denotes a clean shadow, which means out-of-range
+    // indices for TBL will initialize the user data with zero and also clean
+    // the shadow. (For TBX, neither the user data nor the shadow will be
+    // updated, which is also correct.)
+    case Intrinsic::aarch64_neon_tbl1:
+    case Intrinsic::aarch64_neon_tbl2:
+    case Intrinsic::aarch64_neon_tbl3:
+    case Intrinsic::aarch64_neon_tbl4:
+    case Intrinsic::aarch64_neon_tbx1:
+    case Intrinsic::aarch64_neon_tbx2:
+    case Intrinsic::aarch64_neon_tbx3:
+    case Intrinsic::aarch64_neon_tbx4: {
+      // The last trailing argument (index register) should be handled verbatim
+      handleIntrinsicByApplyingToShadow(I, 1);
+      break;
+    }
+
     default:
       if (!handleUnknownIntrinsic(I))
         visitInstruction(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
index 2505a3e95fe02..1453e64abd5a2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
@@ -16,18 +16,10 @@ define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind sanitize_memory {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[TMP1]], <8 x i8> [[B]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
-; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x i8> [[OUT]]
 ;
   %out = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
@@ -40,7 +32,8 @@ define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind sanitize_memory
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[B]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
 ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i8> [[OUT]]
@@ -56,21 +49,10 @@ define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) sanitize_memor
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT:
[[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[OUT]] ; %out = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) @@ -84,10 +66,10 @@ define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) sanitize_me ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) -; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[OUT]] ; %out = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) @@ -102,24 +84,10 @@ define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; 
CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[D]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[OUT]] ; %out = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) @@ -134,11 +102,10 @@ define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> % ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[D]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]]) -; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[OUT]] ; %out = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) @@ -154,27 +121,10 @@ define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to i64 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: 
[[TMP6:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[E]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[OUT]] ; %out = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) @@ -190,12 +140,10 @@ define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> % ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[E]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]]) -; CHECK-NEXT: store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[OUT]] ; %out = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) @@ -212,30 +160,15 @@ define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[T1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> ) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp 
ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i8> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[T2:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i8> [[_MSPROP]], <8 x i8> [[_MSPROP1]], <8 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <8 x i8> [[T1]], <8 x i8> [[T2]], <8 x i32> -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[S]] ; %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> ) @@ -254,15 +187,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <16 x i8> [[_MSPROP]], <16 x i8> [[_MSPROP1]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) @@ -313,15 +246,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x ; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14 ; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15 ; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15 -; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], [[_MSPROP15]] +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> 
@llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> [[_MSPROP15]], [[TMP6]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> +; CHECK-NEXT: [[_MSPROP18:%.*]] = shufflevector <16 x i8> [[_MSPROP16]], <16 x i8> [[_MSPROP17]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 @@ -376,15 +309,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x ; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14 ; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 15 ; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15 -; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or <16 x i8> [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i8> [[_MSPROP3]], [[TMP6]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) -; CHECK-NEXT: [[_MSPROP6:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = or <16 x i8> [[_MSPROP6]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <16 x i8> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP8:%.*]] = shufflevector <16 x i8> [[_MSPROP5]], <16 x i8> [[_MSPROP7]], <16 x i32> +; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <16 x i8> [[_MSPROP4]], <16 x i8> [[_MSPROP5]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0 @@ -452,15 +385,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x ; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14 ; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15 ; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15 -; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> 
[[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP15]], [[TMP7]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) -; CHECK-NEXT: [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> +; CHECK-NEXT: [[_MSPROP18:%.*]] = shufflevector <16 x i8> [[_MSPROP16]], <16 x i8> [[_MSPROP17]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 @@ -528,15 +461,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 ; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 [[V]], i32 14 ; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 [[TMP1]], i32 15 ; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15 -; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP15]], [[TMP7]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) -; CHECK-NEXT: [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> +; CHECK-NEXT: [[_MSPROP18:%.*]] = shufflevector <16 x i8> [[_MSPROP16]], <16 x i8> [[_MSPROP17]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 @@ -571,15 +504,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <16 x i8> [[_MSPROP]], <16 x i8> [[_MSPROP1]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) @@ -598,15 +531,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <16 x i8> [[_MSPROP]], <16 x i8> [[_MSPROP1]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %t1 = call <16 x i8> 
@llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) @@ -625,15 +558,15 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) -; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <16 x i8> [[_MSPROP]], <16 x i8> [[_MSPROP1]], <16 x i32> ; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> -; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[S]] ; %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) @@ -658,21 +591,10 @@ define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind saniti ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[OUT]] ; %out = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) @@ -686,10 +608,10 @@ define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind sa ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) -; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[OUT]] ; %out = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) @@ -704,24 +626,10 @@ define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) s ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[D]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[OUT]] ; %out = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) @@ -736,11 +644,10 @@ define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> % ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[D]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]]) -; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[OUT]] ; %out = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) @@ -756,27 +663,10 @@ define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to i64 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[E]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[OUT]] ; %out = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) @@ -792,12 +682,10 @@ define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> % ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[E]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]]) -; CHECK-NEXT: store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[OUT]] ; %out = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) @@ -814,30 +702,10 @@ define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP5]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to i64 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 14: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <8 x i8> [[F]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <8 x i8> [[F]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[OUT]] ; %out = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) @@ -854,13 +722,10 @@ define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> % ; 
CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i8> [[_MSPROP3]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[F]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <16 x i8> [[F]]) -; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[OUT]] ; %out = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
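The following is an illustrative sketch, not part of the patch: it restates, for the tbl1_8b case above, the shape of the shadow code that handleIntrinsicByApplyingToShadow now emits. The %s_A and %s_B names are hypothetical stand-ins for the shadows that the real instrumentation loads from __msan_param_tls.

  ; original call:  %out = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
  ; shadow of %out: run the same table lookup on the table's shadow, passing the
  ;                 index operand %B through verbatim, then OR in the index's own shadow.
  %s_tbl = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %s_A, <8 x i8> %B)
  %s_out = or <8 x i8> %s_B, %s_tbl

Because an out-of-range index makes TBL write zero, and %s_tbl is computed with the same indices, those lanes also receive a zero (clean) shadow, matching the reasoning in the comment added to visitIntrinsicInst.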