diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 944cb481b025b..ff6696f6bec40 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2929,6 +2929,208 @@ def NVVM_Tcgen05CpOp : NVVM_Op<"tcgen05.cp"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05 LdSt Shape Attr
+//===----------------------------------------------------------------------===//
+
+def Tcgen05LdStShape16x64b : I32EnumAttrCase<"SHAPE_16X64B", 0, "shape_16x64b">;
+def Tcgen05LdStShape16x128b : I32EnumAttrCase<"SHAPE_16X128B", 1, "shape_16x128b">;
+def Tcgen05LdStShape16x256b : I32EnumAttrCase<"SHAPE_16X256B", 2, "shape_16x256b">;
+def Tcgen05LdStShape32x32b : I32EnumAttrCase<"SHAPE_32X32B", 3, "shape_32x32b">;
+def Tcgen05LdStShape16x32bx2 : I32EnumAttrCase<"SHAPE_16X32BX2", 4, "shape_16x32bx2">;
+
+def Tcgen05LdStShape : I32EnumAttr<
+  "Tcgen05LdStShape",
+  "tcgen05 load/store shape",
+  [Tcgen05LdStShape16x64b, Tcgen05LdStShape16x128b, Tcgen05LdStShape16x256b,
+   Tcgen05LdStShape32x32b, Tcgen05LdStShape16x32bx2]
+> {
+  let cppNamespace = "::mlir::NVVM";
+  let genSpecializedAttr = 0;
+}
+
+def Tcgen05LdStShapeAttr :
+    EnumAttr<NVVM_Dialect, Tcgen05LdStShape, "tcgen05_ldst_shape"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.ld Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld"> {
+  let summary = "tensor memory load instructions";
+  let arguments = (ins
+    // Attributes
+    UnitAttr:$pack,
+    Tcgen05LdStShapeAttr:$shape,
+    // Arguments
+    LLVM_PointerTensor:$tmemAddr,
+    Optional<I64>:$offset
+  );
+
+  let results = (outs AnyTypeOf<[I32, VectorOfLengthAndType<
+      [2, 4, 8, 16, 32, 64, 128], [I32]>]>:$res);
+
+  let assemblyFormat = [{
+    $tmemAddr (`,` $offset^)? (`pack` $pack^)? attr-dict `:` type($res)
+  }];
+
+  let description = [{
+    Instruction `tcgen05.ld` asynchronously loads data from the Tensor Memory
+    at the location specified by the 32-bit address operand `tmemAddr` into
+    the destination register `res`, collectively across all threads of the
+    warps.
+
+    The `shape` and the `num` attribute together determine the total dimension
+    of the data which is loaded from the Tensor Memory. The `shape` attribute
+    indicates the base dimension of data to be accessed as described in the
+    Data Movement Shape. The `num` attribute indicates the repeat factor on
+    the base dimension, resulting in the total dimension of the data that is
+    accessed. For this Op, `num` is not a separate attribute; it is inferred
+    from the vector length of the result `res`.
+
+    The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
+    `16x32b`. The base address of the first access is specified by `tmemAddr`
+    and the base address of the second access is specified by
+    `tmemAddr + offset`, where `offset` is an immediate argument.
+
+    The unit attribute `pack` can be used to pack two 16-bit elements from
+    adjacent columns into a single 32-bit element during the load.
+
+    The following table describes the length of the result vector for the
+    various combinations of `num` and `shape`:
+
+    |========================================================|
+    | num/shape | 16x32bx2/16x64b/32x32b | 16x128b | 16x256b |
+    |========================================================|
+    | x1        |           1            |    2    |    4    |
+    | x2        |           2            |    4    |    8    |
+    | x4        |           4            |    8    |   16    |
+    | x8        |           8            |   16    |   32    |
+    | x16       |          16            |   32    |   64    |
+    | x32       |          32            |   64    |   128   |
+    | x64       |          64            |   128   |   NA    |
+    | x128      |         128            |   NA    |   NA    |
+    |========================================================|
+
+    Example:
+    ```mlir
+    nvvm.tcgen05.ld %tmemAddr, %offset pack {
+      shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>
+    } : vector<2xi32>
+    ```
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld)
+  }];
+
+  let hasVerifier = 1;
+
+  string llvmBuilder = [{
+    llvm::LLVMContext &Context = moduleTranslation.getLLVMContext();
+    auto Pack = llvm::ConstantInt::get(Context, llvm::APInt(1, $pack));
+
+    unsigned num = $_resultType->isVectorTy()
+                       ? llvm::cast<llvm::VectorType>($_resultType)
+                             ->getElementCount()
+                             .getFixedValue()
+                       : 1;
+
+    auto ID = getTcgen05LdIntrinsicID($shape, num);
+    if (ID == llvm::Intrinsic::not_intrinsic)
+      llvm::report_fatal_error("unknown intrinsic signature for tcgen05.ld");
+
+    if ($offset)
+      $res = createIntrinsicCall(builder, ID, {$tmemAddr, $offset, Pack});
+    else
+      $res = createIntrinsicCall(builder, ID, {$tmemAddr, Pack});
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.st Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st"> {
+  let summary = "tensor memory store instructions";
+  let arguments = (ins
+    // Attributes
+    UnitAttr:$unpack,
+    Tcgen05LdStShapeAttr:$shape,
+    // Arguments
+    LLVM_PointerTensor:$tmemAddr,
+    AnyTypeOf<[I32, VectorOfLengthAndType<
+        [2, 4, 8, 16, 32, 64, 128], [I32]>]>:$val,
+    Optional<I64>:$offset
+  );
+
+  let assemblyFormat = [{
+    $tmemAddr `,` $val (`,` $offset^)? (`unpack` $unpack^)? attr-dict `:` type($val)
+  }];
+
+  let description = [{
+    Instruction `tcgen05.st` asynchronously stores data from the source
+    register `val` into the Tensor Memory at the location specified by the
+    32-bit address operand `tmemAddr`, collectively across all threads of the
+    warps.
+
+    The `shape` and the `num` attribute together determine the total dimension
+    of the data which is stored to the Tensor Memory. The `shape` indicates
+    the base dimension of data to be accessed. The `num` attribute indicates
+    the repeat factor on the base dimension, resulting in the total dimension
+    of the data that is accessed. For this Op, `num` is inferred from the
+    vector length of the source `val`.
+
+    The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
+    `16x32b`. The base address of the first access is specified by `tmemAddr`
+    and the base address of the second access is specified by
+    `tmemAddr + offset`, where `offset` is an immediate argument.
+
+    The unit attribute `unpack` can be used to unpack a 32-bit element in the
+    register into two 16-bit elements and store them in adjacent columns.
+
+    The following table describes the length of the source vector for the
+    various combinations of `num` and `shape`:
+
+    |========================================================|
+    | num/shape | 16x32bx2/16x64b/32x32b | 16x128b | 16x256b |
+    |========================================================|
+    | x1        |           1            |    2    |    4    |
+    | x2        |           2            |    4    |    8    |
+    | x4        |           4            |    8    |   16    |
+    | x8        |           8            |   16    |   32    |
+    | x16       |          16            |   32    |   64    |
+    | x32       |          32            |   64    |   128   |
+    | x64       |          64            |   128   |   NA    |
+    | x128      |         128            |   NA    |   NA    |
+    |========================================================|
+
+    Example:
+    ```mlir
+    nvvm.tcgen05.st %tmemAddr, %val, %offset unpack {
+      shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>
+    } : vector<2xi32>
+    ```
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st)
+  }];
+
+  string llvmBuilder = [{
+    llvm::LLVMContext &Context = moduleTranslation.getLLVMContext();
+    auto Unpack = llvm::ConstantInt::get(Context, llvm::APInt(1, $unpack));
+
+    auto valTy = $val->getType();
+    uint32_t num = valTy->isVectorTy()
+                       ? llvm::cast<llvm::VectorType>(valTy)
+                             ->getElementCount()
+                             .getFixedValue()
+                       : 1;
+
+    auto ID = getTcgen05StIntrinsicID($shape, num);
+    if (ID == llvm::Intrinsic::not_intrinsic)
+      llvm::report_fatal_error("unknown intrinsic signature for tcgen05.st");
+
+    if ($offset)
+      createIntrinsicCall(builder, ID, {$tmemAddr, $offset, $val, Unpack});
+    else
+      createIntrinsicCall(builder, ID, {$tmemAddr, $val, Unpack});
+  }];
+
+  let hasVerifier = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // NVVM target attribute.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 45a0f9dbd4a7c..8f080a2d597a5 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -35,6 +35,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
@@ -1387,6 +1388,51 @@ llvm::Intrinsic::ID Tcgen05CpOp::getIntrinsicID(Operation &op) {
   llvm_unreachable("Invalid shape in tcgen05 cp Op");
 }
 
+// Returns true if `VecLen` is a valid vector length for the given `Shape`;
+// this models the table in the tcgen05.{ld, st} Op descriptions.
+static bool isValidVectorLength(NVVM::Tcgen05LdStShape Shape,
+                                unsigned VecLen) {
+  if (Shape == NVVM::Tcgen05LdStShape::SHAPE_16X128B)
+    return VecLen >= 2;
+  if (Shape == NVVM::Tcgen05LdStShape::SHAPE_16X256B)
+    return VecLen >= 4;
+  return true;
+}
+
+LogicalResult Tcgen05LdOp::verify() {
+  LogicalResult Result = success();
+  if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset())
+    Result = emitError("shape 16x32bx2 requires offset argument");
+
+  auto ResTy = getRes().getType();
+  unsigned ResLen = isa<VectorType>(ResTy)
+                        ? llvm::cast<VectorType>(ResTy).getNumElements()
+                        : 1;
+  if (!isValidVectorLength(getShape(), ResLen))
+    Result = emitError(llvm::formatv("invalid result type length {0} for shape "
+                                     "{1} in tcgen05.ld Op",
+                                     ResLen, stringifyEnum(getShape())));
+
+  return Result;
+}
+
+LogicalResult Tcgen05StOp::verify() {
+  LogicalResult Result = success();
+  if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset())
+    Result = emitError("shape 16x32bx2 requires offset argument");
+
+  auto ValTy = getVal().getType();
+  unsigned ValLen = isa<VectorType>(ValTy)
+                        ? llvm::cast<VectorType>(ValTy).getNumElements()
+                        : 1;
+  if (!isValidVectorLength(getShape(), ValLen))
+    Result = emitError(llvm::formatv("invalid input length {0} for shape "
+                                     "{1} in tcgen05.st Op",
+                                     ValLen, stringifyEnum(getShape())));
+
+  return Result;
+}
+
 /// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might
 /// have ConstantRangeAttr.
 static void nvvmInferResultRanges(Operation *op, Value result,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index 9540762de2777..c3a129a82688f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -170,6 +170,112 @@ static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
   llvm_unreachable("Unsupported proxy kinds");
 }
 
+#define TCGEN05LD(SHAPE, NUM) llvm::Intrinsic::nvvm_tcgen05_ld_##SHAPE##_##NUM
+
+static llvm::Intrinsic::ID
+getTcgen05LdIntrinsicID(mlir::NVVM::Tcgen05LdStShape shape, uint32_t num) {
+  llvm::Intrinsic::ID Shape16x64b[] = {
+      TCGEN05LD(16x64b, x1),  TCGEN05LD(16x64b, x2),  TCGEN05LD(16x64b, x4),
+      TCGEN05LD(16x64b, x8),  TCGEN05LD(16x64b, x16), TCGEN05LD(16x64b, x32),
+      TCGEN05LD(16x64b, x64), TCGEN05LD(16x64b, x128),
+  };
+
+  llvm::Intrinsic::ID Shape16x128b[] = {
+      TCGEN05LD(16x128b, x1),  TCGEN05LD(16x128b, x2), TCGEN05LD(16x128b, x4),
+      TCGEN05LD(16x128b, x8),  TCGEN05LD(16x128b, x16),
+      TCGEN05LD(16x128b, x32), TCGEN05LD(16x128b, x64),
+  };
+
+  llvm::Intrinsic::ID Shape16x256b[] = {
+      TCGEN05LD(16x256b, x1), TCGEN05LD(16x256b, x2),  TCGEN05LD(16x256b, x4),
+      TCGEN05LD(16x256b, x8), TCGEN05LD(16x256b, x16), TCGEN05LD(16x256b, x32),
+  };
+
+  llvm::Intrinsic::ID Shape16x32bx2[] = {
+      TCGEN05LD(16x32bx2, x1),  TCGEN05LD(16x32bx2, x2),
+      TCGEN05LD(16x32bx2, x4),  TCGEN05LD(16x32bx2, x8),
+      TCGEN05LD(16x32bx2, x16), TCGEN05LD(16x32bx2, x32),
+      TCGEN05LD(16x32bx2, x64), TCGEN05LD(16x32bx2, x128),
+  };
+
+  llvm::Intrinsic::ID Shape32x32b[] = {
+      TCGEN05LD(32x32b, x1),  TCGEN05LD(32x32b, x2),  TCGEN05LD(32x32b, x4),
+      TCGEN05LD(32x32b, x8),  TCGEN05LD(32x32b, x16), TCGEN05LD(32x32b, x32),
+      TCGEN05LD(32x32b, x64), TCGEN05LD(32x32b, x128),
+  };
+
+  // `num` is the vector length of the result; log2(num) gives the index into
+  // the per-shape array. For 16x128b and 16x256b the x1 intrinsic corresponds
+  // to vector lengths 2 and 4, so the index is shifted down by 1 and 2.
+  unsigned Idx = std::log2(num);
+
+  switch (shape) {
+  case NVVM::Tcgen05LdStShape::SHAPE_16X64B:
+    return Shape16x64b[Idx];
+  case NVVM::Tcgen05LdStShape::SHAPE_16X128B:
+    return Shape16x128b[Idx - 1];
+  case NVVM::Tcgen05LdStShape::SHAPE_16X256B:
+    return Shape16x256b[Idx - 2];
+  case NVVM::Tcgen05LdStShape::SHAPE_32X32B:
+    return Shape32x32b[Idx];
+  case NVVM::Tcgen05LdStShape::SHAPE_16X32BX2:
+    return Shape16x32bx2[Idx];
+  }
+  llvm_unreachable("unhandled tcgen05.ld lowering");
+}
+
+#define TCGEN05ST(SHAPE, NUM) llvm::Intrinsic::nvvm_tcgen05_st_##SHAPE##_##NUM
+
+static llvm::Intrinsic::ID
+getTcgen05StIntrinsicID(mlir::NVVM::Tcgen05LdStShape shape, uint32_t num) {
+  llvm::Intrinsic::ID Shape16x64b[] = {
+      TCGEN05ST(16x64b, x1),  TCGEN05ST(16x64b, x2),  TCGEN05ST(16x64b, x4),
+      TCGEN05ST(16x64b, x8),  TCGEN05ST(16x64b, x16), TCGEN05ST(16x64b, x32),
+      TCGEN05ST(16x64b, x64), TCGEN05ST(16x64b, x128),
+  };
+
+  llvm::Intrinsic::ID Shape16x128b[] = {
+      TCGEN05ST(16x128b, x1),  TCGEN05ST(16x128b, x2), TCGEN05ST(16x128b, x4),
+      TCGEN05ST(16x128b, x8),  TCGEN05ST(16x128b, x16),
+      TCGEN05ST(16x128b, x32), TCGEN05ST(16x128b, x64),
+  };
+
+  llvm::Intrinsic::ID Shape16x256b[] = {
+      TCGEN05ST(16x256b, x1), TCGEN05ST(16x256b, x2),  TCGEN05ST(16x256b, x4),
+      TCGEN05ST(16x256b, x8), TCGEN05ST(16x256b, x16), TCGEN05ST(16x256b, x32),
+  };
+
+  llvm::Intrinsic::ID Shape16x32bx2[] = {
+      TCGEN05ST(16x32bx2, x1),  TCGEN05ST(16x32bx2, x2),
+      TCGEN05ST(16x32bx2, x4),  TCGEN05ST(16x32bx2, x8),
+      TCGEN05ST(16x32bx2, x16), TCGEN05ST(16x32bx2, x32),
+      TCGEN05ST(16x32bx2, x64), TCGEN05ST(16x32bx2, x128),
+  };
+
+  llvm::Intrinsic::ID Shape32x32b[] = {
+      TCGEN05ST(32x32b, x1),  TCGEN05ST(32x32b, x2),  TCGEN05ST(32x32b, x4),
+      TCGEN05ST(32x32b, x8),  TCGEN05ST(32x32b, x16), TCGEN05ST(32x32b, x32),
+      TCGEN05ST(32x32b, x64), TCGEN05ST(32x32b, x128),
+  };
+
+  // `num` is the vector length of the source; log2(num) gives the index into
+  // the per-shape array, with the same index shifts as for tcgen05.ld.
+  unsigned Idx = std::log2(num);
+
+  switch (shape) {
+  case NVVM::Tcgen05LdStShape::SHAPE_16X64B:
+    return Shape16x64b[Idx];
+  case NVVM::Tcgen05LdStShape::SHAPE_16X128B:
+    return Shape16x128b[Idx - 1];
+  case NVVM::Tcgen05LdStShape::SHAPE_16X256B:
+    return Shape16x256b[Idx - 2];
+  case NVVM::Tcgen05LdStShape::SHAPE_32X32B:
+    return Shape32x32b[Idx];
+  case NVVM::Tcgen05LdStShape::SHAPE_16X32BX2:
+    return Shape16x32bx2[Idx];
+  }
+  llvm_unreachable("unhandled tcgen05.st lowering");
+}
+
 namespace {
 /// Implementation of the dialect interface that converts operations belonging
 /// to the NVVM dialect to LLVM IR.
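The intrinsic selection above relies on `num` being a power of two: `log2(num)` picks the entry in the per-shape array, and the `Idx - 1` / `Idx - 2` adjustments account for 16x128b and 16x256b starting at vector lengths 2 and 4. A minimal standalone C++ sketch of that mapping (the helper name `indexForNum` and its `minVecLen` parameter are illustrative only, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative helper (not in the patch): map a power-of-two vector length to
// an index into a per-shape intrinsic array whose first entry corresponds to
// the shape's smallest legal vector length `minVecLen` (1 for
// 16x64b/32x32b/16x32bx2, 2 for 16x128b, 4 for 16x256b).
static unsigned indexForNum(uint32_t vecLen, uint32_t minVecLen) {
  assert(vecLen >= minVecLen && (vecLen & (vecLen - 1)) == 0 &&
         "vector length must be a power of two no smaller than the minimum");
  unsigned idx = 0;
  while ((1u << idx) < vecLen) // integer log2; avoids floating-point std::log2
    ++idx;
  unsigned base = 0;
  while ((1u << base) < minVecLen)
    ++base;
  return idx - base; // e.g. 16x256b with vecLen 8 -> index 1, i.e. the x2 entry
}
```

Under that reading, `Shape16x128b[Idx - 1]` and `Shape16x256b[Idx - 2]` are the `minVecLen = 2` and `minVecLen = 4` cases of the same mapping; for instance `indexForNum(128, 2)` is 6, the x64 entry, matching the 16x128b row of the table.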
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir
new file mode 100644
index 0000000000000..b1266b0e8151d
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir
@@ -0,0 +1,287 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x64b
+llvm.func @nvvm_tcgen05_ld_16x64b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv1 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x64b_pack
+llvm.func @nvvm_tcgen05_ld_16x64b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv1 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x128b
+llvm.func @nvvm_tcgen05_ld_16x128b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x128b_pack
+llvm.func @nvvm_tcgen05_ld_16x128b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x256b
+llvm.func @nvvm_tcgen05_ld_16x256b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x256b_pack
+llvm.func @nvvm_tcgen05_ld_16x256b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b
+llvm.func @nvvm_tcgen05_ld_32x32b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv1 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_pack
+llvm.func @nvvm_tcgen05_ld_32x32b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv1 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr pack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2
+llvm.func @nvvm_tcgen05_ld_16x32bx2(%tmemAddr : !llvm.ptr<6>) {
+
+  %halfSplitOffset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv1 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x i32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2_pack
+llvm.func @nvvm_tcgen05_ld_16x32bx2_pack(%tmemAddr : !llvm.ptr<6>) {
+
+  %halfSplitOffset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv1 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv2 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv4 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv8 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv16 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv32 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv64 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+  %ldv128 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x i32>
+
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir
new file mode 100644
index 0000000000000..119746133625d
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir
@@ -0,0 +1,377 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x64b
+llvm.func @nvvm_tcgen05_st_16x64b(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv1 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv2 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv4 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv8 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv16 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv32 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv64 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv128 {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x64b_unpack
+llvm.func @nvvm_tcgen05_st_16x64b_unpack(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv1 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv2 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv4 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv8 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv16 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv32 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv64 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv128 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x128b
+llvm.func @nvvm_tcgen05_st_16x128b(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv2 {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv4 {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv8 {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv16 {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv32 {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv64 {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv128 {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x128b_unpack
+llvm.func @nvvm_tcgen05_st_16x128b_unpack(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv2 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv4 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv8 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv16 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv32 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv64 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv128 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x256b
+llvm.func @nvvm_tcgen05_st_16x256b(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv4 {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv8 {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv16 {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv32 {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv64 {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv128 {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x256b_unpack
+llvm.func @nvvm_tcgen05_st_16x256b_unpack(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv4 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv8 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv16 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv32 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv64 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv128 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_32x32b
+llvm.func @nvvm_tcgen05_st_32x32b(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv1 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv2 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv4 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv8 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv16 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv32 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv64 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv128 {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_32x32b_unpack
+llvm.func @nvvm_tcgen05_st_32x32b_unpack(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv1 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv2 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv4 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv8 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv16 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv32 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv64 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv128 unpack {shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x32bx2
+llvm.func @nvvm_tcgen05_st_16x32bx2(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+  %offset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i32 {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv1, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, <2 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv2, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, <4 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv4, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, <8 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv8, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, <16 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv16, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, <32 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv32, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, <64 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv64, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, <128 x i32> {{%[0-9]+}}, i1 false)
+  nvvm.tcgen05.st %tmemAddr, %stv128, %offset {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128xi32>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x32bx2_unpack
+llvm.func @nvvm_tcgen05_st_16x32bx2_unpack(
+    %tmemAddr : !llvm.ptr<6>,
+    %stv1 : i32,
+    %stv2 : vector<2xi32>,
+    %stv4 : vector<4xi32>,
+    %stv8 : vector<8xi32>,
+    %stv16 : vector<16xi32>,
+    %stv32 : vector<32xi32>,
+    %stv64 : vector<64xi32>,
+    %stv128 : vector<128xi32>) {
+
+  %offset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i32 {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv1, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, <2 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv2, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, <4 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv4, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, <8 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv8, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, <16 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv16, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, <32 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv32, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, <64 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv64, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, <128 x i32> {{%[0-9]+}}, i1 true)
+  nvvm.tcgen05.st %tmemAddr, %stv128, %offset unpack {shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128xi32>
+
+  llvm.return
+}
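The two test files exercise every legal (shape, num) pair from the table in the Op descriptions, plus the offset requirement of `16x32bx2`. As a cross-check of the rules the verifier enforces, here is a small self-contained C++ sketch; the `Shape` enum and `isLegal` helper are illustrative only, not the patch's API:

```cpp
#include <cassert>
#include <cstdint>

// Illustrative mirror (not the patch's API) of the legality rules: every
// power-of-two vector length up to 128 is legal, except that 16x128b starts
// at 2 and 16x256b starts at 4; 16x32bx2 additionally requires the offset
// operand for the second access at tmemAddr + offset.
enum class Shape { S16x64b, S16x128b, S16x256b, S32x32b, S16x32bx2 };

static bool isLegal(Shape shape, uint32_t vecLen, bool hasOffset) {
  if (vecLen == 0 || (vecLen & (vecLen - 1)) != 0 || vecLen > 128)
    return false;                     // lengths are powers of two up to 128
  if (shape == Shape::S16x32bx2 && !hasOffset)
    return false;                     // second access needs tmemAddr + offset
  if (shape == Shape::S16x128b)
    return vecLen >= 2;               // x1 already yields two registers
  if (shape == Shape::S16x256b)
    return vecLen >= 4;               // x1 already yields four registers
  return true;
}

int main() {
  assert(isLegal(Shape::S16x64b, 1, false));    // i32 result, x1
  assert(!isLegal(Shape::S16x256b, 2, false));  // below the shape's minimum
  assert(!isLegal(Shape::S16x32bx2, 2, false)); // missing offset
  assert(isLegal(Shape::S16x32bx2, 128, true)); // largest tested case
  return 0;
}
```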