diff --git a/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp b/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp index 0511e270c9924..a4dfd4b7edc77 100644 --- a/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp +++ b/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp @@ -80,9 +80,8 @@ struct TileLoadOpConversion : public OpRewritePattern { LogicalResult matchAndRewrite(arm_sme::TileLoadOp tileLoadOp, PatternRewriter &rewriter) const override { if (tileLoadOp.getMask()) - // TODO: add masked patterns. - return rewriter.notifyMatchFailure( - tileLoadOp, "op has mask, needs masked pattern(s)"); + return rewriter.notifyMatchFailure(tileLoadOp, + "op has mask, apply masked patterns"); OpBuilder::InsertionGuard g(rewriter); auto loc = tileLoadOp.getLoc(); @@ -142,6 +141,234 @@ struct TileLoadOpConversion : public OpRewritePattern { } }; +/// Lower `arm_sme.tile_load` with mask and pad of constant zero. +/// +/// BEFORE: +/// ```mlir +/// %pad = arith.constant 0 : i32 +/// %num_rows = arith.constant 2 : index +/// %num_cols = arith.constant 4 : index +/// %mask = vector.create_mask %num_rows, %num_cols : vector<[4]x[4]xi1> +/// %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : +/// memref, vector<[4]x[4]xi32> +/// ``` +/// +/// AFTER: +/// ```mlir +/// %c0 = arith.constant 0 : index +/// %c1 = arith.constant 1 : index +/// %tile = arm_sme.zero : vector<[4]x[4]xi32> +/// %num_rows = arith.constant 2 : index +/// %num_cols = vector.create_mask %c4 : vector<[4]xi1> +/// scf.for %tile_slice_idx = %c0 to %num_rows step %c1 { +/// %tile_update = arm_sme.load_tile_slice +/// %src[%tile_slice_idx], %num_cols, %tile, %tile_slice_idx : +/// memref, vector<[4]xi1>, vector<[4]x[4]xi32> +/// } +/// ``` +/// +/// NOTE: Only mask of 'vector.create_mask' op is currently supported. 
+struct TileLoadOpWithMaskAndPadZeroConversion + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arm_sme::TileLoadOp tileLoadOp, + PatternRewriter &rewriter) const override { + OpBuilder::InsertionGuard g(rewriter); + auto loc = tileLoadOp.getLoc(); + auto tileType = tileLoadOp.getVectorType(); + + auto maskOp = tileLoadOp.getMask(); + if (!maskOp) + return rewriter.notifyMatchFailure( + tileLoadOp, "op has no mask, needs unmasked pattern"); + + auto padOp = tileLoadOp.getPadding(); + assert(padOp && "expected padding when masking!"); + + auto createMaskOp = maskOp.getDefiningOp(); + if (!createMaskOp) + return rewriter.notifyMatchFailure( + tileLoadOp, "unsupported mask op, only 'vector.create_mask' is " + "currently supported"); + + auto constPadOp = padOp.getDefiningOp(); + if (!constPadOp || constPadOp.getValue() != + rewriter.getZeroAttr(tileType.getElementType())) + return rewriter.notifyMatchFailure( + tileLoadOp, "op has non-zero pad, needs non-zero pad pattern"); + + auto numRows = createMaskOp.getOperands()[0]; + auto numCols = createMaskOp.getOperands()[1]; + + auto predicateType = + VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true); + auto numColsOp = + rewriter.create(loc, predicateType, numCols); + + // Initialize tile with zero to satisfy padding. Inactive cols will be + // zeroed anyway since the loads use zeroing predication. For inactive rows + // however, no load will occur so these need to be zeroed. + auto tile = rewriter.create(loc, tileType); + + // Create a loop to load the active tile slices from memory. + auto step = rewriter.create(loc, 1); + auto lowerBound = rewriter.create(loc, 0); + auto upperBound = numRows; + auto forOp = rewriter.create(loc, lowerBound, upperBound, step); + + rewriter.setInsertionPointToStart(forOp.getBody()); + + // Create 'arm_sme.load_tile_slice' to load tile slice from memory into + // tile. 
+ SmallVector memrefIndices; + auto tileSliceIndex = forOp.getInductionVar(); + getMemrefIndices(tileLoadOp.getIndices(), + tileLoadOp.getMemRefType().getRank(), tileSliceIndex, + upperBound, memrefIndices, loc, rewriter); + rewriter.create( + loc, tileType, tileLoadOp.getBase(), numColsOp, tile, memrefIndices, + tileSliceIndex, tileLoadOp.getLayout()); + + rewriter.setInsertionPointAfter(forOp); + + // Replace 'arm_sme.tile_load' with the tile. + rewriter.replaceOp(tileLoadOp, tile); + + return success(); + } +}; + +/// Lower `arm_sme.tile_load` with mask and non-zero pad. +/// +/// BEFORE: +/// ```mlir +/// %pad = arith.constant 1 : i32 +/// %num_rows = arith.constant 2 : index +/// %num_cols = arith.constant 4 : index +/// %mask = vector.create_mask %num_rows, %num_cols : vector<[4]x[4]xi1> +/// %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : +/// memref, vector<[4]x[4]xi32> +/// ``` +/// +/// AFTER: +/// ```mlir +/// ... +/// %pad_1d = arith.constant dense<1> : vector<[4]xi32> +/// scf.for %tile_slice_idx = %c0 to %svl_s step %c1 { +/// ... 
+/// %mask_1d = vector.create_mask %combined_mask : vector<[4]xi1> +/// %slice = vector.maskedload %base[%tile_slice_idx, %c0], %mask_1d, %pad_1d +/// : memref, vector<[4]xi1>, +/// vector<[4]xi32> into vector<[4]xi32> +/// // Insert slice into tile +/// arm_sme.move_vector_to_tile_slice %slice, %tile, %tile_slice_idx +/// : vector<[4]xi32> into vector<[4]x[4]xi32> +/// } +/// ``` +struct TileLoadOpWithMaskAndPadNonZeroConversion + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arm_sme::TileLoadOp tileLoadOp, + PatternRewriter &rewriter) const override { + OpBuilder::InsertionGuard g(rewriter); + auto loc = tileLoadOp.getLoc(); + auto tileType = tileLoadOp.getVectorType(); + auto tileElementType = tileType.getElementType(); + unsigned tileElementWidth = tileElementType.getIntOrFloatBitWidth(); + + auto maskOp = tileLoadOp.getMask(); + if (!maskOp) + return rewriter.notifyMatchFailure( + tileLoadOp, "op has no mask, needs unmasked pattern"); + + auto padOp = tileLoadOp.getPadding(); + assert(padOp && "expected padding when masking!"); + + auto createMaskOp = maskOp.getDefiningOp(); + if (!createMaskOp) + return rewriter.notifyMatchFailure( + tileLoadOp, "unsupported mask op, only 'vector.create_mask' is " + "currently supported"); + + auto constPadOp = padOp.getDefiningOp(); + if (constPadOp && + constPadOp.getValue() == rewriter.getZeroAttr(tileElementType)) + return rewriter.notifyMatchFailure( + tileLoadOp, "op has constant zero pad, needs zero pad pattern"); + + auto numRows = createMaskOp.getOperands()[0]; + auto numCols = createMaskOp.getOperands()[1]; + + auto numColsI32 = rewriter.create( + loc, rewriter.getI32Type(), numCols); + + // Create 'arm_sme.get_tile_id' op. + auto tileId = rewriter.create( + loc, rewriter.getIntegerType(tileElementWidth)); + + // Create `arm_sme.cast_tile_to_vector` to cast tile ID to a vector type to + // use as input tile to 'arm_sme.move_vector_to_tile_slice' ops. 
+ auto tile = + rewriter.create(loc, tileType, tileId); + + // Create a loop that loads each ZA tile slice from memory. + auto step = rewriter.create(loc, 1); + auto minTileSlices = rewriter.create( + loc, arm_sme::getSMETileSliceMinNumElts(tileElementType)); + auto vscale = + rewriter.create(loc, rewriter.getIndexType()); + auto lowerBound = rewriter.create(loc, 0); + auto numTileSlices = + rewriter.create(loc, minTileSlices, vscale); + auto forOp = + rewriter.create(loc, lowerBound, numTileSlices, step); + + rewriter.setInsertionPointToStart(forOp.getBody()); + + auto tileSliceIndex = forOp.getInductionVar(); + + // Combine masks. + auto rowIsActive = rewriter.create( + loc, arith::CmpIPredicate::ult, tileSliceIndex, numRows); + auto rowIsActiveI32 = rewriter.create( + loc, rewriter.getI32Type(), rowIsActive); + auto mask = rewriter.create(loc, rowIsActiveI32, numColsI32); + auto maskIndex = + rewriter.create(loc, rewriter.getIndexType(), mask); + auto predicateType = + VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true); + auto maskOp1D = rewriter.create( + loc, predicateType, maskIndex.getResult()); + + SmallVector memrefIndices; + getMemrefIndices(tileLoadOp.getIndices(), + tileLoadOp.getMemRefType().getRank(), tileSliceIndex, + numTileSlices, memrefIndices, loc, rewriter); + + // Splat pad into 1-D vector matching type of tile slice. + VectorType tileSliceType = VectorType::Builder(tileType).dropDim(0); + auto pad1DOp = rewriter.create(loc, tileSliceType, padOp); + + auto loadSlice = rewriter.create( + loc, tileSliceType, tileLoadOp.getBase(), memrefIndices, maskOp1D, + /*passthru=*/pad1DOp); + + // Create 'arm_sme.move_vector_to_tile_slice' to move slice into tile. + rewriter.create( + loc, tileType, loadSlice->getResult(0), tile, tileSliceIndex, + tileLoadOp.getLayout()); + + rewriter.setInsertionPointAfter(forOp); + + // Replace 'arm_sme.tile_load' with the tile. 
+ rewriter.replaceOp(tileLoadOp, tile); + + return success(); + } +}; + /// Lower `arm_sme.tile_store` to a loop over the tile slices and store each /// slice using `arm_sme.store_tile_slice`. /// @@ -294,7 +521,8 @@ struct TileVectorPrintOpConversion : public OpRewritePattern { } // namespace void mlir::populateArmSMEToSCFConversionPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); } diff --git a/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir b/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir index e839c2e9e06db..38a2332ffd5e0 100644 --- a/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir +++ b/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-arm-sme-to-scf -cse -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-arm-sme-to-scf -cse -split-input-file -verify-diagnostics | FileCheck %s //===----------------------------------------------------------------------===// // arm_sme.tile_load @@ -33,6 +33,81 @@ func.func @arm_sme_tile_load_ver(%src : memref) { return } +// ----- + +// CHECK-LABEL: func.func @arm_sme_tile_load_hor_with_mask_and_pad_zero( +// CHECK-SAME: %[[SRC:.*]]: memref) { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[NUM_ROWS:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[NUM_COLS:.*]] = vector.create_mask %c2 : vector<[4]xi1> +// CHECK-DAG: %[[TILEZERO:.*]] = arm_sme.zero : vector<[4]x[4]xi32> +// CHECK-NEXT: scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_ROWS]] step %[[C1]] { +// CHECK-NEXT: %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index +// CHECK-NEXT: arm_sme.load_tile_slice %[[SRC]]{{\[}}%[[OFFSET]], %[[C0]]], %[[NUM_COLS]], %[[TILEZERO]], %[[TILE_SLICE_INDEX]] : memref, vector<[4]xi1>, vector<[4]x[4]xi32> +func.func @arm_sme_tile_load_hor_with_mask_and_pad_zero(%src : memref) { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : 
index + %c3 = arith.constant 3 : index + %pad = arith.constant 0 : i32 + %mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1> + %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref, vector<[4]x[4]xi32> + return +} + +// ----- + +// CHECK-LABEL: func.func @arm_sme_tile_load_hor_with_mask_and_nonzero_pad( +// CHECK-SAME: %[[SRC:.*]]: memref, +// CHECK-SAME: %[[PAD:.*]]: i32) { +// CHECK-DAG: %[[TILE_ID:.*]] = arm_sme.get_tile_id : i32 +// CHECK-DAG: %[[CAST_TILE_TO_VECTOR:.*]] = arm_sme.cast_tile_to_vector %[[TILE_ID]] : i32 to vector<[4]x[4]xi32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[NUM_ROWS:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[NUM_COLS:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[NUM_COLS_I32:.*]] = arith.index_castui %[[NUM_COLS]] : index to i32 +// CHECK-DAG: %[[VSCALE:.*]] = vector.vscale +// CHECK-NEXT: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[C4]], %[[VSCALE]] : index +// CHECK-NEXT: scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] { +// CHECK-NEXT: %[[ROW_IS_ACTIVE:.*]] = arith.cmpi ult, %[[TILE_SLICE_INDEX]], %[[NUM_ROWS]] : index +// CHECK-NEXT: %[[ROW_IS_ACTIVE_SEXT_I32:.*]] = arith.extsi %[[ROW_IS_ACTIVE]] : i1 to i32 +// CHECK-NEXT: %[[MASK:.*]] = arith.andi %[[ROW_IS_ACTIVE_SEXT_I32]], %[[NUM_COLS_I32]] : i32 +// CHECK-NEXT: %[[MASK_INDEX:.*]] = arith.index_cast %[[MASK]] : i32 to index +// CHECK-NEXT: %[[MASK_1D:.*]] = vector.create_mask %[[MASK_INDEX]] : vector<[4]xi1> +// CHECK-NEXT: %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index +// CHECK: %[[PAD_1D:.*]] = vector.splat %[[PAD]] : vector<[4]xi32> +// CHECK: %[[LOAD_SLICE:.*]] = vector.maskedload %[[SRC]]{{\[}}%[[OFFSET]], %[[C0]]], %[[MASK_1D]], %[[PAD_1D]] : memref, vector<[4]xi1>, vector<[4]xi32> into vector<[4]xi32> +// CHECK: arm_sme.move_vector_to_tile_slice %[[LOAD_SLICE]], 
%[[CAST_TILE_TO_VECTOR]], %[[TILE_SLICE_INDEX]] : vector<[4]xi32> into vector<[4]x[4]xi32> +func.func @arm_sme_tile_load_hor_with_mask_and_nonzero_pad(%src : memref, %pad : i32) { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1> + %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref, vector<[4]x[4]xi32> + return +} + +// ----- + +func.func @arm_sme_tile_load_zero_pad__unsupported_mask_op(%src : memref, %mask : vector<[4]x[4]xi1>) { + %c0 = arith.constant 0 : index + %pad = arith.constant 0 : i32 + // expected-error@+1 {{failed to legalize operation 'arm_sme.tile_load' that was explicitly marked illegal}} + %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref, vector<[4]x[4]xi32> + return +} + +// ----- + +func.func @arm_sme_tile_load_nonzero_pad__unsupported_mask_op(%src : memref, %pad : i32, %mask : vector<[4]x[4]xi1>) { + %c0 = arith.constant 0 : index + // expected-error@+1 {{failed to legalize operation 'arm_sme.tile_load' that was explicitly marked illegal}} + %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref, vector<[4]x[4]xi32> + return +} + //===----------------------------------------------------------------------===// // arm_sme.tile_store //===----------------------------------------------------------------------===// diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir new file mode 100644 index 0000000000000..48725d9ea03f9 --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir @@ -0,0 +1,212 @@ +// DEFINE: %{entry_point} = entry +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: -enable-arm-streaming="mode=locally enable-za" \ +// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf \ +// DEFINE: -convert-vector-to-llvm="enable-arm-sme" -cse -canonicalize \ 
+// DEFINE: -allocate-arm-sme-tiles -test-lower-to-llvm +// DEFINE: %{run} = %mcr_aarch64_cmd \ +// DEFINE: -march=aarch64 -mattr=+sve,+sme \ +// DEFINE: -e %{entry_point} -entry-point-result=void \ +// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils + +// RUN: %{compile} | %{run} | FileCheck %s + +// 2-D vector load (SME tile). +func.func @transfer_read_2d(%A : memref, %base1: index, %base2: index) { + %c4 = arith.constant 4 : index + %pad = arith.constant 0.0 : f32 + %0 = vector.transfer_read %A[%base1, %base2], %pad {in_bounds=[true, true]} : + memref, vector<[4]x[4]xf32> + + vector.print str "TILE BEGIN:" + vector.print %0: vector<[4]x[4]xf32> + + return +} + +// 2-D vector load (SME tile) + transpose. +func.func @transfer_read_2d_transposed(%A : memref, %base1: index, %base2: index) { + %pad = arith.constant 0.0 : f32 + %0 = vector.transfer_read %A[%base1, %base2], %pad + {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} + : memref, vector<[4]x[4]xf32> + + vector.print str "TILE BEGIN:" + vector.print %0 : vector<[4]x[4]xf32> + + return +} + +// 2-D vector load (SME tile) with mask and pad of zero. +func.func @transfer_read_2d_mask(%A : memref, %base1: index, %base2: index) { + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %pad = arith.constant 0.0 : f32 + %mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1> + %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask + {in_bounds = [true, true]} : memref, vector<[4]x[4]xf32> + + vector.print str "TILE BEGIN:" + vector.print %0: vector<[4]x[4]xf32> + + return +} + +// 2-D vector load (SME tile) with mask and pad of zero + transpose. 
+func.func @transfer_read_2d_mask_transposed(%A : memref, %base1: index, %base2: index) { + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %pad = arith.constant 0.0 : f32 + %mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1> + %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask + {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} + : memref, vector<[4]x[4]xf32> + + vector.print str "TILE BEGIN:" + vector.print %0: vector<[4]x[4]xf32> + + return +} + +// 2-D vector load (SME tile) with mask and non-zero pad. +func.func @transfer_read_2d_mask_non_zero_pad(%A : memref, %base1: index, %base2: index) { + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %pad = arith.constant -42.0 : f32 + %mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1> + %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask + {in_bounds = [true, true]} : memref, vector<[4]x[4]xf32> + + vector.print str "TILE BEGIN:" + vector.print %0: vector<[4]x[4]xf32> + + return +} + +// 2-D vector load (SME tile) with mask and non-zero pad + transpose. +func.func @transfer_read_2d_mask_non_zero_pad_transposed(%A : memref, %base1: index, %base2: index) { + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %pad = arith.constant -42.0 : f32 + %mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1> + %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask + {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} + : memref, vector<[4]x[4]xf32> + + vector.print str "TILE BEGIN:" + vector.print %0: vector<[4]x[4]xf32> + + return +} + +// Allocate heap memory of size 'd0' x 'd1' and initialize. +// +// Example: +// +// initialize_memory(%c4, %c5) +// +// 0, 1, 2, 3, 4 +// 10, 11, 12, 13, 14 +// 20, 21, 22, 23, 24 +// 30, 31, 32, 33, 34 +// +// Returns dynamic memref. It's the caller's responsibility to free the returned +// memref. 
+func.func @initialize_memory(%d0 : index, %d1 : index) -> memref { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_f32 = arith.constant 1.0 : f32 + %c10_f32 = arith.constant 10.0 : f32 + + %A = memref.alloc(%d0, %d1) : memref + + %init = arith.constant 0.0 : f32 + scf.for %i = %c0 to %d0 step %c1 iter_args(%val = %init) -> f32 { + scf.for %j = %c0 to %d1 step %c1 iter_args(%inner_val = %val) -> f32 { + memref.store %inner_val, %A[%i, %j] : memref + %inner_val_next = arith.addf %inner_val, %c1_f32 : f32 + scf.yield %inner_val_next : f32 + } + %val_next = arith.addf %val, %c10_f32 : f32 + scf.yield %val_next : f32 + } + + return %A : memref +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + + // Allocate enough memory to load a 32-bit tile plus a tiny bit more to test + // non-zero offsets while remaining inbounds. + %vscale = vector.vscale + %svl_s = arith.muli %c4, %vscale : index + %svl_s_plus_two = arith.addi %svl_s, %c2 : index + + %A = call @initialize_memory(%svl_s_plus_two, %svl_s_plus_two) : (index, index) -> memref + + // 1.a. Read 2D vector from 2D memref. + // + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 1, 2, 3 + // CHECK-NEXT: ( 10, 11, 12, 13 + // CHECK-NEXT: ( 20, 21, 22, 23 + // CHECK-NEXT: ( 30, 31, 32, 33 + call @transfer_read_2d(%A, %c0, %c0) : (memref, index, index) -> () + + // 1.b. Same as 1.a., but with non-zero offsets. + // + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 12, 13, 14, 15 + // CHECK-NEXT: ( 22, 23, 24, 25 + // CHECK-NEXT: ( 32, 33, 34, 35 + // CHECK-NEXT: ( 42, 43, 44, 45 + call @transfer_read_2d(%A, %c1, %c2) : (memref, index, index) -> () + + // 2. Same as 1.a., but with mask and a pad of constant zero. 
+ // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 1, 2, 0 + // CHECK-NEXT: ( 10, 11, 12, 0 + // CHECK-NEXT: ( 0, 0, 0, 0 + // CHECK-NEXT: ( 0, 0, 0, 0 + call @transfer_read_2d_mask(%A, %c0, %c0) : (memref, index, index) -> () + + // 3. Same as 1.a., but with mask and non-zero pad. + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 1, 2, -42 + // CHECK-NEXT: ( 10, 11, 12, -42 + // CHECK-NEXT: ( -42, -42, -42, -42 + // CHECK-NEXT: ( -42, -42, -42, -42 + call @transfer_read_2d_mask_non_zero_pad(%A, %c0, %c0) : (memref, index, index) -> () + + // 4. Same as 1.a., but transpose the result. + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 10, 20, 30 + // CHECK-NEXT: ( 1, 11, 21, 31 + // CHECK-NEXT: ( 2, 12, 22, 32 + // CHECK-NEXT: ( 3, 13, 23, 33 + call @transfer_read_2d_transposed(%A, %c0, %c0) : (memref, index, index) -> () + + // 5. Same as 2., but transpose the result. + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 10, 0, 0 + // CHECK-NEXT: ( 1, 11, 0, 0 + // CHECK-NEXT: ( 2, 12, 0, 0 + // CHECK-NEXT: ( 0, 0, 0, 0 + call @transfer_read_2d_mask_transposed(%A, %c0, %c0) : (memref, index, index) -> () + + // 6. Same as 3., but transpose the result. + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 10, -42, -42 + // CHECK-NEXT: ( 1, 11, -42, -42 + // CHECK-NEXT: ( 2, 12, -42, -42 + // CHECK-NEXT: ( -42, -42, -42, -42 + call @transfer_read_2d_mask_non_zero_pad_transposed(%A, %c0, %c0) : (memref, index, index) -> () + + memref.dealloc %A : memref + + return +}