Skip to content

Commit 22f2ec2

Browse files
SC llvm teamSC llvm team
SC llvm team
authored and
SC llvm team
committed
Merged main:8f397e04e5ce into amd-gfx:61f415de20dd
Local branch amd-gfx 61f415d Merged main:d6254e1b2e6d into amd-gfx:a1fa6830c554 Remote branch main 8f397e0 [mlir][memref] Fix emulate narrow types for strided memref offset (llvm#68181)
2 parents 61f415d + 8f397e0 commit 22f2ec2

File tree

10 files changed

+142
-24
lines changed

10 files changed

+142
-24
lines changed

clang/lib/CodeGen/CodeGenModule.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -1390,6 +1390,19 @@ void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV,
13901390
}
13911391
if (!D)
13921392
return;
1393+
1394+
// OpenMP declare target variables must be visible to the host so they can
1395+
// be registered. We require protected visibility unless the variable has
1396+
// the DT_nohost modifier and does not need to be registered.
1397+
if (Context.getLangOpts().OpenMP &&
1398+
Context.getLangOpts().OpenMPIsTargetDevice && isa<VarDecl>(D) &&
1399+
D->hasAttr<OMPDeclareTargetDeclAttr>() &&
1400+
D->getAttr<OMPDeclareTargetDeclAttr>()->getDevType() !=
1401+
OMPDeclareTargetDeclAttr::DT_NoHost) {
1402+
GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
1403+
return;
1404+
}
1405+
13931406
// Set visibility for definitions, and for declarations if requested globally
13941407
// or set explicitly.
13951408
LinkageInfo LV = D->getLinkageAndVisibility();

clang/lib/CodeGen/Targets/AMDGPU.cpp

+7-6
Original file line numberDiff line numberDiff line change
@@ -308,12 +308,13 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
308308
if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
309309
return false;
310310

311-
return D->hasAttr<OpenCLKernelAttr>() ||
312-
(isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
313-
(isa<VarDecl>(D) &&
314-
(D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
315-
cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
316-
cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
311+
return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
312+
(D->hasAttr<OpenCLKernelAttr>() ||
313+
(isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
314+
(isa<VarDecl>(D) &&
315+
(D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
316+
cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
317+
cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
317318
}
318319

319320
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(

clang/test/OpenMP/declare_target_codegen.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
// CHECK-DAG: @dy = {{protected | }}global i32 0,
3232
// CHECK-DAG: @bbb = {{protected | }}global i32 0,
3333
// CHECK-DAG: weak constant %struct.__tgt_offload_entry { ptr @bbb,
34-
// CHECK-DAG: @ccc = external global i32,
34+
// CHECK-DAG: @ccc = external {{protected | }}global i32,
3535
// CHECK-DAG: @ddd = {{protected | }}global i32 0,
3636
// CHECK-DAG: @hhh_decl_tgt_ref_ptr = weak global ptr null
3737
// CHECK-DAG: @ggg_decl_tgt_ref_ptr = weak global ptr null

clang/test/OpenMP/declare_target_constexpr_codegen.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class A {
1616
public:
1717
static constexpr double pi = 3.141592653589793116;
1818
//.
19-
// CHECK: @_ZN1A2piE = linkonce_odr constant double 0x400921FB54442D18, comdat, align 8
19+
// CHECK: @_ZN1A2piE = linkonce_odr protected constant double 0x400921FB54442D18, comdat, align 8
2020
// CHECK: @_ZL9anotherPi = internal constant double 3.140000e+00, align 8
2121
// CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata"
2222
//.

clang/test/OpenMP/target_visibility.cpp

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
2-
// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
2+
// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
33
// expected-no-diagnostics
44

55

@@ -21,6 +21,14 @@ void B::bar() { A a; a.foo(); }
2121
void B::sbar() { A::sfoo(); }
2222
#pragma omp declare target to(B::bar, B::sbar)
2323

24+
[[gnu::visibility("hidden")]] extern const int x = 0;
25+
#pragma omp declare target to(x) device_type(nohost)
26+
27+
[[gnu::visibility("hidden")]] int y = 0;
28+
#pragma omp declare target to(y)
29+
30+
// CHECK-DAG: @x = hidden{{.*}} constant i32 0
31+
// CHECK-DAG: @y = protected{{.*}} i32 0
2432
// CHECK-DAG: define hidden void @_ZN1B4sbarEv()
2533
// CHECK-DAG: define linkonce_odr hidden void @_ZN1A4sfooEv()
2634
// CHECK-DAG: define hidden void @_ZN1B3barEv(

llvm/include/llvm/Config/llvm-config.h.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
/* Indicate that this is LLVM compiled from the amd-gfx branch. */
1818
#define LLVM_HAVE_BRANCH_AMD_GFX
19-
#define LLVM_MAIN_REVISION 476930
19+
#define LLVM_MAIN_REVISION 476933
2020

2121
/* Define if LLVM_ENABLE_DUMP is enabled */
2222
#cmakedefine LLVM_ENABLE_DUMP

llvm/lib/CodeGen/MachineFunctionPass.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
8888
MF.print(OS);
8989
}
9090

91+
MFProps.reset(ClearedProperties);
92+
9193
bool RV = runOnMachineFunction(MF);
9294

9395
if (ShouldEmitSizeRemarks) {
@@ -114,7 +116,6 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
114116
}
115117

116118
MFProps.set(SetProperties);
117-
MFProps.reset(ClearedProperties);
118119

119120
// For --print-changed, print if the serialized MF has changed. Modes other
120121
// than quiet/verbose are unimplemented and treated the same as 'quiet'.

llvm/lib/CodeGen/RegisterCoalescer.cpp

-8
Original file line numberDiff line numberDiff line change
@@ -4167,14 +4167,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
41674167
else
41684168
JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
41694169

4170-
// FIXME: MachineFunctionProperties cannot express the required pre-property
4171-
// no-SSA. When running a MIR testcase without any virtual register defs, the
4172-
// MIR parser assumes SSA. MachineFunctionPass::getClearedProperties is called
4173-
// after the pass is run, so the properties at this point say it's an SSA
4174-
// function. Forcibly clear it here so -verify-coalescing doesn't complain
4175-
// after multiple virtual register defs are introduced.
4176-
MRI->leaveSSA();
4177-
41784170
// If there are PHIs tracked by debug-info, they will need updating during
41794171
// coalescing. Build an index of those PHIs to ease updating.
41804172
SlotIndexes *Slots = LIS->getSlotIndexes();

mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp

+89-5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
1818
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
1919
#include "mlir/Dialect/Vector/IR/VectorOps.h"
20+
#include "mlir/Support/MathExtras.h"
2021
#include "mlir/Transforms/DialectConversion.h"
2122
#include "llvm/Support/FormatVariadic.h"
2223
#include "llvm/Support/MathExtras.h"
@@ -209,6 +210,76 @@ struct ConvertMemRefLoad final : OpConversionPattern<memref::LoadOp> {
209210
return success();
210211
}
211212
};
213+
214+
//===----------------------------------------------------------------------===//
215+
// ConvertMemRefSubview
216+
//===----------------------------------------------------------------------===//
217+
218+
/// Emulating narrow ints on subview have limited support, supporting only
219+
/// static offset and size and stride of 1. Ideally, the subview should be
220+
/// folded away before running narrow type emulation, and this pattern would
221+
/// never run. This pattern is mostly used for testing purposes.
222+
struct ConvertMemRefSubview final : OpConversionPattern<memref::SubViewOp> {
223+
using OpConversionPattern::OpConversionPattern;
224+
225+
LogicalResult
226+
matchAndRewrite(memref::SubViewOp op, OpAdaptor adaptor,
227+
ConversionPatternRewriter &rewriter) const override {
228+
MemRefType newTy =
229+
dyn_cast<MemRefType>(getTypeConverter()->convertType(op.getType()));
230+
if (!newTy) {
231+
return rewriter.notifyMatchFailure(
232+
op->getLoc(),
233+
llvm::formatv("failed to convert memref type: {0}", op.getType()));
234+
}
235+
236+
auto convertedElementType = newTy.getElementType();
237+
auto oldElementType = op.getType().getElementType();
238+
int srcBits = oldElementType.getIntOrFloatBitWidth();
239+
int dstBits = convertedElementType.getIntOrFloatBitWidth();
240+
if (dstBits % srcBits != 0) {
241+
return rewriter.notifyMatchFailure(
242+
op, "only dstBits % srcBits == 0 supported");
243+
}
244+
245+
// Only support offset for 1-D subview.
246+
if (op.getType().getRank() != 1) {
247+
return rewriter.notifyMatchFailure(
248+
op->getLoc(), "subview with rank > 1 is not supported");
249+
}
250+
251+
// Only support stride of 1.
252+
if (op.getStaticStride(0) != 1) {
253+
return rewriter.notifyMatchFailure(
254+
op->getLoc(), "subview with stride != 1 is not supported");
255+
}
256+
257+
int64_t size = op.getStaticSize(0);
258+
int64_t offset = op.getStaticOffset(0);
259+
// Only support static sizes and offsets.
260+
if (size == ShapedType::kDynamic || offset == ShapedType::kDynamic) {
261+
return rewriter.notifyMatchFailure(
262+
op->getLoc(), "subview with dynamic size or offset is not supported");
263+
}
264+
265+
int elementsPerByte = dstBits / srcBits;
266+
if (offset % elementsPerByte != 0) {
267+
return rewriter.notifyMatchFailure(
268+
op->getLoc(),
269+
"subview with offset not multiple of elementsPerByte is not "
270+
"supported");
271+
}
272+
273+
size = ceilDiv(size, elementsPerByte);
274+
offset = offset / elementsPerByte;
275+
276+
rewriter.replaceOpWithNewOp<memref::SubViewOp>(
277+
op, newTy, *adaptor.getODSOperands(0).begin(), offset, size,
278+
op.getStaticStrides());
279+
return success();
280+
}
281+
};
282+
212283
} // end anonymous namespace
213284

214285
//===----------------------------------------------------------------------===//
@@ -220,9 +291,9 @@ void memref::populateMemRefNarrowTypeEmulationPatterns(
220291
RewritePatternSet &patterns) {
221292

222293
// Populate `memref.*` conversion patterns.
223-
patterns
224-
.add<ConvertMemRefAlloc, ConvertMemRefLoad, ConvertMemRefAssumeAlignment>(
225-
typeConverter, patterns.getContext());
294+
patterns.add<ConvertMemRefAlloc, ConvertMemRefLoad,
295+
ConvertMemRefAssumeAlignment, ConvertMemRefSubview>(
296+
typeConverter, patterns.getContext());
226297
memref::populateResolveExtractStridedMetadataPatterns(patterns);
227298
}
228299

@@ -271,9 +342,22 @@ void memref::populateMemRefNarrowTypeEmulationConversions(
271342
return std::nullopt;
272343

273344
StridedLayoutAttr layoutAttr;
345+
// If the offset is 0, we do not need a strided layout as the stride is
346+
// 1, so we only use the strided layout if the offset is not 0.
274347
if (offset != 0) {
275-
layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
276-
ArrayRef<int64_t>{1});
348+
if (offset == ShapedType::kDynamic) {
349+
layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
350+
ArrayRef<int64_t>{1});
351+
} else {
352+
// Check if the number of bytes are a multiple of the loadStoreWidth
353+
// and if so, divide it by the loadStoreWidth to get the offset.
354+
if ((offset * width) % loadStoreWidth != 0)
355+
return std::nullopt;
356+
offset = (offset * width) / loadStoreWidth;
357+
358+
layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
359+
ArrayRef<int64_t>{1});
360+
}
277361
}
278362

279363
return MemRefType::get(getLinearizedShape(ty, width, loadStoreWidth),

mlir/test/Dialect/MemRef/emulate-narrow-type.mlir

+19
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,22 @@ func.func @rank_zero_memref() -> i4 {
155155
// CHECK32: %[[LOAD:.+]] = memref.load %[[ALLOC]][] : memref<i32>
156156
// CHECK32: %[[TRUNC:.+]] = arith.trunci %[[LOAD]] : i32 to i4
157157
// CHECK32: return %[[TRUNC]]
158+
159+
// -----
160+
161+
func.func @memref_strided_i4(%idx : index) -> i4 {
162+
%arr = memref.alloc() : memref<128xi4>
163+
%subview = memref.subview %arr[32] [32] [1] : memref<128xi4> to memref<32xi4, strided<[1], offset:32>>
164+
%1 = memref.load %subview[%idx] : memref<32xi4, strided<[1], offset:32>>
165+
return %1 : i4
166+
}
167+
168+
// CHECK-LABEL: func @memref_strided_i4
169+
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<64xi8>
170+
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][16] [16] [1] : memref<64xi8> to memref<16xi8, strided<[1], offset: 16>>
171+
// CHECK: %[[LOAD:.+]] = memref.load %[[SUBVIEW]]
172+
173+
// CHECK32-LABEL: func @memref_strided_i4
174+
// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<16xi32>
175+
// CHECK32: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][4] [4] [1] : memref<16xi32> to memref<4xi32, strided<[1], offset: 4>>
176+
// CHECK32: %[[LOAD:.+]] = memref.load %[[SUBVIEW]]

0 commit comments

Comments
 (0)