Skip to content

[AArch64][SME] Add remarks to flag lazy ZA saves, and SMSTART/SMSTOP transitions #68255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 6, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
@@ -31,6 +31,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -7362,6 +7363,19 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
CalleeAttrs = SMEAttrs(ES->getSymbol());

auto DescribeCallsite =
[&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Would it be easier to have a function that takes just a StringRef, so that you can do EmitSMERemarkForCall("requires a streaming mode transition");? You can then hide the construction of the ORE and OptimizationRemarkAnalysis in this lambda.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking the current format would be a bit nicer for additions, i.e. if/when ZA liveness starts being tracked, the remark could be amended to mention the number of za slices that should be preserved.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit wary of writing code in anticipation of future changes which may never happen. We can always rewrite the code in this form later if that makes more sense. That said, I don't feel too strongly about it since the current code isn't bad either.

R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
R << ore::NV("Callee", ES->getSymbol());
else if (CLI.CB && CLI.CB->getCalledFunction())
R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
else
R << "unknown callee";
R << "'";
return R;
};

bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
if (RequiresLazySave) {
SDValue NumZaSaveSlices;
@@ -7388,13 +7402,38 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
TPIDR2ObjAddr);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
CLI.CB)
: OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
&MF.getFunction());
DescribeCallsite(R) << " sets up a lazy save for ZA";
if (CalleeAttrs.preservesZA())
R << ", but callee preserves ZA, so we request 0 slices to be saved";
else
R << ", and we request that all slices be saved";
R << ore::setExtraArgs()
<< ore::NV("CalleePreservesZA", CalleeAttrs.preservesZA());
return R;
});
}

SDValue PStateSM;
std::optional<bool> RequiresSMChange =
CallerAttrs.requiresSMChange(CalleeAttrs);
if (RequiresSMChange)
if (RequiresSMChange) {
PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
CLI.CB)
: OptimizationRemarkAnalysis("sme", "SMETransition",
&MF.getFunction());
DescribeCallsite(R) << " requires a streaming mode transition";
return R;
});
}

// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -mattr=+sme --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s

declare void @private_za_callee()
declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved"
declare float @llvm.cos.f32(float)

; A caller marked "aarch64_pstate_za_shared" makes a single call to a callee
; that is declared without any ZA attribute. The expected remark reports that a
; lazy save is set up and that all slices are requested to be saved.
define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved
call void @private_za_callee()
ret void
}

; Same caller/callee attributes as the single-call case, but with two
; consecutive calls. One remark is expected per call site, so the same message
; is matched twice.
define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved
call void @private_za_callee()
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved
call void @private_za_callee()
ret void
}

; The callee here is declared with "aarch64_pstate_za_preserved". The expected
; remark still reports a lazy save being set up, but with the alternative
; wording stating that 0 slices are requested to be saved.
define void @test_lazy_save_preserved_callee() nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_preserved_callee' to 'private_za_preserved_callee' sets up a lazy save for ZA, but callee preserves ZA, so we request 0 slices to be saved
call void @private_za_preserved_callee()
ret void
}

; The call site is the @llvm.cos.f32 intrinsic rather than a direct call to a
; declared function. The expected remark names the callee as 'cosf'
; (presumably the library call the intrinsic is expanded to -- confirm against
; the lowering code).
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA, and we request that all slices be saved
%res = call float @llvm.cos.f32(float %a)
ret float %res
}
90 changes: 90 additions & 0 deletions llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme,+sve -verify-machineinstrs --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s

declare void @normal_callee()
declare void @streaming_callee() "aarch64_pstate_sm_enabled"
declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"

; Caller has no streaming attribute; callee is declared with
; "aarch64_pstate_sm_enabled". A streaming mode transition remark is expected.
; CHECK: remark: <unknown>:0:0: call from 'normal_caller_streaming_callee' to 'streaming_callee' requires a streaming mode transition
define void @normal_caller_streaming_callee() nounwind {
call void @streaming_callee()
ret void;
}

; The reverse direction: caller is marked "aarch64_pstate_sm_enabled" and the
; callee is declared without a streaming attribute. A transition remark is
; expected here as well.
; CHECK: remark: <unknown>:0:0: call from 'streaming_caller_normal_callee' to 'normal_callee' requires a streaming mode transition
define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" {
call void @normal_callee()
ret void;
}

; Caller and callee are both marked "aarch64_pstate_sm_enabled". The negative
; directive below expects no output mentioning this function's name.
; CHECK-NOT: streaming_caller_streaming_callee
define void @streaming_caller_streaming_callee() nounwind "aarch64_pstate_sm_enabled" {
call void @streaming_callee()
ret void;
}

; Caller is marked "aarch64_pstate_sm_enabled" and the callee is declared with
; "aarch64_pstate_sm_compatible". The negative directive below expects no
; output mentioning this function's name.
; CHECK-NOT: streaming_caller_streaming_compatible_callee
define void @streaming_caller_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" {
call void @streaming_compatible_callee()
ret void;
}

; Indirect call through the pointer argument %p, with the
; "aarch64_pstate_sm_enabled" attribute placed on the call site itself. There
; is no named callee, so the expected remark text contains 'unknown callee'.
; CHECK: remark: <unknown>:0:0: call from 'call_to_function_pointer_streaming_enabled' to 'unknown callee' requires a streaming mode transition
define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
call void %p() "aarch64_pstate_sm_enabled"
ret void
}

; Non-streaming caller that takes a fixed-width <4 x i32> argument, calls the
; streaming callee, and then returns the argument unchanged, so the vector
; value is live across the call. A transition remark is expected.
; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_simdfp' to 'streaming_callee' requires a streaming mode transition
define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
call void @streaming_callee()
ret <4 x i32> %x;
}

; Same shape as the fixed-width vector case, but the value kept live across the
; call to the streaming callee is a scalable <vscale x 4 x i32> vector.
; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_sve' to 'streaming_callee' requires a streaming mode transition
define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
call void @streaming_callee()
ret <vscale x 4 x i32> %x;
}

; Two back-to-back calls to the streaming callee with a scalable vector live
; across both. The remark is matched twice, once per call site.
; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_sve_duplicate' to 'streaming_callee' requires a streaming mode transition
; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_sve_duplicate' to 'streaming_callee' requires a streaming mode transition
define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind {
call void @streaming_callee()
call void @streaming_callee()
ret <vscale x 4 x i32> %x;
}

; Streaming caller ("aarch64_pstate_sm_enabled") that calls the @llvm.cos.f64
; intrinsic with the `fast` flag and adds the result to the input. The expected
; remark names the callee as 'cos' (presumably the library call the intrinsic
; lowers to -- confirm against the lowering code).
; CHECK: remark: <unknown>:0:0: call from 'call_to_intrinsic_without_chain' to 'cos' requires a streaming mode transition
define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_enabled" {
entry:
%res = call fast double @llvm.cos.f64(double %x)
%res.fadd = fadd fast double %res, %x
ret double %res.fadd
}

declare double @llvm.cos.f64(double)

; Non-streaming caller whose call to the streaming callee is marked `tail`.
; A transition remark is still expected for this call site. (The function name
; suggests the tail call is not performed when a mode change is needed --
; confirm against the lowering code.)
; CHECK: remark: <unknown>:0:0: call from 'disable_tailcallopt' to 'streaming_callee' requires a streaming mode transition
define void @disable_tailcallopt() nounwind {
tail call void @streaming_callee()
ret void;
}

; Caller uses attribute group #0 (defined at the end of the file), which
; includes "aarch64_pstate_sm_enabled". It allocates three scalable-vector
; stack objects and passes their addresses, plus the result of
; @llvm.aarch64.sme.cntsb, to @foo, which is declared without a streaming
; attribute. It then loads the first object back and returns its element 0.
; A transition remark is expected for the call to @foo.
; CHECK: remark: <unknown>:0:0: call from 'call_to_non_streaming_pass_sve_objects' to 'foo' requires a streaming mode transition
define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 {
entry:
%Data1 = alloca <vscale x 16 x i8>, align 16
%Data2 = alloca <vscale x 16 x i8>, align 16
%Data3 = alloca <vscale x 16 x i8>, align 16
%0 = tail call i64 @llvm.aarch64.sme.cntsb()
call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0)
%1 = load <vscale x 16 x i8>, ptr %Data1, align 16
%vecext = extractelement <vscale x 16 x i8> %1, i64 0
ret i8 %vecext
}

declare i64 @llvm.aarch64.sme.cntsb()

declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)

attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }