-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[StableHash] Implement stable global name for the hash computation #106156
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-adt Author: Kyungwoo Lee (kyulee-com) ChangesLLVM often extends global names by adding suffixes to distinguish unique identities. However, these suffixes are not always stable across different runs and build environments. To address this issue, I implemented Full diff: https://github.com/llvm/llvm-project/pull/106156.diff 4 Files Affected:
diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h
index 7778f5d7c3a1c3..23a878b264da88 100644
--- a/llvm/include/llvm/ADT/StableHashing.h
+++ b/llvm/include/llvm/ADT/StableHashing.h
@@ -50,6 +50,22 @@ inline stable_hash stable_hash_combine(stable_hash A, stable_hash B,
return stable_hash_combine(Hashes);
}
+// Removes suffixes introduced by LLVM from the name to enhance stability and
+// maintain closeness to the original name across different builds.
+inline StringRef get_stable_name(StringRef Name) {
+ auto [P1, S1] = Name.rsplit(".llvm.");
+ auto [P2, S2] = P1.rsplit(".__uniq.");
+ return P2;
+}
+
+// Generates a consistent hash value for a given input name across different
+// program executions and environments. This function first converts the input
+// name into a stable form using the `get_stable_name` function, and then
+// computes a hash of this stable name.
+inline stable_hash stable_hash_name(StringRef Name) {
+ return xxh3_64bits(get_stable_name(Name));
+}
+
} // namespace llvm
#endif
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 916acbf2d2cbf9..e88c6f37bbebd0 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -95,13 +95,21 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
case MachineOperand::MO_Metadata:
StableHashBailingMetadataUnsupported++;
return 0;
- case MachineOperand::MO_GlobalAddress:
- StableHashBailingGlobalAddress++;
- return 0;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+ if (!GV->hasName()) {
+ StableHashBailingGlobalAddress++;
+ return 0;
+ }
+ auto Name = GV->getName();
+ return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+ stable_hash_name(Name), MO.getOffset());
+ }
+
case MachineOperand::MO_TargetIndex: {
if (const char *Name = MO.getTargetIndexName())
return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
- xxh3_64bits(Name), MO.getOffset());
+ stable_hash_name(Name), MO.getOffset());
StableHashBailingTargetIndexNoName++;
return 0;
}
@@ -113,7 +121,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
case MachineOperand::MO_ExternalSymbol:
return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
- MO.getOffset(), xxh3_64bits(MO.getSymbolName()));
+ MO.getOffset(),
+ stable_hash_name(MO.getSymbolName()));
case MachineOperand::MO_RegisterMask:
case MachineOperand::MO_RegisterLiveOut: {
@@ -149,7 +158,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
case MachineOperand::MO_MCSymbol: {
auto SymbolName = MO.getMCSymbol()->getName();
return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
- xxh3_64bits(SymbolName));
+ stable_hash_name(SymbolName));
}
case MachineOperand::MO_CFIIndex:
return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
diff --git a/llvm/unittests/MIR/CMakeLists.txt b/llvm/unittests/MIR/CMakeLists.txt
index d6aff46eff07b7..206094266ba148 100644
--- a/llvm/unittests/MIR/CMakeLists.txt
+++ b/llvm/unittests/MIR/CMakeLists.txt
@@ -15,6 +15,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_unittest(MIRTests
MachineMetadata.cpp
+ MachineStableHashTest.cpp
)
target_link_libraries(MIRTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/MIR/MachineStableHashTest.cpp b/llvm/unittests/MIR/MachineStableHashTest.cpp
new file mode 100644
index 00000000000000..19cb7ebb25cdfd
--- /dev/null
+++ b/llvm/unittests/MIR/MachineStableHashTest.cpp
@@ -0,0 +1,143 @@
+//===- MachineStableHashTest.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineStableHash.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/FileCheck/FileCheck.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class MachineStableHashTest : public testing::Test {
+public:
+ MachineStableHashTest() {}
+
+protected:
+ LLVMContext Context;
+ std::unique_ptr<Module> M;
+ std::unique_ptr<MIRParser> MIR;
+
+ static void SetUpTestCase() {
+ InitializeAllTargetInfos();
+ InitializeAllTargets();
+ InitializeAllTargetMCs();
+ }
+
+ void SetUp() override { M = std::make_unique<Module>("Dummy", Context); }
+
+ std::unique_ptr<LLVMTargetMachine>
+ createTargetMachine(std::string TT, StringRef CPU, StringRef FS) {
+ std::string Error;
+ const Target *T = TargetRegistry::lookupTarget(TT, Error);
+ if (!T)
+ return nullptr;
+ TargetOptions Options;
+ return std::unique_ptr<LLVMTargetMachine>(
+ static_cast<LLVMTargetMachine *>(T->createTargetMachine(
+ TT, CPU, FS, Options, std::nullopt, std::nullopt)));
+ }
+
+ std::unique_ptr<Module> parseMIR(const TargetMachine &TM, StringRef MIRCode,
+ MachineModuleInfo &MMI) {
+ SMDiagnostic Diagnostic;
+ std::unique_ptr<MemoryBuffer> MBuffer = MemoryBuffer::getMemBuffer(MIRCode);
+ MIR = createMIRParser(std::move(MBuffer), Context);
+ if (!MIR)
+ return nullptr;
+
+ std::unique_ptr<Module> Mod = MIR->parseIRModule();
+ if (!Mod)
+ return nullptr;
+
+ Mod->setDataLayout(TM.createDataLayout());
+
+ if (MIR->parseMachineFunctions(*Mod, MMI)) {
+ M.reset();
+ return nullptr;
+ }
+
+ return Mod;
+ }
+};
+
+TEST_F(MachineStableHashTest, StableGlobalName) {
+ auto TM = createTargetMachine(("aarch64--"), "", "");
+ if (!TM)
+ GTEST_SKIP();
+ StringRef MIRString = R"MIR(
+--- |
+ define void @f1() { ret void }
+ define void @f2() { ret void }
+ define void @f3() { ret void }
+ define void @f4() { ret void }
+ declare void @goo()
+ declare void @goo.llvm.123()
+ declare void @goo.__uniq.456()
+ declare void @goo.invalid.789()
+...
+---
+name: f1
+alignment: 16
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 16
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $lr
+ BL @goo
+ RET undef $lr
+
+...
+---
+name: f2
+body: |
+ bb.0:
+ liveins: $lr
+ BL @goo.llvm.123
+ RET undef $lr
+...
+---
+name: f3
+body: |
+ bb.0:
+ liveins: $lr
+ BL @goo.__uniq.456
+ RET undef $lr
+...
+---
+name: f4
+body: |
+ bb.0:
+ liveins: $lr
+ BL @goo.invalid.789
+ RET undef $lr
+...
+)MIR";
+ MachineModuleInfo MMI(TM.get());
+ M = parseMIR(*TM, MIRString, MMI);
+ ASSERT_TRUE(M);
+ auto *MF1 = MMI.getMachineFunction(*M->getFunction("f1"));
+ auto *MF2 = MMI.getMachineFunction(*M->getFunction("f2"));
+ auto *MF3 = MMI.getMachineFunction(*M->getFunction("f3"));
+ auto *MF4 = MMI.getMachineFunction(*M->getFunction("f4"));
+
+ // Expect the suffix, `.llvm.{number}` to be ignored.
+ EXPECT_EQ(stableHashValue(*MF1), stableHashValue(*MF2));
+ // Expect the suffix, `.__uniq.{number}` to be ignored.
+ EXPECT_EQ(stableHashValue(*MF1), stableHashValue(*MF3));
+ // Do not ignore `.invalid.{number}`.
+ EXPECT_NE(stableHashValue(*MF1), stableHashValue(*MF4));
+}
|
inline StringRef get_stable_name(StringRef Name) { | ||
auto [P1, S1] = Name.rsplit(".llvm."); | ||
auto [P2, S2] = P1.rsplit(".__uniq."); | ||
return P2; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similar code is found in a few more place.
I wonder if it makes sense to put this code somewhere so that the above functions can be replaced with it. That might be better off in a separate PR, though.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the code segments appear similar, but they could be implied slightly differently depending on the client. Specifically, in this instance, I am planning to further extend the functionality for the upcoming global outlining feature -- #90074. I intend to introduce an additional suffix .content.{number}
to the outlined function names, which will enable us to deduce the identities of target functions even when their names vary -- in that case, use the content hash from the suffix, rather than ignoring it . So, I think the usage here is primarily focused on accurately identifying the stable target based on the name.
// Generates a consistent hash value for a given input name across different | ||
// program executions and environments. This function first converts the input | ||
// name into a stable form using the `get_stable_name` function, and then | ||
// computes a hash of this stable name. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess foo.llvm.1234
would have the same hash as foo.llvm.5678
. It might be worth mentioning that hash collisions are not uncommon.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added comments.
case MachineOperand::MO_GlobalAddress: { | ||
const GlobalValue *GV = MO.getGlobal(); | ||
if (!GV->hasName()) { | ||
StableHashBailingGlobalAddress++; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
StableHashBailingGlobalAddress++; | |
++StableHashBailingGlobalAddress; |
Prefer preincrement, but you might also want to change the rest of the stats variables.
https://llvm.org/docs/CodingStandards.html#prefer-preincrement
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated the increments of the stats variables.
// Expect the suffix, `.llvm.{number}` to be ignored. | ||
EXPECT_EQ(stableHashValue(*MF1), stableHashValue(*MF2)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Expect the suffix, `.llvm.{number}` to be ignored. | |
EXPECT_EQ(stableHashValue(*MF1), stableHashValue(*MF2)); | |
EXPECT_EQ(stableHashValue(*MF1), stableHashValue(*MF2)) << "Expect the suffix, `.llvm.{number}` to be ignored."; |
It would be nice to have the explanation in the error message if the test breaks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the suggestion. Added the expected output message.
LLVM often extends global names by adding suffixes to distinguish unique identities. However, these suffixes are not always stable across different runs and build environments. To address this issue, I implemented `get_stable_name` to ignore such suffixes and obtain the original name. This approach is not new, as PGO or Bolt already handle this issue similarly. Using the stable name obtained from `get_stable_name`, I implemented `stable_hash_name` while utilizing the same underlying `xxh3_64bit` algorithm as before.
612853f
to
39e6559
Compare
@@ -50,6 +50,22 @@ inline stable_hash stable_hash_combine(stable_hash A, stable_hash B, | |||
return stable_hash_combine(Hashes); | |||
} | |||
|
|||
// Removes suffixes introduced by LLVM from the name to enhance stability and | |||
// maintain closeness to the original name across different builds. | |||
inline StringRef get_stable_name(StringRef Name) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I prefer to keep this function separate instead of being directly inlined into the stable_hash_name
below in place, as I plan to extend it further for global outlining or merging.
inline StringRef get_stable_name(StringRef Name) { | ||
auto [P1, S1] = Name.rsplit(".llvm."); | ||
auto [P2, S2] = P1.rsplit(".__uniq."); | ||
return P2; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the code segments appear similar, but they could be implied slightly differently depending on the client. Specifically, in this instance, I am planning to further extend the functionality for the upcoming global outlining feature -- #90074. I intend to introduce an additional suffix .content.{number}
to the outlined function names, which will enable us to deduce the identities of target functions even when their names vary -- in that case, use the content hash from the suffix, rather than ignoring it . So, I think the usage here is primarily focused on accurately identifying the stable target based on the name.
// Expect the suffix, `.llvm.{number}` to be ignored. | ||
EXPECT_EQ(stableHashValue(*MF1), stableHashValue(*MF2)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the suggestion. Added the expected output message.
case MachineOperand::MO_GlobalAddress: { | ||
const GlobalValue *GV = MO.getGlobal(); | ||
if (!GV->hasName()) { | ||
StableHashBailingGlobalAddress++; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated the increments of the stats variables.
// Generates a consistent hash value for a given input name across different | ||
// program executions and environments. This function first converts the input | ||
// name into a stable form using the `get_stable_name` function, and then | ||
// computes a hash of this stable name. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Thank you for the prompt review! I will address any further issues in the next PR if they arise. |
…lvm#106156) LLVM often extends global names by adding suffixes to distinguish unique identities. However, these suffixes are not always stable across different runs and build environments. To address this issue, I implemented `get_stable_name` to ignore such suffixes and obtain the original name. This approach is not new, as PGO or Bolt already handle this issue similarly. Using the stable name obtained from `get_stable_name`, I implemented `stable_hash_name` while utilizing the same underlying `xxh3_64bit` algorithm as before. (cherry picked from commit f9ad249)
LLVM often extends global names by adding suffixes to distinguish unique identities. However, these suffixes are not always stable across different runs and build environments. To address this issue, I implemented
get_stable_name
to ignore such suffixes and obtain the original name. This approach is not new, as PGO or Bolt already handle this issue similarly. Using the stable name obtained fromget_stable_name
, I implementedstable_hash_name
while utilizing the same underlyingxxh3_64bit
algorithm as before.