-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[lldb] Support parsing the Wasm symbol table #153093
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-lldb Author: Jonas Devlieghere (JDevlieghere) Changes: This PR adds support for parsing the WebAssembly symbol table. The symbol table is encoded in the "names" section and contains names and indexes into other sections. For now we only support parsing function (code) symbols. The result is that you can set breakpoints by symbol name, while previously breakpoints by name required debug info (DWARF). This is also necessary for Swift, which checks for the presence of `swift_release` as a heuristic to determine if there's a static Swift stdlib. Full diff: https://github.com/llvm/llvm-project/pull/153093.diff 7 Files Affected:
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index c63c1f03e58da..fec9fdef44df9 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -777,6 +777,7 @@ enum SectionType {
eSectionTypeLLDBTypeSummaries,
eSectionTypeLLDBFormatters,
eSectionTypeSwiftModules,
+ eSectionTypeWasmName,
};
FLAGS_ENUM(EmulateInstructionOptions){
diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp
index 27dcf987b0278..02d9d86fe5374 100644
--- a/lldb/source/Core/Section.cpp
+++ b/lldb/source/Core/Section.cpp
@@ -153,6 +153,8 @@ const char *Section::GetTypeAsCString() const {
return "lldb-formatters";
case eSectionTypeSwiftModules:
return "swift-modules";
+ case eSectionTypeWasmName:
+ return "wasm-name";
case eSectionTypeOther:
return "regular";
}
@@ -415,6 +417,7 @@ bool Section::ContainsOnlyDebugInfo() const {
case eSectionTypeCompactUnwind:
case eSectionTypeGoSymtab:
case eSectionTypeAbsoluteAddress:
+ case eSectionTypeWasmName:
case eSectionTypeOther:
// Used for "__dof_cache" in mach-o or ".debug" for COFF which isn't debug
// information that we parse at all. This was causing system files with no
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index 13df6e2f26b53..d7cb60e3f0c38 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -1156,6 +1156,7 @@ AddressClass ObjectFileMachO::GetAddressClass(lldb::addr_t file_addr) {
case eSectionTypeDataObjCMessageRefs:
case eSectionTypeDataObjCCFStrings:
case eSectionTypeGoSymtab:
+ case eSectionTypeWasmName:
return AddressClass::eData;
case eSectionTypeDebug:
diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
index b1efd25949379..30fec5dd759e7 100644
--- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
+++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
@@ -50,7 +50,8 @@ static bool ValidateModuleHeader(const DataBufferSP &data_sp) {
return version == llvm::wasm::WasmVersion;
}
-static std::optional<ConstString>
+// FIXME: Use lldb::DataExtractor instead of llvm::DataExtractor.
+static std::optional<std::string>
GetWasmString(llvm::DataExtractor &data, llvm::DataExtractor::Cursor &c) {
// A Wasm string is encoded as a vector of UTF-8 codes.
// Vectors are encoded with their u32 length followed by the element
@@ -72,8 +73,7 @@ GetWasmString(llvm::DataExtractor &data, llvm::DataExtractor::Cursor &c) {
return std::nullopt;
}
- llvm::StringRef str = toStringRef(llvm::ArrayRef(str_storage));
- return ConstString(str);
+ return std::string(toStringRef(llvm::ArrayRef(str_storage)));
}
char ObjectFileWasm::ID;
@@ -182,7 +182,7 @@ bool ObjectFileWasm::DecodeNextSection(lldb::offset_t *offset_ptr) {
// identifying the custom section, followed by an uninterpreted sequence
// of bytes.
lldb::offset_t prev_offset = c.tell();
- std::optional<ConstString> sect_name = GetWasmString(data, c);
+ std::optional<std::string> sect_name = GetWasmString(data, c);
if (!sect_name)
return false;
@@ -191,7 +191,7 @@ bool ObjectFileWasm::DecodeNextSection(lldb::offset_t *offset_ptr) {
uint32_t section_length = payload_len - (c.tell() - prev_offset);
m_sect_infos.push_back(section_info{*offset_ptr + c.tell(), section_length,
- section_id, *sect_name});
+ section_id, ConstString(*sect_name)});
*offset_ptr += (c.tell() + section_length);
} else if (section_id <= llvm::wasm::WASM_SEC_LAST_KNOWN) {
m_sect_infos.push_back(section_info{*offset_ptr + c.tell(),
@@ -248,12 +248,93 @@ bool ObjectFileWasm::ParseHeader() {
return true;
}
-void ObjectFileWasm::ParseSymtab(Symtab &symtab) {}
+/// Decodes the function bodies of the Wasm code section into address ranges.
+///
+/// The section payload is read as a ULEB128 function count followed, for each
+/// function, by a ULEB128 body size and the body bytes. Ranges are returned in
+/// section order, so element N describes the N-th function body.
+///
+/// Both the count and the sizes come from the file and are not trusted:
+/// parsing stops at the first entry whose size cannot be decoded or whose body
+/// would extend past the end of the section, and the ranges decoded up to that
+/// point are returned.
+static std::vector<AddressRange> ParseFunctions(SectionSP code_section_sp) {
+  DataExtractor code_section_data;
+  code_section_sp->GetSectionData(code_section_data);
+  const lldb::offset_t section_size = code_section_data.GetByteSize();
+  lldb::offset_t offset = 0;
+
+  // Keep the full 64-bit value; narrowing to 32 bits would silently turn an
+  // out-of-range count into a plausible-looking one.
+  const uint64_t function_count = code_section_data.GetULEB128(&offset);
+
+  // Every function needs at least one byte (its size ULEB), so the number of
+  // bytes left in the section bounds how many entries can really follow. Use
+  // that bound for the reservation instead of the untrusted count.
+  const uint64_t bytes_after_count =
+      offset < section_size ? section_size - offset : 0;
+  const uint64_t reserve_count =
+      function_count < bytes_after_count ? function_count : bytes_after_count;
+
+  std::vector<AddressRange> functions;
+  functions.reserve(static_cast<size_t>(reserve_count));
+
+  for (uint64_t i = 0; i < function_count; ++i) {
+    const lldb::offset_t size_offset = offset;
+    const uint64_t function_size = code_section_data.GetULEB128(&offset);
+
+    // A decoded ULEB consumes at least one byte. If the offset did not move
+    // there was nothing left to decode, so stop instead of emitting an empty
+    // range for every remaining claimed function.
+    if (offset == size_offset)
+      break;
+
+    // Reject a body that claims to run past the end of the section.
+    if (offset > section_size || function_size > section_size - offset)
+      break;
+
+    // llvm-objdump considers the ULEB with the function size to be part of the
+    // function. We can't do that here because that would break symbolic
+    // breakpoints, as that address is never executed.
+    functions.emplace_back(code_section_sp, offset, function_size);
+    offset += function_size;
+  }
+
+  return functions;
+}
+
+/// Fills \p symtab with one code symbol per named function.
+///
+/// The function bodies are first decoded from the code section so they can be
+/// looked up by index. The "name" section is then walked subsection by
+/// subsection: entries of the function-name subsection become symbols, every
+/// other subsection is skipped using its declared size. Entries whose index
+/// does not fall inside the decoded function list are ignored.
+///
+/// Returns without touching \p symtab when either section is missing. If the
+/// name section cannot be decoded, the error is logged and the symtab is left
+/// unfinalized.
+///
+/// NOTE(review): name-section indices presumably live in the module's function
+/// index space, which per the Wasm spec also counts imported functions, while
+/// the list used here only holds bodies from the code section. Confirm whether
+/// an import-count offset is needed for modules that import functions.
+void ObjectFileWasm::ParseSymtab(Symtab &symtab) {
+  assert(m_sections_up && "sections must be parsed");
+  Log *log = GetLog(LLDBLog::Object);
+
+  // Function bodies, indexable by their position in the code section.
+  SectionSP code_sp =
+      m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false);
+  if (!code_sp)
+    return;
+  std::vector<AddressRange> function_ranges = ParseFunctions(code_sp);
+
+  SectionSP name_sp =
+      m_sections_up->FindSectionByType(lldb::eSectionTypeWasmName, false);
+  if (!name_sp)
+    return;
+
+  DataExtractor name_payload;
+  name_sp->GetSectionData(name_payload);
+  llvm::DataExtractor data = name_payload.GetAsLLVM();
+  llvm::DataExtractor::Cursor cursor(0);
+
+  uint32_t next_symbol_id = 0;
+  while (cursor && cursor.tell() < data.size()) {
+    // Each subsection starts with a one-byte type and a ULEB128 payload size.
+    const uint8_t subsection_type = data.getU8(cursor);
+    const uint32_t subsection_size = data.getULEB128(cursor);
+
+    // Only function names are turned into symbols for now; step over the
+    // payload of anything else (data segments, globals, locals, ...).
+    if (subsection_type != llvm::wasm::WASM_NAMES_FUNCTION) {
+      cursor.seek(cursor.tell() + subsection_size);
+      continue;
+    }
+
+    const uint32_t name_count = data.getULEB128(cursor);
+    for (uint32_t remaining = name_count; cursor && remaining > 0;
+         --remaining) {
+      const uint32_t function_index = data.getULEB128(cursor);
+      const std::optional<std::string> function_name =
+          GetWasmString(data, cursor);
+      if (function_name && function_index < function_ranges.size()) {
+        symtab.AddSymbol(Symbol(
+            next_symbol_id, Mangled(*function_name), lldb::eSymbolTypeCode,
+            /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false,
+            /*is_artificial=*/false, function_ranges[function_index],
+            /*size_is_valid=*/true, /*contains_linker_annotations=*/false,
+            /*flags=*/0));
+        ++next_symbol_id;
+      }
+    }
+  }
+
+  if (!cursor) {
+    LLDB_LOG_ERROR(log, cursor.takeError(),
+                   "Failed to parse the Wasm symbol table: {0}");
+    return;
+  }
+
+  symtab.Finalize();
+}
static SectionType GetSectionTypeFromName(llvm::StringRef Name) {
- if (Name.consume_front(".debug_") || Name.consume_front(".zdebug_")) {
+ if (Name == "name")
+ return lldb::eSectionTypeWasmName;
+ if (Name.consume_front(".debug_") || Name.consume_front(".zdebug_"))
return ObjectFile::GetDWARFSectionTypeFromName(Name);
- }
return eSectionTypeOther;
}
@@ -263,13 +344,12 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) {
m_sections_up = std::make_unique<SectionList>();
- if (m_sect_infos.empty()) {
+ if (m_sect_infos.empty())
DecodeSections();
- }
for (const section_info &sect_info : m_sect_infos) {
- SectionType section_type = eSectionTypeOther;
- ConstString section_name;
+ SectionType section_type = GetSectionTypeFromName(sect_info.name);
+ ConstString section_name = sect_info.name;
offset_t file_offset = sect_info.offset & 0xffffffff;
addr_t vm_addr = file_offset;
size_t vm_size = sect_info.size;
@@ -283,15 +363,8 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) {
// For this reason Section::GetFileAddress() must return zero for the
// Code section.
vm_addr = 0;
- } else {
- section_type = GetSectionTypeFromName(sect_info.name.GetStringRef());
- if (section_type == eSectionTypeOther)
- continue;
- section_name = sect_info.name;
- if (!IsInMemory()) {
- vm_size = 0;
- vm_addr = 0;
- }
+ } else if (section_type == eSectionTypeOther) {
+ continue;
}
SectionSP section_sp(
@@ -397,9 +470,9 @@ std::optional<FileSpec> ObjectFileWasm::GetExternalDebugInfoFileSpec() {
ReadImageData(sect_info.offset, kBufferSize);
llvm::DataExtractor data = section_header_data.GetAsLLVM();
llvm::DataExtractor::Cursor c(0);
- std::optional<ConstString> symbols_url = GetWasmString(data, c);
+ std::optional<std::string> symbols_url = GetWasmString(data, c);
if (symbols_url)
- return FileSpec(symbols_url->GetStringRef());
+ return FileSpec(*symbols_url);
}
}
return std::nullopt;
diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp
index 21daf7476b522..7efce2a035505 100644
--- a/lldb/source/Symbol/ObjectFile.cpp
+++ b/lldb/source/Symbol/ObjectFile.cpp
@@ -379,6 +379,7 @@ AddressClass ObjectFile::GetAddressClass(addr_t file_addr) {
case eSectionTypeELFDynamicSymbols:
case eSectionTypeELFRelocationEntries:
case eSectionTypeELFDynamicLinkInfo:
+ case eSectionTypeWasmName:
case eSectionTypeOther:
return AddressClass::eUnknown;
case eSectionTypeAbsoluteAddress:
diff --git a/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
new file mode 100644
index 0000000000000..165bb53662f40
--- /dev/null
+++ b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
@@ -0,0 +1,210 @@
+--- !WASM
+FileHeader:
+ Version: 0x1
+Sections:
+ - Type: TYPE
+ Signatures:
+ - Index: 0
+ ParamTypes: []
+ ReturnTypes: []
+ - Index: 1
+ ParamTypes:
+ - I32
+ - I32
+ ReturnTypes:
+ - I32
+ - Index: 2
+ ParamTypes: []
+ ReturnTypes:
+ - I32
+ - Type: FUNCTION
+ FunctionTypes: [ 0, 1, 2, 1 ]
+ - Type: TABLE
+ Tables:
+ - Index: 0
+ ElemType: FUNCREF
+ Limits:
+ Flags: [ HAS_MAX ]
+ Minimum: 0x1
+ Maximum: 0x1
+ - Type: MEMORY
+ Memories:
+ - Minimum: 0x2
+ - Type: GLOBAL
+ Globals:
+ - Index: 0
+ Type: I32
+ Mutable: true
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 66560
+ - Index: 1
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 1024
+ - Index: 2
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 1024
+ - Index: 3
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 1024
+ - Index: 4
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 66560
+ - Index: 5
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 1024
+ - Index: 6
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 66560
+ - Index: 7
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 131072
+ - Index: 8
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 0
+ - Index: 9
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 1
+ - Index: 10
+ Type: I32
+ Mutable: false
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 65536
+ - Type: EXPORT
+ Exports:
+ - Name: memory
+ Kind: MEMORY
+ Index: 0
+ - Name: __wasm_call_ctors
+ Kind: FUNCTION
+ Index: 0
+ - Name: add
+ Kind: FUNCTION
+ Index: 1
+ - Name: __original_main
+ Kind: FUNCTION
+ Index: 2
+ - Name: main
+ Kind: FUNCTION
+ Index: 3
+ - Name: __main_void
+ Kind: FUNCTION
+ Index: 2
+ - Name: __indirect_function_table
+ Kind: TABLE
+ Index: 0
+ - Name: __dso_handle
+ Kind: GLOBAL
+ Index: 1
+ - Name: __data_end
+ Kind: GLOBAL
+ Index: 2
+ - Name: __stack_low
+ Kind: GLOBAL
+ Index: 3
+ - Name: __stack_high
+ Kind: GLOBAL
+ Index: 4
+ - Name: __global_base
+ Kind: GLOBAL
+ Index: 5
+ - Name: __heap_base
+ Kind: GLOBAL
+ Index: 6
+ - Name: __heap_end
+ Kind: GLOBAL
+ Index: 7
+ - Name: __memory_base
+ Kind: GLOBAL
+ Index: 8
+ - Name: __table_base
+ Kind: GLOBAL
+ Index: 9
+ - Name: __wasm_first_page_end
+ Kind: GLOBAL
+ Index: 10
+ - Type: CODE
+ Functions:
+ - Index: 0
+ Locals: []
+ Body: 0B
+ - Index: 1
+ Locals:
+ - Type: I32
+ Count: 1
+ Body: 23808080800041106B21022002200036020C20022001360208200228020C20022802086A0F0B
+ - Index: 2
+ Locals:
+ - Type: I32
+ Count: 2
+ Body: 23808080800041106B210020002480808080002000410036020C2000410136020820004102360204200028020820002802041081808080002101200041106A24808080800020010F0B
+ - Index: 3
+ Locals: []
+ Body: 1082808080000F0B
+ - Type: CUSTOM
+ Name: name
+ FunctionNames:
+ - Index: 0
+ Name: __wasm_call_ctors
+ - Index: 1
+ Name: add
+ - Index: 2
+ Name: __original_main
+ - Index: 3
+ Name: main
+ GlobalNames:
+ - Index: 0
+ Name: __stack_pointer
+ - Type: CUSTOM
+ Name: producers
+ Tools:
+ - Name: clang
+ Version: '22.0.0git'
+ - Type: CUSTOM
+ Name: target_features
+ Features:
+ - Prefix: USED
+ Name: bulk-memory
+ - Prefix: USED
+ Name: bulk-memory-opt
+ - Prefix: USED
+ Name: call-indirect-overlong
+ - Prefix: USED
+ Name: multivalue
+ - Prefix: USED
+ Name: mutable-globals
+ - Prefix: USED
+ Name: nontrapping-fptoint
+ - Prefix: USED
+ Name: reference-types
+ - Prefix: USED
+ Name: sign-ext
+...
diff --git a/lldb/test/Shell/Symtab/symtab-wasm.test b/lldb/test/Shell/Symtab/symtab-wasm.test
new file mode 100644
index 0000000000000..fc185cd81a0ec
--- /dev/null
+++ b/lldb/test/Shell/Symtab/symtab-wasm.test
@@ -0,0 +1,7 @@
+# Verify that function names from the Wasm "name" section are added to the
+# symbol table, each at the start of its body in the code section.
+# RUN: yaml2obj %S/Inputs/simple.wasm.yaml -o %t.wasm
+# RUN: %lldb %t.wasm -o 'image dump symtab' | FileCheck %s
+
+# CHECK: Code 0x0000000000000002 {{.*}} __wasm_call_ctors
+# CHECK: Code 0x0000000000000005 {{.*}} add
+# CHECK: Code 0x000000000000002f {{.*}} __original_main
+# CHECK: Code 0x000000000000007c {{.*}} main
functions.reserve(function_count);

for (uint32_t i = 0; i < function_count; ++i) {
const uint32_t function_size = code_section_data.GetULEB128(&offset);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does this API behave if the ULEB cannot be decoded?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at the implementation in LLVM, it returns 0.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good, I left a few comments with potential extra opportunities for error handling.
This PR adds support for parsing the WebAssembly symbol table. The symbol table is encoded in the "names" section and contains names and indexes into other sections. For now we only support parsing function (code) symbols. The result is that you can set breakpoints by symbol name, while previously breakpoints by name required debug info (DWARF). This is also necessary for Swift, which checks for the presence of swift_release as a heuristic to determine if there's a static Swift stdlib.
8290aa6
to
dde1ce4
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The result is that you can set breakpoints by symbol name, while previously breakpoints by name required debug info (DWARF).
Assuming that this does not break things if both are present, this LGTM.
This PR adds support for parsing the WebAssembly symbol table. The symbol table is encoded in the "names" section and contains names and indexes into other sections. For now we only support parsing function (code) symbols. The result is that you can set breakpoints by symbol name, while previously breakpoints by name required debug info (DWARF). This is also necessary for Swift, which checks for the presence of `swift_release` as a heuristic to determine if there's a static Swift stdlib. (cherry picked from commit 5be2063)
This PR adds support for parsing the WebAssembly symbol table. The symbol table is encoded in the "names" section and contains names and indexes into other sections. For now we only support parsing function (code) symbols. The result is that you can set breakpoints by symbol name, while previously breakpoints by name required debug info (DWARF). This is also necessary for Swift, which checks for the presence of `swift_release` as a heuristic to determine if there's a static Swift stdlib. (cherry picked from commit 5be2063)
This PR adds support for parsing the WebAssembly symbol table. The symbol table is encoded in the "names" section and contains names and indexes into other sections. For now we only support parsing function (code) symbols. The result is that you can set breakpoints by symbol name, while previously breakpoints by name required debug info (DWARF).
This is also necessary for Swift, which checks for the presence of `swift_release` as a heuristic to determine if there's a static Swift stdlib.