diff --git a/ChangeLog.md b/ChangeLog.md index 2261f59300115..d056b7699f631 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -20,10 +20,6 @@ See docs/process.md for more on how version tagging works. 4.0.22 (in development) ----------------------- -- Source maps now support 'names' field with function name information. - emsymbolizer will show function names when used with a source map. The size - of source maps may increase 2-3x and the link time can increase slightly due - to more processing on source map creation. (#25870) - The minimum version of python required to run emscripten was updated from 3.8 to 3.10. (#25891) diff --git a/test/core/test_dwarf.cpp b/test/core/test_dwarf.cpp deleted file mode 100644 index ad91ccda9cd4a..0000000000000 --- a/test/core/test_dwarf.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include - -EM_JS(int, out_to_js, (int x), {}) - -class MyClass { -public: - void foo(); - void bar(); -}; - -void __attribute__((noinline)) MyClass::foo() { - out_to_js(0); // line 12 - out_to_js(1); - out_to_js(2); -} - -void __attribute__((always_inline)) MyClass::bar() { - out_to_js(3); - __builtin_trap(); // line 19 -} - -int main() { - MyClass mc; - mc.foo(); - mc.bar(); -} diff --git a/test/test_other.py b/test/test_other.py index 00ada9a6ad71c..76b051cc9206a 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9629,49 +9629,12 @@ def check_dwarf_loc_info(address, funcs, locs): for loc in locs: self.assertIn(loc, out) - def check_source_map_loc_info(address, func, loc): + def check_source_map_loc_info(address, loc): out = self.run_process( [emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address], stdout=PIPE).stdout - self.assertIn(func, out) self.assertIn(loc, out) - def do_tests(src): - # 1. Test DWARF + source map together - # For DWARF, we check for the full inlined info for both function names and - # source locations. Source maps does not provide inlined info. So we only - # check for the info of the outermost function. - self.run_process([EMCC, test_file(src), '-g', '-gsource-map', '-O1', '-o', - 'test_dwarf.js']) - check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], - out_to_js_call_loc[0]) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - # Source map shows the original (inlined) source location with the original - # function name - check_source_map_loc_info(unreachable_addr, unreachable_func[0], - unreachable_loc[0]) - - # 2. Test source map only - # The addresses, function names, and source locations are the same across - # the builds because they are relative offsets from the code section, so we - # don't need to recompute them - self.run_process([EMCC, test_file(src), '-gsource-map', '-O1', '-o', - 'test_dwarf.js']) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], - out_to_js_call_loc[0]) - check_source_map_loc_info(unreachable_addr, unreachable_func[0], - unreachable_loc[0]) - - # 3. Test DWARF only - self.run_process([EMCC, test_file(src), '-g', '-O1', '-o', - 'test_dwarf.js']) - check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - - # -- C program test -- # We test two locations within test_dwarf.c: # out_to_js(0); // line 6 # __builtin_trap(); // line 13 @@ -9694,32 +9657,31 @@ def do_tests(src): # The first one corresponds to the innermost inlined location. unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3'] - do_tests('core/test_dwarf.c') - - # -- C++ program test -- - # We test two locations within test_dwarf.cpp: - # out_to_js(0); // line 12 - # __builtin_trap(); // line 19 - self.run_process([EMCC, test_file('core/test_dwarf.cpp'), - '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js']) - # Address of out_to_js(0) within MyClass::foo(), uninlined - out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') - # Address of __builtin_trap() within MyClass::bar(), inlined into main() - unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm') - - # Function name of out_to_js(0) within MyClass::foo(), uninlined - out_to_js_call_func = ['MyClass::foo()'] - # Function names of __builtin_trap() within MyClass::bar(), inlined into - # main(). The first one corresponds to the innermost inlined function. - unreachable_func = ['MyClass::bar()', 'main'] - - # Source location of out_to_js(0) within MyClass::foo(), uninlined - out_to_js_call_loc = ['test_dwarf.cpp:12:3'] - # Source locations of __builtin_trap() within MyClass::bar(), inlined into - # main(). The first one corresponds to the innermost inlined location. - unreachable_loc = ['test_dwarf.cpp:19:3', 'test_dwarf.cpp:25:6'] + # 1. Test DWARF + source map together + # For DWARF, we check for the full inlined info for both function names and + # source locations. Source maps provide neither function names nor inlined + # info. So we only check for the source location of the outermost function. + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) + + # 2. Test source map only + # The addresses, function names, and source locations are the same across + # the builds because they are relative offsets from the code section, so we + # don't need to recompute them + self.run_process([EMCC, test_file('core/test_dwarf.c'), + '-gsource-map', '-O1', '-o', 'test_dwarf.js']) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) + check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) - do_tests('core/test_dwarf.cpp') + # 3. Test DWARF only + self.run_process([EMCC, test_file('core/test_dwarf.c'), + '-g', '-O1', '-o', 'test_dwarf.js']) + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) def test_emsymbolizer_functions(self): 'Test emsymbolizer use cases that only provide function-granularity info' diff --git a/tools/emsymbolizer.py b/tools/emsymbolizer.py index a4046ce7c5a81..37d50cb6c0a26 100755 --- a/tools/emsymbolizer.py +++ b/tools/emsymbolizer.py @@ -117,7 +117,6 @@ class Location: def __init__(self): self.version = None self.sources = [] - self.funcs = [] self.mappings = {} self.offsets = [] @@ -129,7 +128,6 @@ def parse(self, filename): self.version = source_map_json['version'] self.sources = source_map_json['sources'] - self.funcs = source_map_json['names'] chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' vlq_map = {c: i for i, c in enumerate(chars)} @@ -157,7 +155,6 @@ def decodeVLQ(string): src = 0 line = 1 col = 1 - func = 0 for segment in source_map_json['mappings'].split(','): data = decodeVLQ(segment) info = [] @@ -172,9 +169,7 @@ def decodeVLQ(string): if len(data) >= 4: col += data[3] info.append(col) - if len(data) == 5: - func += data[4] - info.append(func) + # TODO: see if we need the name, which is the next field (data[4]) self.mappings[offset] = WasmSourceMap.Location(*info) self.offsets.append(offset) @@ -212,7 +207,6 @@ def lookup(self, offset, lower_bound=None): self.sources[info.source] if info.source is not None else None, info.line, info.column, - self.funcs[info.func] if info.func is not None else None, ) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 63ae7ea1c8b49..31f112f844a19 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -25,8 +25,6 @@ from tools import shared, utils from tools.system_libs import DETERMINISTIC_PREFIX -LLVM_CXXFILT = shared.llvm_tool_path('llvm-cxxfilt') - EMSCRIPTEN_PREFIX = utils.normalize_path(utils.path_from_root()) logger = logging.getLogger('wasm-sourcemap') @@ -219,189 +217,32 @@ def decode_octal_encoded_utf8(str): def extract_comp_dir_map(text): - compile_unit_pattern = re.compile(r"0x[0-9a-f]*: DW_TAG_compile_unit") - stmt_list_pattern = re.compile(r"DW_AT_stmt_list\s+\((0x[0-9a-f]*)\)") - comp_dir_pattern = re.compile(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)") - map_stmt_list_to_comp_dir = {} - chunks = compile_unit_pattern.split(text) # DW_TAG_compile_unit + chunks = re.split(r"0x[0-9a-f]*: DW_TAG_compile_unit", text) for chunk in chunks[1:]: - stmt_list_match = stmt_list_pattern.search(chunk) # DW_AT_stmt_list + stmt_list_match = re.search(r"DW_AT_stmt_list\s+\((0x[0-9a-f]*)\)", chunk) if stmt_list_match is not None: stmt_list = stmt_list_match.group(1) - comp_dir_match = comp_dir_pattern.search(chunk) # DW_AT_comp_dir + comp_dir_match = re.search(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)", chunk) comp_dir = decode_octal_encoded_utf8(comp_dir_match.group(1)) if comp_dir_match is not None else '' map_stmt_list_to_comp_dir[stmt_list] = comp_dir return map_stmt_list_to_comp_dir -def demangle_names(names): - # Only demangle names that look mangled - mangled_names = sorted({n for n in names if n.startswith('_Z')}) - if not mangled_names: - return {} - if not os.path.exists(LLVM_CXXFILT): - logger.warning('llvm-cxxfilt does not exist') - return {} - - # Gather all mangled names and call llvm-cxxfilt only once for all of them - input_str = '\n'.join(mangled_names) - proc = shared.check_call([LLVM_CXXFILT], input=input_str, stdout=shared.PIPE, stderr=shared.PIPE, text=True) - if proc.returncode != 0: - logger.warning('llvm-cxxfilt failed: %s' % proc.stderr) - return {} - - demangled_list = proc.stdout.splitlines() - if len(demangled_list) != len(mangled_names): - logger.warning('llvm-cxxfilt output length mismatch') - return {} - - return dict(zip(mangled_names, demangled_list, strict=True)) - - -class FuncRange: - def __init__(self, name, low_pc, high_pc): - self.name = name - self.low_pc = low_pc - self.high_pc = high_pc - - -# This function parses DW_TAG_subprogram entries and gets low_pc and high_pc for -# each function in a list of FuncRanges. The result list will be sorted in the -# increasing order of low_pcs. -def extract_func_ranges(text): - # This function handles four cases: - # 1. DW_TAG_subprogram with DW_AT_name, DW_AT_low_pc, and DW_AT_high_pc. - # 0x000000ba: DW_TAG_subprogram - # DW_AT_low_pc (0x0000005f) - # DW_AT_high_pc (0x00000071) - # DW_AT_name ("foo") - # ... - # - # 2. DW_TAG_subprogram with DW_AT_linkage_name, DW_AT_low_pc, and - # DW_AT_high_pc. Applies to mangled C++ functions. - # (We parse DW_AT_linkage_name instead of DW_AT_name here.) - # 0x000000ba: DW_TAG_subprogram - # DW_AT_low_pc (0x0000005f) - # DW_AT_high_pc (0x00000071) - # DW_AT_linkage_name ("_ZN7MyClass3fooEv") - # DW_AT_name ("foo") - # ... - # - # 3. DW_TAG_subprogram with DW_AT_specification, DW_AT_low_pc, and - # DW_AT_high_pc. C++ function info can be split into two DIEs (one with - # DW_AT_linkage_name and DW_AT_declaration (true) and the other with - # DW_AT_specification). In this case we parse DW_AT_specification for the - # function name. - # 0x0000006d: DW_TAG_subprogram - # DW_AT_linkage_name ("_ZN7MyClass3fooEv") - # DW_AT_name ("foo") - # DW_AT_declaration (true) - # ... - # 0x00000097: DW_TAG_subprogram - # DW_AT_low_pc (0x00000007) - # DW_AT_high_pc (0x0000004c) - # DW_AT_specification (0x0000006d "_ZN7MyClass3fooEv") - # ... - # - # 4. DW_TAG_inlined_subroutine with DW_AT_abstract_origin, DW_AT_low_pc, and - # DW_AT_high_pc. This represents an inlined function. We parse - # DW_AT_abstract_origin for the original function name. - # 0x0000011a: DW_TAG_inlined_subroutine - # DW_AT_abstract_origin (0x000000da "_ZN7MyClass3barEv") - # DW_AT_low_pc (0x00000078) - # DW_AT_high_pc (0x00000083) - # ... - - tag_pattern = re.compile(r'\r?\n(?=0x[0-9a-f]+:)') - subprogram_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_subprogram") - inlined_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine") - low_pc_pattern = re.compile(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)') - high_pc_pattern = re.compile(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)') - abstract_origin_pattern = re.compile(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)') - linkage_name_pattern = re.compile(r'DW_AT_linkage_name\s+\("([^"]+)"\)') - name_pattern = re.compile(r'DW_AT_name\s+\("([^"]+)"\)') - specification_pattern = re.compile(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)') - - func_ranges = [] - dw_tags = tag_pattern.split(text) - - def get_name_from_tag(tag): - m = linkage_name_pattern.search(tag) # DW_AT_linkage_name - if m: - return m.group(1) - m = name_pattern.search(tag) # DW_AT_name - if m: - return m.group(1) - # If name is missing, check for DW_AT_specification annotation - m = specification_pattern.search(tag) - if m: - return m.group(1) - return None - - for tag in dw_tags: - is_subprogram = subprogram_pattern.search(tag) # DW_TAG_subprogram - is_inlined = inlined_pattern.search(tag) # DW_TAG_inlined_subroutine - - if is_subprogram or is_inlined: - name = None - low_pc = None - high_pc = None - m = low_pc_pattern.search(tag) # DW_AT_low_pc - if m: - low_pc = int(m.group(1), 16) - m = high_pc_pattern.search(tag) # DW_AT_high_pc - if m: - high_pc = int(m.group(1), 16) - if is_subprogram: - name = get_name_from_tag(tag) - else: # is_inlined - m = abstract_origin_pattern.search(tag) # DW_AT_abstract_origin - if m: - name = m.group(1) - if name and low_pc is not None and high_pc is not None: - func_ranges.append(FuncRange(name, low_pc, high_pc)) - - # Demangle names - all_names = [item.name for item in func_ranges] - demangled_map = demangle_names(all_names) - for func_range in func_ranges: - if func_range.name in demangled_map: - func_range.name = demangled_map[func_range.name] - - # To correctly identify the innermost function for a given address, - # func_ranges is sorted primarily by low_pc in ascending order and secondarily - # by high_pc in descending order. This ensures that for overlapping ranges, - # the more specific (inner) range appears later in the list. - func_ranges.sort(key=lambda item: (item.low_pc, -item.high_pc)) - return func_ranges - - -def read_dwarf_info(wasm, options): +def read_dwarf_entries(wasm, options): if options.dwarfdump_output: output = utils.read_file(options.dwarfdump_output) elif options.dwarfdump: logger.debug('Reading DWARF information from %s' % wasm) if not os.path.exists(options.dwarfdump): utils.exit_with_error('llvm-dwarfdump not found: ' + options.dwarfdump) - # We need only three tags in the debug info: DW_TAG_compile_unit for - # source location, and DW_TAG_subprogram and DW_TAG_inlined_subroutine - # for the function ranges. - dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm, - '-t', 'DW_TAG_compile_unit', '-t', 'DW_TAG_subprogram', - '-t', 'DW_TAG_inlined_subroutine'] - proc = shared.check_call(dwarfdump_cmd, stdout=shared.PIPE) + proc = shared.check_call([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=shared.PIPE) output = proc.stdout else: utils.exit_with_error('Please specify either --dwarfdump or --dwarfdump-output') - debug_line_pattern = re.compile(r"debug_line\[(0x[0-9a-f]*)\]") - include_dir_pattern = re.compile(r"include_directories\[\s*(\d+)\] = \"([^\"]*)") - file_pattern = re.compile(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)") - line_pattern = re.compile(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?") - entries = [] - debug_line_chunks = debug_line_pattern.split(output) + debug_line_chunks = re.split(r"debug_line\[(0x[0-9a-f]*)\]", output) map_stmt_list_to_comp_dir = extract_comp_dir_map(debug_line_chunks[0]) for stmt_list, line_chunk in zip(debug_line_chunks[1::2], debug_line_chunks[2::2], strict=True): comp_dir = map_stmt_list_to_comp_dir.get(stmt_list, '') @@ -422,16 +263,16 @@ def read_dwarf_info(wasm, options): # 0x0000000000000011 28 0 1 0 0 is_stmt include_directories = {'0': comp_dir} - for dir in include_dir_pattern.finditer(line_chunk): + for dir in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)", line_chunk): include_directories[dir.group(1)] = os.path.join(comp_dir, decode_octal_encoded_utf8(dir.group(2))) files = {} - for file in file_pattern.finditer(line_chunk): + for file in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)", line_chunk): dir = include_directories[file.group(3)] file_path = os.path.join(dir, decode_octal_encoded_utf8(file.group(2))) files[file.group(1)] = file_path - for line in line_pattern.finditer(line_chunk): + for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk): entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None} if not entry['eos']: entries.append(entry) @@ -447,61 +288,22 @@ def read_dwarf_info(wasm, options): remove_dead_entries(entries) # return entries sorted by the address field - entries = sorted(entries, key=lambda entry: entry['address']) + return sorted(entries, key=lambda entry: entry['address']) - func_ranges = extract_func_ranges(debug_line_chunks[0]) - return entries, func_ranges - -def build_sourcemap(entries, func_ranges, code_section_offset, options): +def build_sourcemap(entries, code_section_offset, options): base_path = options.basepath collect_sources = options.sources prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path) - # Add code section offset to the low/high pc in the function PC ranges - for func_range in func_ranges: - func_range.low_pc += code_section_offset - func_range.high_pc += code_section_offset - sources = [] sources_content = [] - # There can be duplicate names in case an original source function has - # multiple disjoint PC ranges or is inlined to multiple callsites. Make the - # 'names' list a unique list of names, and map the function ranges to the - # indices in that list. - names = sorted({item.name for item in func_ranges}) - name_to_id = {name: i for i, name in enumerate(names)} mappings = [] sources_map = {} last_address = 0 last_source_id = 0 last_line = 1 last_column = 1 - last_func_id = 0 - - active_funcs = [] - next_func_range_id = 0 - - # Get the function name ID that the given address falls into - def get_function_id(address): - nonlocal active_funcs - nonlocal next_func_range_id - - # Maintain a list of "active functions" whose ranges currently cover the - # address. As the address advances, it adds new functions that start and - # removes functions that end. The last function remaining in the active list - # at any point is the innermost function. - while next_func_range_id < len(func_ranges) and func_ranges[next_func_range_id].low_pc <= address: - # active_funcs contains (high_pc, id) pair - active_funcs.append((func_ranges[next_func_range_id].high_pc, next_func_range_id)) - next_func_range_id += 1 - active_funcs = [f for f in active_funcs if f[0] > address] - - if active_funcs: - func_range_id = active_funcs[-1][1] - name = func_ranges[func_range_id].name - return name_to_id[name] - return None for entry in entries: line = entry['line'] @@ -532,27 +334,21 @@ def get_function_id(address): sources_content.append(None) else: source_id = sources_map[source_name] - func_id = get_function_id(address) address_delta = address - last_address source_id_delta = source_id - last_source_id line_delta = line - last_line column_delta = column - last_column + mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta)) last_address = address last_source_id = source_id last_line = line last_column = column - mapping = encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta) - if func_id is not None: - func_id_delta = func_id - last_func_id - last_func_id = func_id - mapping += encode_vlq(func_id_delta) - mappings.append(mapping) return {'version': 3, 'sources': sources, 'sourcesContent': sources_content, - 'names': names, + 'names': [], 'mappings': ','.join(mappings)} @@ -563,12 +359,12 @@ def main(args): with open(wasm_input, 'rb') as infile: wasm = infile.read() - entries, func_ranges = read_dwarf_info(wasm_input, options) + entries = read_dwarf_entries(wasm_input, options) code_section_offset = get_code_section_offset(wasm) logger.debug('Saving to %s' % options.output) - map = build_sourcemap(entries, func_ranges, code_section_offset, options) + map = build_sourcemap(entries, code_section_offset, options) with open(options.output, 'w', encoding='utf-8') as outfile: json.dump(map, outfile, separators=(',', ':'), ensure_ascii=False)