class CompilationUnit:
    """
    CompilationUnit for DWARF
    See http://dwarfstd.org/doc/DWARF5.pdf page 60

    A compilation unit may cover one contiguous address range or several disjoint ones. All ranges are stored as
    given by the caller; ``low_pc`` and ``high_pc`` describe the overall span (lowest start, highest end).

    :ivar name:             Name of the compilation unit as passed by the caller
    :ivar comp_dir:         Compilation directory as passed by the caller
    :ivar file_path:        ``comp_dir`` joined with ``name``
    :ivar language:         Source language as passed by the caller
    :ivar functions:        Subprograms of this unit, keyed by an integer (filled in by the loader)
    :ivar global_variables: Global variables of this unit (filled in by the loader)
    :ivar low_pc:           Lowest start address over all ranges
    :ivar high_pc:          Highest end address over all ranges
    """

    def __init__(self, name, comp_dir, language, ranges: List[Tuple[int, int]], elf_object):
        """
        :param name:        Name of the compilation unit.
        :param comp_dir:    Compilation directory; joined with ``name`` to build ``file_path``.
        :param language:    Source language of the unit.
        :param ranges:      Non-empty list of (low, high) address pairs covered by the unit.
        :param elf_object:  The object the addresses belong to; used for address translation.
        :raises ValueError: If ``ranges`` is empty.
        """
        if not ranges:
            raise ValueError("CompilationUnit requires at least one (low_pc, high_pc) range")

        self.name = name
        self.comp_dir = comp_dir
        self.file_path = os.path.join(self.comp_dir, self.name)
        self.language = language
        self.functions: Dict[int, Subprogram] = {}
        self.global_variables: List[Variable] = []
        self._elf_object = elf_object

        self._ranges = ranges
        self.low_pc = min(lo for lo, _ in ranges)
        # The upper bound is the largest *end* address, not the start of the highest range.
        self.high_pc = max(hi for _, hi in ranges)

    def __repr__(self):
        return f"<CompilationUnit {self.file_path} [{self.low_pc:#x}-{self.high_pc:#x}]>"

    @property
    def min_addr(self):
        """
        ``low_pc`` translated with ``AT.from_rva(...).to_mva()``.
        """
        return AT.from_rva(self.low_pc, self._elf_object).to_mva()

    @property
    def max_addr(self):
        """
        ``high_pc`` translated with ``AT.from_rva(...).to_mva()``.
        """
        return AT.from_rva(self.high_pc, self._elf_object).to_mva()

    @property
    def ranges(self) -> Generator[Tuple[int, int], None, None]:
        """
        Yield every (low, high) range of this unit, both ends translated with ``AT.from_rva(...).to_mva()``.
        """
        for lo, hi in self._ranges:
            yield AT.from_rva(lo, self._elf_object).to_mva(), AT.from_rva(hi, self._elf_object).to_mva()

    @property
    def multiple_ranges(self) -> bool:
        """
        True if the unit covers more than one address range.
        """
        return len(self._ranges) > 1
elftools.dwarf.dwarf_expr import DWARFExprParser from elftools.dwarf.dwarfinfo import DWARFInfo +from elftools.dwarf.ranges import RangeEntry from elftools.elf import dynamic, elffile, enums, sections from sortedcontainers import SortedDict @@ -40,6 +41,9 @@ except ImportError: pypcode = None +if TYPE_CHECKING: + from elftools.dwarf.callframe import DecodedCallFrameTable + log = logging.getLogger(name=__name__) @@ -161,6 +165,9 @@ def __init__( self._dynamic = {} self.deps = [] + # Decoded CFT (call frame table) + self.decoded_cft: List["DecodedCallFrameTable"] = [] + # The linked image base should be evaluated before registering any segment or section due to # the fact that elffile, used by those methods, is working only with un-based virtual addresses, but Clemories # themselves are organized as a tree where each node backer internally uses relative addressing @@ -204,8 +211,12 @@ def __init__( # Load function hints and exception handling artifacts if dwarf.has_EH_CFI(): self._load_function_hints_from_fde(dwarf, FunctionHintSource.EH_FRAME) + self._load_decoded_cft_from_eh_cfi_entries(dwarf) self._load_exception_handling(dwarf) self._load_line_info(dwarf) + # Load CFI entries + if dwarf.has_CFI(): + self._load_decoded_cft_from_cfi_entries(dwarf) if debug_symbols: self.__process_debug_file(debug_symbols) @@ -437,6 +448,11 @@ def rebase(self, new_base): self.addr_to_line = SortedDict((addr + delta, value) for addr, value in self.addr_to_line.items()) + for cft in self.decoded_cft: + for entry in cft.table: + if entry and "pc" in entry: + entry["pc"] += self.image_base_delta + # # Private Methods # @@ -588,6 +604,37 @@ def _load_function_hints_from_fde(self, dwarf, source): except (DWARFError, ValueError): log.warning("An exception occurred in pyelftools when loading FDE information.", exc_info=True) + def _load_decoded_cft_from_eh_cfi_entries(self, dwarf): + """ + Load decoded Call Frame Tables from EH-CFI entries in the binary. 
@staticmethod
def _load_ranges_from_die(die: DIE, range_lists) -> List[Tuple[int, int]]:
    """
    Load the address ranges referenced by the DW_AT_ranges attribute of a DIE. This is needed when a compilation
    unit spans across multiple ranges.

    :param die:         The DIE object from pyelftools.
    :param range_lists: The range lists object the caller obtained from ``dwarf.range_lists()``. May be None
                        (presumably when the binary has no range list section - verify against pyelftools).
    :return:            A list of (begin_offset, end_offset) tuples, one per RangeEntry in the referenced list.
                        Empty if the DIE has no DW_AT_ranges attribute or no range lists are available.
    """
    ranges_attr = die.attributes.get("DW_AT_ranges")
    if ranges_attr is None or range_lists is None:
        # nothing to resolve, or nowhere to resolve it from
        return []

    entries = range_lists.get_range_list_at_offset(ranges_attr.value)
    # only plain range entries carry begin/end offsets; other entry kinds are ignored
    return [(r.begin_offset, r.end_offset) for r in entries if isinstance(r, RangeEntry)]
@@ -708,6 +770,7 @@ def _load_dies(self, dwarf: DWARFInfo): """ compilation_units: List[CompilationUnit] = [] type_list: Dict[int, VariableType] = {} + range_lists = dwarf.range_lists() for cu in dwarf.iter_CUs(): expr_parser = DWARFExprParser(cu.structs) @@ -731,41 +794,49 @@ def _load_dies(self, dwarf: DWARFInfo): die_name = top_die.attributes.get("DW_AT_name", None) die_comp_dir = top_die.attributes.get("DW_AT_comp_dir", None) - die_low_pc, die_high_pc = self._load_low_high_pc_form_die(top_die) die_lang = top_die.attributes.get("DW_AT_language", None) - if ( - die_name is None - or die_comp_dir is None - or die_low_pc is None - or die_high_pc is None - or die_lang is None - ): + die_low_pc, die_high_pc = self._load_low_high_pc_form_die(top_die) + ranges = None + if die_high_pc is None: + # load ranges instead + ranges = self._load_ranges_from_die(top_die, range_lists) + if not ranges: + continue + + if die_name is None or die_comp_dir is None or die_low_pc is None or die_lang is None: continue die_name = die_name.value.decode("utf-8") die_comp_dir = die_comp_dir.value.decode("utf-8") die_lang = describe_attr_value(die_lang, top_die, top_die.offset) - cu_ = CompilationUnit(die_name, die_comp_dir, die_low_pc, die_high_pc, die_lang, self) + if ranges: + cu_ = CompilationUnit(die_name, die_comp_dir, die_lang, ranges, self) + else: + cu_ = CompilationUnit(die_name, die_comp_dir, die_lang, [(die_low_pc, die_high_pc)], self) compilation_units.append(cu_) for die_child in cu.iter_DIE_children(top_die): if die_child.tag == "DW_TAG_variable": # load global variable - var = Variable.from_die(die_child, expr_parser, self) + var = Variable.from_die(die_child, expr_parser, self, dwarf, die_low_pc) var.decl_file = cu_.file_path cu_.global_variables.append(var) elif die_child.tag == "DW_TAG_subprogram": # load subprogram - sub_prog = self._load_die_lex_block(die_child, expr_parser, type_list, cu, cu_.file_path, None) + sub_prog = self._load_die_lex_block( + dwarf, die_child, 
expr_parser, type_list, cu, cu_.file_path, die_low_pc, None + ) if sub_prog is not None: cu_.functions[sub_prog.low_pc] = sub_prog self.type_list = type_list self.compilation_units = compilation_units - def _load_die_lex_block(self, die: DIE, expr_parser, type_list, cu, file_path, subprogram) -> LexicalBlock: + def _load_die_lex_block( + self, dwarf, die: DIE, expr_parser, type_list, cu, file_path, cu_low_pc: int, subprogram + ) -> LexicalBlock: if "DW_AT_name" in die.attributes: name = die.attributes["DW_AT_name"].value.decode("utf-8") else: @@ -781,13 +852,18 @@ def _load_die_lex_block(self, die: DIE, expr_parser, type_list, cu, file_path, s block = LexicalBlock(low_pc, high_pc) for sub_die in cu.iter_DIE_children(die): - if sub_die.tag in ["DW_TAG_variable", "DW_TAG_formal_parameter"]: + if sub_die.tag in {"DW_TAG_variable", "DW_TAG_formal_parameter"}: # load local variable - var = Variable.from_die(sub_die, expr_parser, self, block) + var = Variable.from_die(sub_die, expr_parser, self, dwarf, cu_low_pc, lexical_block=block) var.decl_file = file_path - subprogram.local_variables.append(var) + if var.parameter: + subprogram.parameters.append(var) + else: + subprogram.local_variables.append(var) elif sub_die.tag == "DW_TAG_lexical_block": - sub_block = self._load_die_lex_block(sub_die, expr_parser, type_list, cu, file_path, subprogram) + sub_block = self._load_die_lex_block( + dwarf, sub_die, expr_parser, type_list, cu, file_path, cu_low_pc, subprogram + ) if sub_block is not None: block.child_blocks.append(sub_block) diff --git a/cle/backends/elf/subprogram.py b/cle/backends/elf/subprogram.py index c784ca10..b997f3c4 100644 --- a/cle/backends/elf/subprogram.py +++ b/cle/backends/elf/subprogram.py @@ -47,3 +47,4 @@ def __init__(self, name, low_pc, high_pc) -> None: super().__init__(low_pc, high_pc) self.name = name self.local_variables: List[Variable] = [] + self.parameters: List[Variable] = [] diff --git a/cle/backends/elf/variable.py 
DW_OP_reg0 = 0x50


class VariableLocationType(Enum):
    """
    The kinds of places a variable can live in.
    """

    Register = 0
    Stack = 1
    Global = 2


class VariableLocation:
    """
    A single variable location: a location type (register, stack, or global memory) plus the associated value.
    """

    __slots__ = (
        "loc_type",
        "relative_addr",
    )

    def __init__(self, loc_type: VariableLocationType, relative_addr):
        self.relative_addr = relative_addr
        self.loc_type = loc_type

    def __repr__(self):
        kind = self.loc_type
        # register locations are printed in decimal, everything else in hex
        spec = "" if kind == VariableLocationType.Register else "#x"
        return f"{kind.name}:{self.relative_addr:{spec}}"
Optional["LexicalBlock"] = None + ): # first the address - if "DW_AT_location" in die.attributes and die.attributes["DW_AT_location"].form == "DW_FORM_exprloc": - parsed_exprs = expr_parser.parse_expr(die.attributes["DW_AT_location"].value) - if len(parsed_exprs) == 1 and parsed_exprs[0].op_name == "DW_OP_addr": - addr = parsed_exprs[0].args[0] - var = MemoryVariable(elf_object, addr) - elif len(parsed_exprs) == 1 and parsed_exprs[0].op_name == "DW_OP_fbreg": - addr = parsed_exprs[0].args[0] - var = StackVariable(elf_object, addr) - elif len(parsed_exprs) == 1 and parsed_exprs[0].op_name.startswith("DW_OP_reg"): - addr = parsed_exprs[0].op - 0x50 # 0x50 == DW_OP_reg0 - var = RegisterVariable(elf_object, addr) - else: + var = None + if "DW_AT_location" in die.attributes: + loc_attr = die.attributes["DW_AT_location"] + if loc_attr.form == "DW_FORM_exprloc": + parsed_exprs = expr_parser.parse_expr(loc_attr.value) + if len(parsed_exprs) == 1 and parsed_exprs[0].op_name == "DW_OP_addr": + addr = parsed_exprs[0].args[0] + var = MemoryVariable(elf_object, addr) + elif len(parsed_exprs) == 1 and parsed_exprs[0].op_name == "DW_OP_fbreg": + addr = parsed_exprs[0].args[0] + var = StackVariable(elf_object, addr) + elif len(parsed_exprs) == 1 and parsed_exprs[0].op_name.startswith("DW_OP_reg"): + addr = parsed_exprs[0].op - DW_OP_reg0 + var = RegisterVariable(elf_object, addr) + else: + # we do not support the location form (yet) + var = Variable(elf_object) + elif loc_attr.form == "DW_FORM_sec_offset": var = Variable(elf_object) - else: + loc_lists = dwarf.location_lists() + var._location = Variable.load_variable_location(loc_lists, loc_attr.value, expr_parser, cu_low_pc) + + if var is None: var = Variable(elf_object) if "DW_AT_name" in die.attributes: @@ -62,11 +122,32 @@ def from_die(die: DIE, expr_parser, elf_object: "ELF", lexical_block: Optional[" var.external = True if "DW_AT_declaration" in die.attributes: var.declaration_only = True + var.parameter = die.tag == 
@staticmethod
def load_variable_location(
    location_lists, offset: int, expr_parser, cu_low_pc: int
) -> List[Tuple[int, int, Optional[VariableLocation]]]:
    """
    Load the location list found at the given offset and turn it into a list of address ranges, each with the
    location of the variable inside that range.

    :param location_lists: The location lists object, as returned by ``dwarf.location_lists()``.
    :param offset:         Offset of the location list to load.
    :param expr_parser:    The DWARF expression parser used to parse each location expression.
    :param cu_low_pc:      Low pc of the compilation unit; the initial base that entry offsets are relative to.
    :return:               A list of (begin, end, location) tuples. ``location`` is None when the expression is
                           not a single DW_OP_addr, DW_OP_fbreg, DW_OP_reg<n> or DW_OP_regx operation.
    """
    # imported here because only this method needs the entry types
    from elftools.dwarf.locationlists import BaseAddressEntry, LocationEntry

    base_addr = cu_low_pc
    locs = []
    for entry in location_lists.get_location_list_at_offset(offset):
        if isinstance(entry, BaseAddressEntry):
            # a base address selection entry changes the base for all entries that follow
            base_addr = entry.base_address
            continue
        if not isinstance(entry, LocationEntry):
            # entries without a location expression describe no range
            continue

        parsed_exprs = expr_parser.parse_expr(entry.loc_expr)
        loc = None
        if len(parsed_exprs) == 1:
            expr = parsed_exprs[0]
            if expr.op_name == "DW_OP_addr":
                loc = VariableLocation(VariableLocationType.Global, expr.args[0])
            elif expr.op_name == "DW_OP_fbreg":
                loc = VariableLocation(VariableLocationType.Stack, expr.args[0])
            elif expr.op_name == "DW_OP_regx":
                # the register number is an operand here, it is not encoded in the opcode
                loc = VariableLocation(VariableLocationType.Register, expr.args[0])
            elif DW_OP_reg0 <= expr.op <= DW_OP_reg0 + 31:
                # DW_OP_reg0 .. DW_OP_reg31 encode the register number in the opcode
                loc = VariableLocation(VariableLocationType.Register, expr.op - DW_OP_reg0)

        # entries flagged as absolute already hold full addresses (the flag does not exist in older pyelftools)
        delta = 0 if getattr(entry, "is_absolute", False) else base_addr
        locs.append((delta + entry.begin_offset, delta + entry.end_offset, loc))
    return locs
""" + __slots__ = () + def __init__(self, elf_object: "ELF", relative_addr): super().__init__(elf_object) self.relative_addr = relative_addr @@ -124,6 +218,8 @@ class StackVariable(Variable): Stack Variable from DWARF. """ + __slots__ = () + def __init__(self, elf_object: "ELF", relative_addr): super().__init__(elf_object) self.relative_addr = relative_addr @@ -141,6 +237,8 @@ class RegisterVariable(Variable): Register Variable from DWARF. """ + __slots__ = () + def __init__(self, elf_object: "ELF", register_addr): super().__init__(elf_object) # FIXME should this really go into relative addr?