248 lines
9.4 KiB
Python
Executable File
248 lines
9.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# Copyright (C) 2022 The Android Open Source Project
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Checks dwarf CFI (unwinding) information by comparing it to disassembly.
|
|
It is only a simple heuristic check of stack pointer adjustments.
|
|
Fully inferring CFI from disassembly is not possible in general.
|
|
"""
|
|
|
|
import os, re, subprocess, collections, pathlib, bisect, collections
|
|
from typing import List, Optional, Set, Tuple
|
|
|
|
Source = collections.namedtuple("Source", ["addr", "file", "line", "flag"])
|
|
|
|
def get_source(lib: pathlib.Path) -> List[Source]:
|
|
""" Get source-file and line-number for all hand-written assembly code. """
|
|
|
|
proc = subprocess.run(["llvm-dwarfdump", "--debug-line", lib],
|
|
encoding='utf-8',
|
|
capture_output=True,
|
|
check=True)
|
|
|
|
section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE)
|
|
filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE)
|
|
line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)' # addr, line, column, file
|
|
' +\d+ +\d +(.*)') # isa, discriminator, flag
|
|
|
|
results = []
|
|
for section in section_re.split(proc.stdout):
|
|
files = {m[1]: m[2] for m in filename_re.finditer(section)}
|
|
if not any(f.endswith(".S") for f in files.values()):
|
|
continue
|
|
lines = line_re.findall(section)
|
|
results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines])
|
|
return sorted(filter(lambda line: "end_sequence" not in line.flag, results))
|
|
|
|
Fde = collections.namedtuple("Fde", ["addr", "end", "data"])
|
|
|
|
def get_fde(lib: pathlib.Path) -> List[Fde]:
|
|
""" Get all unwinding FDE blocks (in dumped text-based format) """
|
|
|
|
proc = subprocess.run(["llvm-dwarfdump", "--debug-frame", lib],
|
|
encoding='utf-8',
|
|
capture_output=True,
|
|
check=True)
|
|
|
|
section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent.
|
|
fda_re = re.compile(".* FDE .* pc=([0-9a-f]+)...([0-9a-f]+)")
|
|
|
|
results = []
|
|
for section in section_re.split(proc.stdout):
|
|
m = fda_re.match(section)
|
|
if m:
|
|
fde = Fde(int(m[1], 16), int(m[2], 16), section)
|
|
if fde.addr != 0:
|
|
results.append(fde)
|
|
return sorted(results)
|
|
|
|
Asm = collections.namedtuple("Asm", ["addr", "name", "data"])
|
|
|
|
def get_asm(lib: pathlib.Path) -> List[Asm]:
|
|
""" Get disassembly for all methods (in dumped text-based format) """
|
|
|
|
proc = subprocess.run(["llvm-objdump", "--disassemble", lib],
|
|
encoding='utf-8',
|
|
capture_output=True,
|
|
check=True)
|
|
|
|
section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent.
|
|
sym_re = re.compile("([0-9a-f]+) <(.+)>:")
|
|
|
|
results = []
|
|
for section in section_re.split(proc.stdout):
|
|
sym = sym_re.match(section)
|
|
if sym:
|
|
results.append(Asm(int(sym[1], 16), sym[2], section))
|
|
return sorted(results)
|
|
|
|
Cfa = collections.namedtuple("Cfa", ["addr", "cfa"])
|
|
|
|
def get_cfa(fde: Fde) -> List[Cfa]:
|
|
""" Extract individual CFA (SP+offset) entries from the FDE block """
|
|
|
|
cfa_re = re.compile("0x([0-9a-f]+): CFA=([^\s:]+)")
|
|
return [Cfa(int(addr, 16), cfa) for addr, cfa in cfa_re.findall(fde.data)]
|
|
|
|
Inst = collections.namedtuple("Inst", ["addr", "inst", "symbol"])
|
|
|
|
def get_instructions(asm: Asm) -> List[Inst]:
|
|
""" Extract individual instructions from disassembled code block """
|
|
|
|
data = re.sub(r"[ \t]+", " ", asm.data)
|
|
inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)")
|
|
return [Inst(int(addr, 16), inst, asm.name) for addr, inst in inst_re.findall(data)]
|
|
|
|
CfaOffset = collections.namedtuple("CfaOffset", ["addr", "offset"])
|
|
|
|
def get_dwarf_cfa_offsets(cfas: List[Cfa]) -> List[CfaOffset]:
|
|
""" Parse textual CFA entries into integer stack offsets """
|
|
|
|
result = []
|
|
for addr, cfa in cfas:
|
|
if cfa == "WSP" or cfa == "SP":
|
|
result.append(CfaOffset(addr, 0))
|
|
elif cfa.startswith("WSP+") or cfa.startswith("SP+"):
|
|
result.append(CfaOffset(addr, int(cfa.split("+")[1])))
|
|
else:
|
|
result.append(CfaOffset(addr, None))
|
|
return result
|
|
|
|
def get_infered_cfa_offsets(insts: List[Inst]) -> List[CfaOffset]:
|
|
""" Heuristic to convert disassembly into stack offsets """
|
|
|
|
# Regular expressions which find instructions that adjust stack pointer.
|
|
rexprs = []
|
|
def add(rexpr, adjust_offset):
|
|
rexprs.append((re.compile(rexpr), adjust_offset))
|
|
add(r"sub sp,(?: sp,)? #(\d+)", lambda m: int(m[1]))
|
|
add(r"add sp,(?: sp,)? #(\d+)", lambda m: -int(m[1]))
|
|
add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
|
|
add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
|
|
add(r"stp \w+, \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
|
|
add(r"ldp \w+, \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
|
|
add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(",")))
|
|
add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(",")))
|
|
add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(",")))
|
|
add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(",")))
|
|
|
|
# Regular expression which identifies branches.
|
|
jmp_re = re.compile(r"cb\w* \w+, 0x(\w+)|(?:b|bl|b\w\w) 0x(\w+)")
|
|
|
|
offset, future_offset = 0, {}
|
|
result = [CfaOffset(insts[0].addr, offset)]
|
|
for addr, inst, symbol in insts:
|
|
# Previous code branched here, so us that offset instead.
|
|
# This likely identifies slow-path which is after return.
|
|
if addr in future_offset:
|
|
offset = future_offset[addr]
|
|
|
|
# Add entry to output (only if the offset changed).
|
|
if result[-1].offset != offset:
|
|
result.append(CfaOffset(addr, offset))
|
|
|
|
# Adjust offset if the instruction modifies stack pointer.
|
|
for rexpr, adjust_offset in rexprs:
|
|
m = rexpr.match(inst)
|
|
if m:
|
|
offset += adjust_offset(m)
|
|
break # First matched pattern wins.
|
|
|
|
# Record branches. We only support forward edges for now.
|
|
m = jmp_re.match(inst)
|
|
if m:
|
|
future_offset[int(m[m.lastindex], 16)] = offset
|
|
return result
|
|
|
|
def check_fde(fde: Fde, insts: List[Inst], srcs, verbose: bool = False) -> Tuple[str, Set[int]]:
|
|
""" Compare DWARF offsets to assembly-inferred offsets. Report differences. """
|
|
|
|
error, seen_addrs = None, set()
|
|
cfas = get_cfa(fde)
|
|
i, dwarf_cfa = 0, get_dwarf_cfa_offsets(cfas)
|
|
j, infered_cfa = 0, get_infered_cfa_offsets(insts)
|
|
for inst in insts:
|
|
seen_addrs.add(inst.addr)
|
|
while i+1 < len(dwarf_cfa) and dwarf_cfa[i+1].addr <= inst.addr:
|
|
i += 1
|
|
while j+1 < len(infered_cfa) and infered_cfa[j+1].addr <= inst.addr:
|
|
j += 1
|
|
if verbose:
|
|
print("{:08x}: dwarf={:4} infered={:4} {:40} // {}".format(
|
|
inst.addr, str(dwarf_cfa[i].offset), str(infered_cfa[j].offset),
|
|
inst.inst.strip(), srcs.get(inst.addr, "")))
|
|
if dwarf_cfa[i].offset is not None and dwarf_cfa[i].offset != infered_cfa[j].offset:
|
|
if inst.addr in srcs: # Only report if it maps to source code (not padding or literals).
|
|
error = error or "{:08x} {}".format(inst.addr, srcs.get(inst.addr, ""))
|
|
return error, seen_addrs
|
|
|
|
def check_lib(lib: pathlib.Path):
|
|
assert lib.exists()
|
|
IGNORE = [
|
|
"art_quick_throw_null_pointer_exception_from_signal", # Starts with non-zero offset.
|
|
"art_quick_generic_jni_trampoline", # Saves/restores SP in other register.
|
|
"nterp_op_", # Uses calculated CFA due to dynamic stack size.
|
|
"$d.", # Data (literals) interleaved within code.
|
|
]
|
|
fdes = get_fde(lib)
|
|
asms = collections.deque(get_asm(lib))
|
|
srcs = {src.addr: src.file + ":" + src.line for src in get_source(lib)}
|
|
seen = set() # Used to verify the we have covered all assembly source lines.
|
|
|
|
for fde in fdes:
|
|
if fde.addr not in srcs:
|
|
continue # Ignore if it is not hand-written assembly.
|
|
|
|
# Assembly instructions (one FDE can cover several assembly chunks).
|
|
all_insts, name = [], None
|
|
while asms and asms[0].addr < fde.end:
|
|
asm = asms.popleft()
|
|
if asm.addr < fde.addr:
|
|
continue
|
|
insts = get_instructions(asm)
|
|
if any(asm.name.startswith(i) for i in IGNORE):
|
|
seen.update([inst.addr for inst in insts])
|
|
continue
|
|
all_insts.extend(insts)
|
|
name = name or asm.name
|
|
if not all_insts:
|
|
continue # No assembly
|
|
|
|
# Compare DWARF data to assembly instructions
|
|
error, seen_addrs = check_fde(fde, all_insts, srcs)
|
|
if error:
|
|
print("ERROR at " + name + " " + error)
|
|
check_fde(fde, all_insts, srcs, True)
|
|
print("")
|
|
seen.update(seen_addrs)
|
|
for addr in sorted(set(srcs.keys()) - seen):
|
|
print("Missing CFI for {:08x}: {}".format(addr, srcs[addr]))
|
|
|
|
|
|
def main(argv):
|
|
""" Check libraries provided on the command line, or use the default build output """
|
|
|
|
libs = argv[1:]
|
|
if not libs:
|
|
out = os.environ["OUT"]
|
|
libs.append(out + "/symbols/apex/com.android.art/lib/libart.so")
|
|
libs.append(out + "/symbols/apex/com.android.art/lib64/libart.so")
|
|
for lib in libs:
|
|
check_lib(pathlib.Path(lib))
|
|
|
|
if __name__ == "__main__":
|
|
main(os.sys.argv)
|