android13/art/tools/check_cfi.py

#!/usr/bin/env python3
#
# Copyright (C) 2022 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Checks dwarf CFI (unwinding) information by comparing it to disassembly.
It is only a simple heuristic check of stack pointer adjustments.
Fully inferring CFI from disassembly is not possible in general.
"""

import os, re, subprocess, collections, pathlib, bisect, collections
from typing import List, Optional, Set, Tuple

Source = collections.namedtuple("Source", ["addr", "file", "line", "flag"])

def get_source(lib: pathlib.Path) -> List[Source]:
  """ Get source-file and line-number for all hand-written assembly code. """

  proc = subprocess.run(["llvm-dwarfdump", "--debug-line", lib],
                        encoding='utf-8',
                        capture_output=True,
                        check=True)

  section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE)
  filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE)
  line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)'  # addr, line, column, file
                       ' +\d+ +\d +(.*)')                     # isa, discriminator, flag

  results = []
  for section in section_re.split(proc.stdout):
    files = {m[1]: m[2] for m in filename_re.finditer(section)}
    if not any(f.endswith(".S") for f in files.values()):
      continue
    lines = line_re.findall(section)
    results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines])
  return sorted(filter(lambda line: "end_sequence" not in line.flag, results))

Fde = collections.namedtuple("Fde", ["addr", "end", "data"])

def get_fde(lib: pathlib.Path) -> List[Fde]:
  """ Get all unwinding FDE blocks (in dumped text-based format) """

  proc = subprocess.run(["llvm-dwarfdump", "--debug-frame", lib],
                        encoding='utf-8',
                        capture_output=True,
                        check=True)

  section_re = re.compile("\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
  fda_re = re.compile(".* FDE .* pc=([0-9a-f]+)...([0-9a-f]+)")

  results = []
  for section in section_re.split(proc.stdout):
    m = fda_re.match(section)
    if m:
      fde = Fde(int(m[1], 16), int(m[2], 16), section)
      if fde.addr != 0:
        results.append(fde)
  return sorted(results)

Asm = collections.namedtuple("Asm", ["addr", "name", "data"])

def get_asm(lib: pathlib.Path) -> List[Asm]:
  """ Get disassembly for all methods (in dumped text-based format) """

  proc = subprocess.run(["llvm-objdump", "--disassemble", lib],
                        encoding='utf-8',
                        capture_output=True,
                        check=True)

  section_re = re.compile("\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
  sym_re = re.compile("([0-9a-f]+) <(.+)>:")

  results = []
  for section in section_re.split(proc.stdout):
    sym = sym_re.match(section)
    if sym:
      results.append(Asm(int(sym[1], 16), sym[2], section))
  return sorted(results)

Cfa = collections.namedtuple("Cfa", ["addr", "cfa"])

def get_cfa(fde: Fde) -> List[Cfa]:
  """ Extract individual CFA (SP+offset) entries from the FDE block """

  cfa_re = re.compile("0x([0-9a-f]+): CFA=([^\s:]+)")
  return [Cfa(int(addr, 16), cfa) for addr, cfa in cfa_re.findall(fde.data)]

Inst = collections.namedtuple("Inst", ["addr", "inst", "symbol"])

def get_instructions(asm: Asm) -> List[Inst]:
  """ Extract individual instructions from disassembled code block """

  data = re.sub(r"[ \t]+", " ", asm.data)
  inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)")
  return [Inst(int(addr, 16), inst, asm.name) for addr, inst in inst_re.findall(data)]

CfaOffset = collections.namedtuple("CfaOffset", ["addr", "offset"])

def get_dwarf_cfa_offsets(cfas: List[Cfa]) -> List[CfaOffset]:
  """ Parse textual CFA entries into integer stack offsets """

  result = []
  for addr, cfa in cfas:
    if cfa == "WSP" or cfa == "SP":
      result.append(CfaOffset(addr, 0))
    elif cfa.startswith("WSP+") or cfa.startswith("SP+"):
      result.append(CfaOffset(addr, int(cfa.split("+")[1])))
    else:
      result.append(CfaOffset(addr, None))
  return result

def get_infered_cfa_offsets(insts: List[Inst]) -> List[CfaOffset]:
  """ Heuristic to convert disassembly into stack offsets """

  # Regular expressions which find instructions that adjust stack pointer.
  rexprs = []
  def add(rexpr, adjust_offset):
    rexprs.append((re.compile(rexpr), adjust_offset))
  add(r"sub sp,(?: sp,)? #(\d+)", lambda m: int(m[1]))
  add(r"add sp,(?: sp,)? #(\d+)", lambda m: -int(m[1]))
  add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
  add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
  add(r"stp \w+, \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
  add(r"ldp \w+, \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
  add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(",")))
  add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(",")))
  add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(",")))
  add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(",")))

  # Regular expression which identifies branches.
  jmp_re = re.compile(r"cb\w* \w+, 0x(\w+)|(?:b|bl|b\w\w) 0x(\w+)")

  offset, future_offset = 0, {}
  result = [CfaOffset(insts[0].addr, offset)]
  for addr, inst, symbol in insts:
    # Previous code branched here, so us that offset instead.
    # This likely identifies slow-path which is after return.
    if addr in future_offset:
      offset = future_offset[addr]

    # Add entry to output (only if the offset changed).
    if result[-1].offset != offset:
      result.append(CfaOffset(addr, offset))

    # Adjust offset if the instruction modifies stack pointer.
    for rexpr, adjust_offset in rexprs:
      m = rexpr.match(inst)
      if m:
        offset += adjust_offset(m)
        break  # First matched pattern wins.

    # Record branches.  We only support forward edges for now.
    m = jmp_re.match(inst)
    if m:
      future_offset[int(m[m.lastindex], 16)] = offset
  return result

def check_fde(fde: Fde, insts: List[Inst], srcs, verbose: bool = False) -> Tuple[str, Set[int]]:
  """ Compare DWARF offsets to assembly-inferred offsets. Report differences. """

  error, seen_addrs = None, set()
  cfas = get_cfa(fde)
  i, dwarf_cfa = 0, get_dwarf_cfa_offsets(cfas)
  j, infered_cfa = 0, get_infered_cfa_offsets(insts)
  for inst in insts:
    seen_addrs.add(inst.addr)
    while i+1 < len(dwarf_cfa) and dwarf_cfa[i+1].addr <= inst.addr:
      i += 1
    while j+1 < len(infered_cfa) and infered_cfa[j+1].addr <= inst.addr:
      j += 1
    if verbose:
      print("{:08x}: dwarf={:4} infered={:4} {:40} // {}".format(
                inst.addr, str(dwarf_cfa[i].offset), str(infered_cfa[j].offset),
                inst.inst.strip(), srcs.get(inst.addr, "")))
    if dwarf_cfa[i].offset is not None and dwarf_cfa[i].offset != infered_cfa[j].offset:
      if inst.addr in srcs:  # Only report if it maps to source code (not padding or literals).
        error = error or "{:08x} {}".format(inst.addr, srcs.get(inst.addr, ""))
  return error, seen_addrs

def check_lib(lib: pathlib.Path):
  assert lib.exists()
  IGNORE = [
      "art_quick_throw_null_pointer_exception_from_signal",  # Starts with non-zero offset.
      "art_quick_generic_jni_trampoline",  # Saves/restores SP in other register.
      "nterp_op_",  # Uses calculated CFA due to dynamic stack size.
      "$d.",  # Data (literals) interleaved within code.
  ]
  fdes = get_fde(lib)
  asms = collections.deque(get_asm(lib))
  srcs = {src.addr: src.file + ":" + src.line for src in get_source(lib)}
  seen = set()  # Used to verify the we have covered all assembly source lines.

  for fde in fdes:
    if fde.addr not in srcs:
      continue  # Ignore if it is not hand-written assembly.

    # Assembly instructions (one FDE can cover several assembly chunks).
    all_insts, name = [], None
    while asms and asms[0].addr < fde.end:
      asm = asms.popleft()
      if asm.addr < fde.addr:
        continue
      insts = get_instructions(asm)
      if any(asm.name.startswith(i) for i in IGNORE):
        seen.update([inst.addr for inst in insts])
        continue
      all_insts.extend(insts)
      name = name or asm.name
    if not all_insts:
      continue  # No assembly

    # Compare DWARF data to assembly instructions
    error, seen_addrs = check_fde(fde, all_insts, srcs)
    if error:
      print("ERROR at " + name + " " + error)
      check_fde(fde, all_insts, srcs, True)
      print("")
    seen.update(seen_addrs)
  for addr in sorted(set(srcs.keys()) - seen):
    print("Missing CFI for {:08x}: {}".format(addr, srcs[addr]))


def main(argv):
  """ Check libraries provided on the command line, or use the default build output """

  libs = argv[1:]
  if not libs:
    out = os.environ["OUT"]
    libs.append(out + "/symbols/apex/com.android.art/lib/libart.so")
    libs.append(out + "/symbols/apex/com.android.art/lib64/libart.so")
  for lib in libs:
    check_lib(pathlib.Path(lib))

if __name__ == "__main__":
  main(os.sys.argv)