#!/usr/bin/python
#
# trace         Trace a function and print a trace message based on its
#               parameters, with an optional filter.
#
# usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] [-c cgroup_path]
#              [-M MAX_EVENTS] [-s SYMBOLFILES] [-T] [-t] [-K] [-U] [-a] [-I header]
#              [-A]
#              probe [probe ...]
#
# Licensed under the Apache License, Version 2.0 (the "License")
# Copyright (C) 2016 Sasha Goldshtein.

from __future__ import print_function
|
|
from bcc import BPF, USDT, StrcmpRewrite
|
|
from functools import partial
|
|
from time import strftime
|
|
import time
|
|
import argparse
|
|
import re
|
|
import ctypes as ct
|
|
import os
|
|
import traceback
|
|
import sys
|
|
|
|
class Probe(object):
    """One probe given on the command line.

    A probe specifier looks like
    ``[type:][library:]function[+offset][(signature)] [(filter)] ["fmt", args]``.
    Each instance parses one specifier, generates the BPF C handler for it,
    attaches that handler, and formats the events the handler submits.

    Settings shared by all probes (pid/uid filters, timestamp style, event
    limit, ...) are class attributes filled in by ``configure``.
    """

    probe_count = 0             # number of Probe objects created so far
    streq_index = 0             # running id handed to StrcmpRewrite
    max_events = None           # stop after this many events (None: never)
    event_count = 0             # events handled so far, across all probes
    first_ts = 0                # BPF.monotonic_time() taken in configure()
    first_ts_real = None        # time.time() taken in configure()
    print_time = False
    print_unix_timestamp = False
    use_localtime = True
    time_field = False          # whether events carry a timestamp_ns field
    print_cpu = False
    print_address = False
    tgid = -1                   # -1 means "no filter" for tgid/pid/uid
    pid = -1
    uid = -1
    page_cnt = None
    bin_cmp = False
    build_id_enabled = False
    aggregate = False
    # Shared on purpose: event text -> count, across every probe.
    symcount = {}

    @classmethod
    def configure(cls, args):
        """Copy the parsed command-line options into class-wide settings.

        Raises ValueError if aggregation is requested without an event limit.
        """
        cls.max_events = args.max_events
        cls.print_time = args.timestamp or args.time
        cls.print_unix_timestamp = args.unix_timestamp
        cls.use_localtime = not args.timestamp
        cls.time_field = cls.print_time and (not cls.use_localtime)
        cls.print_cpu = args.print_cpu
        cls.print_address = args.address
        cls.first_ts = BPF.monotonic_time()
        cls.first_ts_real = time.time()
        cls.tgid = args.tgid or -1
        cls.pid = args.pid or -1
        # uid 0 (root) is a legitimate filter value, so only a missing
        # option maps to the "no filter" sentinel.
        cls.uid = args.uid if args.uid is not None else -1
        cls.page_cnt = args.buffer_pages
        cls.bin_cmp = args.bin_cmp
        cls.build_id_enabled = args.sym_file_list is not None
        cls.aggregate = args.aggregate
        if cls.aggregate and cls.max_events is None:
            raise ValueError("-M/--max-events should be specified"
                             " with -A/--aggregate")

    def __init__(self, probe, string_size, kernel_stack, user_stack,
                 cgroup_map_name, name, msg_filter):
        """Parse ``probe`` and derive the name of its BPF handler function.

        probe           -- raw probe specifier text
        string_size     -- size in bytes of each %s buffer
        kernel_stack    -- record a kernel stack id with each event
        user_stack      -- record a user stack id with each event
        cgroup_map_name -- name of the BPF cgroup array used for filtering,
                           or None for no cgroup filter
        name            -- only print events whose comm contains this string
                           (None disables the check)
        msg_filter      -- only print events whose message contains this
                           string (None or empty disables the check)

        Raises ValueError when the specifier is malformed.
        """
        self.usdt = None
        self.streq_functions = ""
        self.raw_probe = probe
        self.string_size = string_size
        self.kernel_stack = kernel_stack
        self.user_stack = user_stack
        self.probe_user_list = set()
        Probe.probe_count += 1
        self._parse_probe()
        self.probe_num = Probe.probe_count
        self.probe_name = "probe_%s_%d" % \
            (self._display_function(), self.probe_num)
        # The handler name must be a valid C identifier.
        self.probe_name = re.sub(r'[^A-Za-z0-9_]', '_', self.probe_name)
        self.cgroup_map_name = cgroup_map_name
        if name is None:
            # An empty bytestring is always contained in the command
            # name so this will always succeed.
            self.name = b''
        else:
            self.name = name.encode('ascii')
        self.msg_filter = msg_filter
        # compiler can generate proper codes for function
        # signatures with "syscall__" prefix
        if self.is_syscall_kprobe:
            # Swap the leading "probe_" (6 characters) for "syscall__".
            self.probe_name = "syscall__" + self.probe_name[6:]

    def __str__(self):
        return "%s:%s:%s FLT=%s ACT=%s/%s" % (
            self.probe_type, self.library, self._display_function(),
            self.filter, self.types, self.values)

    def is_default_action(self):
        """Return True when the probe has no format string of its own."""
        return self.python_format == ""

    def _bail(self, error):
        """Raise ValueError describing ``error`` for this probe."""
        raise ValueError("error in probe '%s': %s" %
                         (self.raw_probe, error))

    def _parse_probe(self):
        """Split the raw probe into spec, signature, filter and action.

        Sets ``signature`` and ``filter`` (``"1"`` when absent) and delegates
        to ``_parse_spec``, ``_parse_filter`` and ``_parse_action``.
        """
        text = self.raw_probe

        # There might be a function signature preceding the actual
        # filter/print part, or not. Find the probe specifier first --
        # it ends with either a space or an open paren ( for the
        # function signature part.
        #                                          opt. signature
        #                               probespec       |      rest
        #                               ---------  ----------   --
        match = re.match(r'([^ \t\(]+)(\([^\(]*\))?(.*)', text)
        if match is None:
            # Empty text, or text starting with whitespace or "(": report
            # it like any other probe error instead of crashing on None.
            self._bail("expected a probe specifier at the start of the probe")
        (spec, sig, rest) = match.groups()

        self._parse_spec(spec)
        # Remove the parens
        self.signature = sig[1:-1] if sig else None
        if self.signature and self.probe_type in ['u', 't']:
            self._bail("USDT and tracepoint probes can't have " +
                       "a function signature; use arg1, arg2, " +
                       "... instead")

        text = rest.lstrip()
        # If we now have a (, wait for the balanced closing ) and that
        # will be the predicate
        self.filter = None
        if len(text) > 0 and text[0] == "(":
            balance = 1
            for i in range(1, len(text)):
                if text[i] == "(":
                    balance += 1
                if text[i] == ")":
                    balance -= 1
                if balance == 0:
                    self._parse_filter(text[:i + 1])
                    text = text[i + 1:]
                    break
            if self.filter is None:
                self._bail("unmatched end of predicate")

        if self.filter is None:
            self.filter = "1"

        # The remainder of the text is the printf action
        self._parse_action(text.lstrip())

    def _parse_offset(self, func_and_offset):
        """Split ``"func+offset"`` into ``(func, int_offset)``.

        The offset is read as hexadecimal when it contains an ``x``/``X``,
        otherwise as decimal. Raises ValueError for anything else.
        """
        # Split on the first "+" only, so a malformed "f+1+2" reaches the
        # offset check below rather than failing on tuple unpacking.
        func, offset_str = func_and_offset.split("+", 1)
        try:
            if "x" in offset_str or "X" in offset_str:
                offset = int(offset_str, 16)
            else:
                offset = int(offset_str)
        except ValueError:
            self._bail("invalid offset format " +
                       " '%s', must be decimal or hexadecimal" % offset_str)

        return func, offset

    def _parse_spec(self, spec):
        """Parse the ``type:library:function`` part of the probe.

        Sets ``probe_type``, ``offset``, ``library``, ``function``,
        ``is_syscall_kprobe`` and, depending on the type, the tracepoint or
        USDT names.
        """
        parts = spec.split(":")
        # Two special cases: 'func' means 'p::func', 'lib:func' means
        # 'p:lib:func'. Other combinations need to provide an empty
        # value between delimiters, e.g. 'r::func' for a kretprobe on
        # the function func.
        if len(parts) == 1:
            parts = ["p", "", parts[0]]
        elif len(parts) == 2:
            parts = ["p", parts[0], parts[1]]
        if len(parts[0]) == 0:
            self.probe_type = "p"
        elif parts[0] in ["p", "r", "t", "u"]:
            self.probe_type = parts[0]
        else:
            self._bail("probe type must be '', 'p', 't', 'r', " +
                       "or 'u', but got '%s'" % parts[0])
        self.offset = 0
        if "+" in parts[-1]:
            parts[-1], self.offset = self._parse_offset(parts[-1])

        if self.probe_type == "t":
            self.tp_category = parts[1]
            self.tp_event = parts[2]
            self.library = ""       # kernel
            self.function = ""      # from TRACEPOINT_PROBE
        elif self.probe_type == "u":
            # u:<library>[:<provider>]:<probe> where :<provider> is optional
            self.library = parts[1]
            self.usdt_name = ":".join(parts[2:])
            self.function = ""      # no function, just address
            # We will discover the USDT provider by matching on
            # the USDT name in the specified library
            self._find_usdt_probe()
        else:
            self.library = ':'.join(parts[1:-1])
            self.function = parts[-1]

        # only x64 syscalls needs checking, no other syscall wrapper yet.
        self.is_syscall_kprobe = False
        if self.probe_type == "p" and len(self.library) == 0 and \
           self.function[:10] == "__x64_sys_":
            self.is_syscall_kprobe = True

    def _find_usdt_probe(self):
        """Create the USDT context and check that the named probe exists.

        Raises ValueError when no probe in the library matches.
        """
        target = Probe.pid if Probe.pid and Probe.pid != -1 \
            else Probe.tgid
        self.usdt = USDT(path=self.library, pid=target)

        parts = self.usdt_name.split(":")
        if len(parts) == 1:
            provider_name = None
            usdt_name = parts[0].encode("ascii")
        else:
            provider_name = parts[0].encode("ascii")
            usdt_name = parts[1].encode("ascii")
        for probe in self.usdt.enumerate_probes():
            if ((not provider_name or probe.provider == provider_name)
                    and probe.name == usdt_name):
                return  # Found it, will enable later
        self._bail("unrecognized USDT probe %s" % self.usdt_name)

    def _parse_filter(self, filt):
        """Store the rewritten predicate expression in ``self.filter``."""
        self.filter = self._rewrite_expr(filt)

    def _parse_types(self, fmt):
        """Collect the format specifiers of ``fmt`` and build the Python format.

        Appends one entry per specifier to ``self.types`` and sets
        ``self.python_format`` to ``fmt`` with integer length modifiers
        dropped, %K/%U turned into %s, and the surrounding quotes removed.
        """
        for match in re.finditer(
                r'[^%]%(s|u|d|lu|llu|ld|lld|hu|hd|x|lx|llx|c|K|U)', fmt):
            self.types.append(match.group(1))
        fmt = re.sub(r'([^%]%)(u|d|lu|llu|ld|lld|hu|hd)', r'\1d', fmt)
        fmt = re.sub(r'([^%]%)(x|lx|llx)', r'\1x', fmt)
        fmt = re.sub('%K|%U', '%s', fmt)
        self.python_format = fmt.strip('"')

    def _parse_action(self, action):
        """Parse the ``"format", expr, expr...`` part of the probe.

        Sets ``values``, ``types`` and ``python_format``; an empty action
        leaves them empty. Raises ValueError when no quoted format string
        is found.
        """
        self.values = []
        self.types = []
        self.python_format = ""
        if len(action) == 0:
            return

        action = action.strip()
        match = re.search(r'(\".*?\"),?(.*)', action)
        if match is None:
            self._bail("expected format string in \"s")

        self.raw_format = match.group(1)
        self._parse_types(self.raw_format)
        for part in re.split('(?<!"),', match.group(2)):
            part = self._rewrite_expr(part)
            if len(part) > 0:
                self.values.append(part)

    # argN -> register accessor for ordinary kprobes/uprobes.
    aliases_arg = {
        "arg%d" % i: "PT_REGS_PARM%d(ctx)" % i for i in range(1, 7)
    }

    # argN for "__x64_sys_" kprobes: the handler's first parameter is
    # treated as a pointer to another pt_regs, and argN is read from that.
    aliases_indarg = {
        "arg%d" % i:
            "({u64 _val; struct pt_regs *_ctx = "
            "(struct pt_regs *)PT_REGS_PARM1(ctx);"
            " bpf_probe_read_kernel(&_val, sizeof(_val), "
            "&(PT_REGS_PARM%d(_ctx))); _val;})" % i
        for i in range(1, 7)
    }

    aliases_common = {
        "retval": "PT_REGS_RC(ctx)",
        "$uid": "(unsigned)(bpf_get_current_uid_gid() & 0xffffffff)",
        "$gid": "(unsigned)(bpf_get_current_uid_gid() >> 32)",
        "$pid": "(unsigned)(bpf_get_current_pid_tgid() & 0xffffffff)",
        "$tgid": "(unsigned)(bpf_get_current_pid_tgid() >> 32)",
        "$cpu": "bpf_get_smp_processor_id()",
        "$task": "((struct task_struct *)bpf_get_current_task())"
    }

    def _rewrite_expr(self, expr):
        """Expand the aliases (argN, retval, $pid, ...) in ``expr`` to C.

        Also records every ``argN@user`` in ``probe_user_list`` and hands the
        expression to StrcmpRewrite, accumulating any helper functions it
        returns in ``streq_functions``.
        """
        # Find the occurances of any arg[1-6]@user. Use it later to
        # identify bpf_probe_read_user
        for matches in re.finditer(r'(arg[1-6])(@user)', expr):
            self.probe_user_list.add(matches.group(1).strip())
        # Remove @user occurrences from arg before resolving to its
        # corresponding aliases.
        expr = re.sub(r'(arg[1-6])@user', r'\1', expr)
        rdict = StrcmpRewrite.rewrite_expr(
            expr, self.bin_cmp, self.library,
            self.probe_user_list, self.streq_functions,
            Probe.streq_index)
        expr = rdict["expr"]
        self.streq_functions = rdict["streq_functions"]
        Probe.streq_index = rdict["probeid"]
        alias_to_check = Probe.aliases_indarg \
            if self.is_syscall_kprobe \
            else Probe.aliases_arg
        # For USDT probes, we replace argN values with the
        # actual arguments for that probe obtained using
        # bpf_readarg_N macros emitted at BPF construction.
        if not self.probe_type == "u":
            for alias, replacement in alias_to_check.items():
                expr = expr.replace(alias, replacement)
        for alias, replacement in Probe.aliases_common.items():
            expr = expr.replace(alias, replacement)
        return expr

    # Format specifier -> ctypes type of the matching event field.
    p_type = {"u": ct.c_uint, "d": ct.c_int, "lu": ct.c_ulong,
              "ld": ct.c_long,
              "llu": ct.c_ulonglong, "lld": ct.c_longlong,
              "hu": ct.c_ushort, "hd": ct.c_short,
              "x": ct.c_uint, "lx": ct.c_ulong, "llx": ct.c_ulonglong,
              "c": ct.c_ubyte,
              "K": ct.c_ulonglong, "U": ct.c_ulonglong}

    def _generate_python_field_decl(self, idx, fields):
        """Append the ctypes field ``("v<idx>", type)`` to ``fields``."""
        field_type = self.types[idx]
        if field_type == "s":
            ptype = ct.c_char * self.string_size
        else:
            ptype = Probe.p_type[field_type]
        fields.append(("v%d" % idx, ptype))

    def _generate_python_data_decl(self):
        """Build and return the ctypes Structure class for this probe's events."""
        self.python_struct_name = "%s_%d_Data" % \
            (self._display_function(), self.probe_num)
        fields = []
        if self.time_field:
            fields.append(("timestamp_ns", ct.c_ulonglong))
        if self.print_cpu:
            fields.append(("cpu", ct.c_int))
        fields.extend([
            ("tgid", ct.c_uint),
            ("pid", ct.c_uint),
            ("comm", ct.c_char * 16)       # TASK_COMM_LEN
        ])
        for i in range(0, len(self.types)):
            self._generate_python_field_decl(i, fields)
        if self.kernel_stack:
            fields.append(("kernel_stack_id", ct.c_int))
        if self.user_stack:
            fields.append(("user_stack_id", ct.c_int))
        return type(self.python_struct_name, (ct.Structure,),
                    dict(_fields_=fields))

    # Format specifier -> C type of the matching event field.
    c_type = {"u": "unsigned int", "d": "int",
              "lu": "unsigned long", "ld": "long",
              "llu": "unsigned long long", "lld": "long long",
              "hu": "unsigned short", "hd": "short",
              "x": "unsigned int", "lx": "unsigned long",
              "llx": "unsigned long long",
              "c": "char", "K": "unsigned long long",
              "U": "unsigned long long"}
    fmt_types = c_type.keys()

    def _generate_field_decl(self, idx):
        """Return the C struct member declaration for value ``idx``."""
        field_type = self.types[idx]
        if field_type == "s":
            return "char v%d[%d];\n" % (idx, self.string_size)
        if field_type in Probe.fmt_types:
            return "%s v%d;\n" % (Probe.c_type[field_type], idx)
        self._bail("unrecognized format specifier %s" % field_type)

    def _generate_data_decl(self):
        """Return the C event struct, perf output and stack table declarations.

        Also sets ``events_name``, ``struct_name`` and ``stacks_name``.
        """
        # The BPF program will populate values into the struct
        # according to the format string, and the Python program will
        # construct the final display string.
        self.events_name = "%s_events" % self.probe_name
        self.struct_name = "%s_data_t" % self.probe_name
        self.stacks_name = "%s_stacks" % self.probe_name
        stack_type = "BPF_STACK_TRACE" if self.build_id_enabled is False \
            else "BPF_STACK_TRACE_BUILDID"
        stack_table = "%s(%s, 1024);" % (stack_type, self.stacks_name) \
            if (self.kernel_stack or self.user_stack) else ""
        data_fields = ""
        for i in range(len(self.types)):
            data_fields += "        " + self._generate_field_decl(i)
        time_str = "u64 timestamp_ns;" if self.time_field else ""
        cpu_str = "int cpu;" if self.print_cpu else ""
        kernel_stack_str = "        int kernel_stack_id;" \
            if self.kernel_stack else ""
        user_stack_str = "        int user_stack_id;" \
            if self.user_stack else ""

        text = """
struct %s
{
%s
%s
        u32 tgid;
        u32 pid;
        char comm[TASK_COMM_LEN];
%s
%s
%s
        u32 uid;
};

BPF_PERF_OUTPUT(%s);
%s
"""
        return text % (self.struct_name, time_str, cpu_str, data_fields,
                       kernel_stack_str, user_stack_str,
                       self.events_name, stack_table)

    def _generate_field_assign(self, idx):
        """Return the C code that stores value ``idx`` into ``__data.v<idx>``."""
        field_type = self.types[idx]
        expr = self.values[idx].strip()
        text = ""
        if self.probe_type == "u" and expr[0:3] == "arg":
            # USDT arguments are read into a local named after the alias.
            arg_index = int(expr[3])
            arg_ctype = self.usdt.get_probe_arg_ctype(
                self.usdt_name, arg_index - 1)
            text = ("        %s %s = 0;\n" +
                    "        bpf_usdt_readarg(%s, ctx, &%s);\n") \
                % (arg_ctype, expr, expr[3], expr)
        probe_read_func = "bpf_probe_read_kernel"
        if field_type == "s":
            if self.library:
                probe_read_func = "bpf_probe_read_user"
            else:
                # Kernel probe: read from user memory only for the
                # arguments that were marked with @user.
                alias_to_check = Probe.aliases_indarg \
                    if self.is_syscall_kprobe \
                    else Probe.aliases_arg
                for arg, alias in alias_to_check.items():
                    if alias == expr and arg in self.probe_user_list:
                        probe_read_func = "bpf_probe_read_user"
                        break
            return text + """
        if (%s != 0) {
                void *__tmp = (void *)%s;
                %s(&__data.v%d, sizeof(__data.v%d), __tmp);
        }
                """ % (expr, expr, probe_read_func, idx, idx)
        if field_type in Probe.fmt_types:
            return text + "        __data.v%d = (%s)%s;\n" % \
                (idx, Probe.c_type[field_type], expr)
        self._bail("unrecognized field type %s" % field_type)

    def _generate_usdt_filter_read(self):
        """Return C code reading the USDT arguments used by the filter.

        For USDT probes, every ``argN`` mentioned in ``self.filter`` is read
        into a local ``argN_filter`` and the filter is rewritten to use it.
        Returns an empty string for other probe types.
        """
        text = ""
        if self.probe_type != "u":
            return text
        for arg in Probe.aliases_arg:
            if not (arg in self.filter):
                continue
            arg_index = int(arg.replace("arg", ""))
            arg_ctype = self.usdt.get_probe_arg_ctype(
                self.usdt_name, arg_index - 1)
            if not arg_ctype:
                self._bail("Unable to determine type of {} "
                           "in the filter".format(arg))
            text += """
        {} {}_filter;
        bpf_usdt_readarg({}, ctx, &{}_filter);
                    """.format(arg_ctype, arg, arg_index, arg)
            self.filter = self.filter.replace(
                arg, "{}_filter".format(arg))
        return text

    def generate_program(self, include_self):
        """Return the complete BPF C text for this probe.

        include_self -- when false, and no pid/tgid filter applies, events
                        from this tool's own process are dropped.
        """
        data_decl = self._generate_data_decl()
        if Probe.pid != -1:
            pid_filter = """
        if (__pid != %d) { return 0; }
            """ % Probe.pid
        # uprobes can have a built-in tgid filter passed to
        # attach_uprobe, hence the check here -- for kprobes, we
        # need to do the tgid test by hand:
        elif len(self.library) == 0 and Probe.tgid != -1:
            pid_filter = """
        if (__tgid != %d) { return 0; }
            """ % Probe.tgid
        elif not include_self:
            pid_filter = """
        if (__tgid == %d) { return 0; }
            """ % os.getpid()
        else:
            pid_filter = ""

        if Probe.uid != -1:
            uid_filter = """
        if (__uid != %d) { return 0; }
            """ % Probe.uid
        else:
            uid_filter = ""

        if self.cgroup_map_name is not None:
            cgroup_filter = """
        if (%s.check_current_task(0) <= 0) { return 0; }
            """ % self.cgroup_map_name
        else:
            cgroup_filter = ""

        prefix = ""
        signature = "struct pt_regs *ctx"
        if self.signature:
            signature += ", " + self.signature

        data_fields = ""
        for i in range(len(self.values)):
            data_fields += self._generate_field_assign(i)

        if self.probe_type == "t":
            heading = "TRACEPOINT_PROBE(%s, %s)" % \
                      (self.tp_category, self.tp_event)
            ctx_name = "args"
        else:
            heading = "int %s(%s)" % (self.probe_name, signature)
            ctx_name = "ctx"

        time_str = """
        __data.timestamp_ns = bpf_ktime_get_ns();""" if self.time_field else ""
        cpu_str = """
        __data.cpu = bpf_get_smp_processor_id();""" if self.print_cpu else ""
        stack_trace = ""
        if self.user_stack:
            stack_trace += """
        __data.user_stack_id = %s.get_stackid(
          %s, BPF_F_USER_STACK
        );""" % (self.stacks_name, ctx_name)
        if self.kernel_stack:
            stack_trace += """
        __data.kernel_stack_id = %s.get_stackid(
          %s, 0
        );""" % (self.stacks_name, ctx_name)

        text = heading + """
{
        u64 __pid_tgid = bpf_get_current_pid_tgid();
        u32 __tgid = __pid_tgid >> 32;
        u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half
        u32 __uid = bpf_get_current_uid_gid();
        %s
        %s
        %s
        %s
        %s
        if (!(%s)) return 0;

        struct %s __data = {0};
        %s
        %s
        __data.tgid = __tgid;
        __data.pid = __pid;
        __data.uid = __uid;
        bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
%s
%s
        %s.perf_submit(%s, &__data, sizeof(__data));
        return 0;
}
"""
        # Must run before self.filter is read below: for USDT probes it
        # rewrites the filter to use the locals it declares.
        usdt_filter_read = self._generate_usdt_filter_read()
        text = text % (pid_filter, uid_filter, cgroup_filter, prefix,
                       usdt_filter_read, self.filter,
                       self.struct_name, time_str, cpu_str, data_fields,
                       stack_trace, self.events_name, ctx_name)

        return self.streq_functions + data_decl + "\n" + text

    @classmethod
    def _time_off_str(cls, timestamp_ns):
        """Format a BPF timestamp as seconds with microsecond precision.

        The value is the offset from ``first_ts``, shifted by
        ``first_ts_real`` when UNIX timestamps were requested.
        """
        offset = 1e-9 * (timestamp_ns - cls.first_ts)
        if cls.print_unix_timestamp:
            return "%.6f" % (offset + cls.first_ts_real)
        else:
            return "%.6f" % offset

    def _display_function(self):
        """Return the name shown in the FUNC column for this probe."""
        if self.probe_type == 'p' or self.probe_type == 'r':
            return self.function
        elif self.probe_type == 'u':
            return self.usdt_name
        else:   # self.probe_type == 't'
            return self.tp_event

    def _stack_to_string(self, bpf, stack_id, tgid):
        """Return the symbolized stack ``stack_id``, one frame per line.

        A negative ``stack_id`` is returned as the number itself.
        """
        if stack_id < 0:
            return ("        %d" % stack_id)

        stackstr = ''
        stack = list(bpf.get_table(self.stacks_name).walk(stack_id))
        for addr in stack:
            stackstr += '        '
            if Probe.print_address:
                stackstr += ("%16x " % addr)
            symstr = bpf.sym(addr, tgid, show_module=True, show_offset=True)
            stackstr += ('%s\n' % (symstr.decode('utf-8')))

        return stackstr

    def _format_message(self, bpf, tgid, values):
        """Return the probe's message for one event.

        values -- the event's v0..vN field values, in order; any iterable
                  is accepted and it is not modified.
        """
        # Work on a private list: the symbol substitution below assigns by
        # index, which an iterator (e.g. a Python 3 map object) cannot do.
        values = list(values)
        # Replace each %K with kernel sym and %U with user sym in tgid
        for i, t in enumerate(self.types):
            if t == 'K':
                values[i] = bpf.ksym(values[i], show_offset=True)
            elif t == 'U':
                values[i] = bpf.sym(values[i], tgid,
                                    show_module=True, show_offset=True)
        # %s buffers and symbol names arrive as bytes; decode them so that
        # Python 3 does not print them as b'...'.
        values = [v.decode('utf-8', 'replace') if isinstance(v, bytes) else v
                  for v in values]
        return self.python_format % tuple(values)

    def print_aggregate_events(self):
        """Print every aggregated event text with its count, highest first."""
        for k, v in sorted(self.symcount.items(),
                           key=lambda item: item[1], reverse=True):
            print("%s-->COUNT %d\n\n" % (k, v), end="")

    def print_event(self, bpf, cpu, data, size):
        """Perf buffer callback: format one event and print or count it.

        Exits the process once ``max_events`` events have been handled.
        """
        # Cast as the generated structure type and display
        # according to the format string in the probe.
        event = ct.cast(data, ct.POINTER(self.python_struct)).contents
        if self.name not in event.comm:
            return
        values = [getattr(event, "v%d" % i)
                  for i in range(len(self.values))]
        msg = self._format_message(bpf, event.tgid, values)
        if self.msg_filter and self.msg_filter not in msg:
            return
        eventstr = ''
        if Probe.print_time:
            time_str = strftime("%H:%M:%S") if Probe.use_localtime else \
                Probe._time_off_str(event.timestamp_ns)
            if Probe.print_unix_timestamp:
                eventstr += ("%-17s " % time_str[:17])
            else:
                eventstr += ("%-8s " % time_str[:8])
        if Probe.print_cpu:
            eventstr += ("%-3s " % event.cpu)
        eventstr += ("%-7d %-7d %-15s %-16s %s\n" %
                     (event.tgid, event.pid,
                      event.comm.decode('utf-8', 'replace'),
                      self._display_function(), msg))

        if self.kernel_stack:
            eventstr += self._stack_to_string(bpf, event.kernel_stack_id, -1)
        if self.user_stack:
            eventstr += self._stack_to_string(bpf, event.user_stack_id,
                                              event.tgid)

        if self.aggregate is False:
            print(eventstr, end="")
            if self.kernel_stack or self.user_stack:
                print("")
        else:
            self.symcount[eventstr] = self.symcount.get(eventstr, 0) + 1

        Probe.event_count += 1
        if Probe.max_events is not None and \
           Probe.event_count >= Probe.max_events:
            if self.aggregate:
                self.print_aggregate_events()
            sys.stdout.flush()
            sys.exit()

    def attach(self, bpf, verbose):
        """Attach the probe and open its perf buffer.

        ``verbose`` is accepted but not used here.
        """
        if len(self.library) == 0:
            self._attach_k(bpf)
        else:
            self._attach_u(bpf)
        self.python_struct = self._generate_python_data_decl()
        callback = partial(self.print_event, bpf)
        bpf[self.events_name].open_perf_buffer(callback,
                                               page_cnt=self.page_cnt)

    def _attach_k(self, bpf):
        """Attach a kprobe or kretprobe for a kernel function probe."""
        if self.probe_type == "r":
            bpf.attach_kretprobe(event=self.function,
                                 fn_name=self.probe_name)
        elif self.probe_type == "p":
            bpf.attach_kprobe(event=self.function,
                              fn_name=self.probe_name,
                              event_off=self.offset)
        # Note that tracepoints don't need an explicit attach

    def _attach_u(self, bpf):
        """Resolve the library or executable and attach a uprobe/uretprobe.

        Raises ValueError when the library cannot be found.
        """
        libpath = BPF.find_library(self.library)
        if libpath is None:
            # This might be an executable (e.g. 'bash')
            libpath = BPF.find_exe(self.library)
        if libpath is None or len(libpath) == 0:
            self._bail("unable to find library %s" % self.library)

        if self.probe_type == "u":
            pass    # Was already enabled by the BPF constructor
        elif self.probe_type == "r":
            bpf.attach_uretprobe(name=libpath,
                                 sym=self.function,
                                 fn_name=self.probe_name,
                                 pid=Probe.tgid)
        else:
            bpf.attach_uprobe(name=libpath,
                              sym=self.function,
                              fn_name=self.probe_name,
                              pid=Probe.tgid,
                              sym_off=self.offset)
class Tool(object):
    """Command-line driver: parses options, builds the probes, generates the
    BPF program, attaches everything and polls for events.
    """

    DEFAULT_PERF_BUFFER_PAGES = 64
    examples = """
EXAMPLES:

trace do_sys_open
        Trace the open syscall and print a default trace message when entered
trace kfree_skb+0x12
        Trace the kfree_skb kernel function after the instruction on the 0x12 offset
trace 'do_sys_open "%s", arg2@user'
        Trace the open syscall and print the filename being opened @user is
        added to arg2 in kprobes to ensure that char * should be copied from
        the userspace stack to the bpf stack. If not specified, previous
        behaviour is expected.

trace 'do_sys_open "%s", arg2@user' -n main
        Trace the open syscall and only print event that process names containing "main"
trace 'do_sys_open "%s", arg2@user' --uid 1001
        Trace the open syscall and only print event that processes with user ID 1001
trace 'do_sys_open "%s", arg2@user' -f config
        Trace the open syscall and print the filename being opened filtered by "config"
trace 'sys_read (arg3 > 20000) "read %d bytes", arg3'
        Trace the read syscall and print a message for reads >20000 bytes
trace 'r::do_sys_open "%llx", retval'
        Trace the return from the open syscall and print the return value
trace 'c:open (arg2 == 42) "%s %d", arg1, arg2'
        Trace the open() call from libc only if the flags (arg2) argument is 42
trace 'c:malloc "size = %d", arg1'
        Trace malloc calls and print the size being allocated
trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
        Trace the write() call from libc to monitor writes to STDOUT
trace 'r::__kmalloc (retval == 0) "kmalloc failed!"'
        Trace returns from __kmalloc which returned a null pointer
trace 'r:c:malloc (retval) "allocated = %x", retval'
        Trace returns from malloc and print non-NULL allocated buffers
trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
        Trace the block_rq_complete kernel tracepoint and print # of tx sectors
trace 'u:pthread:pthread_create (arg4 != 0)'
        Trace the USDT probe pthread_create when its 4th argument is non-zero
trace 'u:pthread:libpthread:pthread_create (arg4 != 0)'
        Ditto, but the provider name "libpthread" is specified.
trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
        Trace the nanosleep syscall and print the sleep duration in ns
trace -c /sys/fs/cgroup/system.slice/workload.service '__x64_sys_nanosleep' '__x64_sys_clone'
        Trace nanosleep/clone syscall calls only under workload.service
        cgroup hierarchy.
trace -I 'linux/fs.h' \\
      'p::uprobe_register(struct inode *inode) "a_ops = %llx", inode->i_mapping->a_ops'
        Trace the uprobe_register inode mapping ops, and the symbol can be found
        in /proc/kallsyms
trace -I 'kernel/sched/sched.h' \\
      'p::__account_cfs_rq_runtime(struct cfs_rq *cfs_rq) "%d", cfs_rq->runtime_remaining'
        Trace the cfs scheduling runqueue remaining runtime. The struct cfs_rq is defined
        in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel
        package.  So this command needs to run at the kernel source tree root directory
        so that the added header file can be found by the compiler.
trace -I 'net/sock.h' \\
      'udpv6_sendmsg(struct sock *sk) (sk->sk_dport == 13568)'
        Trace udpv6 sendmsg calls only if socket's destination port is equal
        to 53 (DNS; 13568 in big endian order)
trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users'
        Trace the number of users accessing the file system of the current task
trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U
        Trace inet_pton system call and use the specified libraries/executables for
        symbol resolution.
"""

    def __init__(self):
        """Parse ``sys.argv`` into ``self.args`` and choose the cgroup map name.

        Exits through ``parser.error`` when both -p and -L are given.
        """
        parser = argparse.ArgumentParser(
            description="Attach to " +
            "functions and print trace messages.",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog=Tool.examples)
        parser.add_argument(
            "-b", "--buffer-pages", type=int,
            default=Tool.DEFAULT_PERF_BUFFER_PAGES,
            help="number of pages to use for perf_events ring buffer "
                 "(default: %(default)d)")
        # we'll refer to the userspace concepts of "pid" and "tid" by
        # their kernel names -- tgid and pid -- inside the script
        parser.add_argument(
            "-p", "--pid", type=int, metavar="PID",
            dest="tgid", help="id of the process to trace (optional)")
        parser.add_argument(
            "-L", "--tid", type=int, metavar="TID",
            dest="pid", help="id of the thread to trace (optional)")
        parser.add_argument(
            "--uid", type=int, metavar="UID",
            dest="uid", help="id of the user to trace (optional)")
        parser.add_argument(
            "-v", "--verbose", action="store_true",
            help="print resulting BPF program code before executing")
        parser.add_argument(
            "-Z", "--string-size", type=int,
            default=80, help="maximum size to read from strings")
        parser.add_argument(
            "-S", "--include-self",
            action="store_true",
            help="do not filter trace's own pid from the trace")
        parser.add_argument(
            "-M", "--max-events", type=int,
            help="number of events to print before quitting")
        parser.add_argument(
            "-t", "--timestamp", action="store_true",
            help="print timestamp column (offset from trace start)")
        parser.add_argument(
            "-u", "--unix-timestamp", action="store_true",
            help="print UNIX timestamp instead of offset from trace start, "
                 "requires -t")
        parser.add_argument(
            "-T", "--time", action="store_true",
            help="print time column")
        parser.add_argument(
            "-C", "--print_cpu", action="store_true",
            help="print CPU id")
        parser.add_argument(
            "-c", "--cgroup-path", type=str,
            metavar="CGROUP_PATH", dest="cgroup_path",
            help="cgroup path")
        parser.add_argument(
            "-n", "--name", type=str,
            help="only print process names containing this name")
        parser.add_argument(
            "-f", "--msg-filter", type=str, dest="msg_filter",
            help="only print the msg of event containing this string")
        parser.add_argument(
            "-B", "--bin_cmp", action="store_true",
            help="allow to use STRCMP with binary values")
        # Adjacent literals instead of a backslash continuation, which
        # would embed the source indentation in the help text.
        parser.add_argument(
            '-s', "--sym_file_list", type=str,
            metavar="SYM_FILE_LIST", dest="sym_file_list",
            help="comma separated list of symbol files to use "
                 "for symbol resolution")
        parser.add_argument(
            "-K", "--kernel-stack",
            action="store_true", help="output kernel stack trace")
        parser.add_argument(
            "-U", "--user-stack",
            action="store_true", help="output user stack trace")
        parser.add_argument(
            "-a", "--address", action="store_true",
            help="print virtual address in stacks")
        parser.add_argument(
            metavar="probe", dest="probes", nargs="+",
            help="probe specifier (see examples)")
        parser.add_argument(
            "-I", "--include", action="append",
            metavar="header",
            help="additional header files to include in the BPF program "
                 "as either full path, "
                 "or relative to current working directory, "
                 "or relative to default kernel header search path")
        parser.add_argument(
            "-A", "--aggregate", action="store_true",
            help="aggregate amount of each trace")
        parser.add_argument(
            "--ebpf", action="store_true",
            help=argparse.SUPPRESS)
        self.args = parser.parse_args()
        if self.args.tgid and self.args.pid:
            parser.error("only one of -p and -L may be specified")
        if self.args.cgroup_path is not None:
            self.cgroup_map_name = "__cgroup"
        else:
            self.cgroup_map_name = None

    def _create_probes(self):
        """Configure the Probe class and create one Probe per specifier."""
        Probe.configure(self.args)
        self.probes = [
            Probe(probe_spec, self.args.string_size,
                  self.args.kernel_stack, self.args.user_stack,
                  self.cgroup_map_name, self.args.name,
                  self.args.msg_filter)
            for probe_spec in self.args.probes
        ]

    def _generate_program(self):
        """Assemble the BPF C program in ``self.program``.

        Prints the program with -v or --ebpf, and exits after printing when
        --ebpf was given.
        """
        self.program = """
#include <linux/ptrace.h>
#include <linux/sched.h>        /* For TASK_COMM_LEN */

"""
        for include in (self.args.include or []):
            if include.startswith((".", "/")):
                include = os.path.abspath(include)
                self.program += "#include \"%s\"\n" % include
            else:
                self.program += "#include <%s>\n" % include
        # Pass a real list: a Python 3 map object can be consumed only
        # once. NOTE(review): assumes generate_auto_includes iterates its
        # argument repeatedly -- confirm against the bcc version in use.
        self.program += BPF.generate_auto_includes(
            [probe.raw_probe for probe in self.probes])
        if self.cgroup_map_name is not None:
            self.program += "BPF_CGROUP_ARRAY(%s, 1);\n" % \
                            self.cgroup_map_name
        for probe in self.probes:
            self.program += probe.generate_program(
                self.args.include_self)

        if self.args.verbose or self.args.ebpf:
            print(self.program)
            if self.args.ebpf:
                sys.exit()

    def _attach_probes(self):
        """Create the BPF object and attach every probe to it."""
        usdt_contexts = []
        for probe in self.probes:
            if probe.usdt:
                # USDT probes must be enabled before the BPF object
                # is initialized, because that's where the actual
                # uprobe is being attached.
                probe.usdt.enable_probe(
                    probe.usdt_name, probe.probe_name)
                if self.args.verbose:
                    print(probe.usdt.get_text())
                usdt_contexts.append(probe.usdt)
        self.bpf = BPF(text=self.program, usdt_contexts=usdt_contexts)
        if self.args.sym_file_list is not None:
            print("Note: Kernel bpf will report stack map with ip/build_id")
            # An explicit loop: map() is lazy on Python 3, so wrapping the
            # call in map() would never register any module.
            for sym_file in self.args.sym_file_list.split(','):
                self.bpf.add_module(sym_file)

        # if cgroup filter is requested, update the cgroup array map
        if self.cgroup_map_name is not None:
            cgroup_array = self.bpf.get_table(self.cgroup_map_name)
            cgroup_array[0] = self.args.cgroup_path

        for probe in self.probes:
            if self.args.verbose:
                print(probe)
            probe.attach(self.bpf, self.args.verbose)

    def _main_loop(self):
        """Print the column header, then poll the perf buffers forever."""
        all_probes_trivial = all(map(Probe.is_default_action,
                                     self.probes))

        # Print header
        if self.args.timestamp or self.args.time:
            col_fmt = "%-17s " if self.args.unix_timestamp else "%-8s "
            print(col_fmt % "TIME", end="")
        if self.args.print_cpu:
            print("%-3s " % "CPU", end="")
        print("%-7s %-7s %-15s %-16s %s" %
              ("PID", "TID", "COMM", "FUNC",
               "-" if not all_probes_trivial else ""))
        sys.stdout.flush()

        while True:
            self.bpf.perf_buffer_poll()

    def run(self):
        """Run the tool and exit the process.

        Exit status is 0 when the run ended through SystemExit (event limit
        reached, --ebpf) and 1 for anything else. The error is printed as a
        traceback with -v, otherwise as its message only.
        """
        try:
            self._create_probes()
            self._generate_program()
            self._attach_probes()
            self._main_loop()
        except BaseException as exc:
            # Deliberately broad: SystemExit and KeyboardInterrupt are how
            # the main loop normally ends.
            sys_exit = isinstance(exc, SystemExit)
            if self.args.verbose:
                traceback.print_exc()
            elif not sys_exit:
                print(exc)
            sys.exit(0 if sys_exit else 1)
# Script entry point: build the tool from the command line and run it.
if __name__ == "__main__":
    tool = Tool()
    tool.run()