/*
 * Copyright (c) Facebook, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License")
 */

#include <string>

namespace ebpf {
namespace pyperf {

extern const std::string PYPERF_BPF_PROGRAM = R"(
#include <linux/sched.h>
#include <uapi/linux/ptrace.h>

#define PYTHON_STACK_FRAMES_PER_PROG 25
#define PYTHON_STACK_PROG_CNT 3
#define STACK_MAX_LEN (PYTHON_STACK_FRAMES_PER_PROG * PYTHON_STACK_PROG_CNT)
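// At most 25 * 3 = 75 frames are captured per sample; deeper stacks are
// reported with STACK_STATUS_TRUNCATED.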
#define CLASS_NAME_LEN 32
#define FUNCTION_NAME_LEN 64
#define FILE_NAME_LEN 128
#define TASK_COMM_LEN 16

enum {
  STACK_STATUS_COMPLETE = 0,
  STACK_STATUS_ERROR = 1,
  STACK_STATUS_TRUNCATED = 2,
};

enum {
  GIL_STATE_NO_INFO = 0,
  GIL_STATE_ERROR = 1,
  GIL_STATE_UNINITIALIZED = 2,
  GIL_STATE_NOT_LOCKED = 3,
  GIL_STATE_THIS_THREAD = 4,
  GIL_STATE_GLOBAL_CURRENT_THREAD = 5,
  GIL_STATE_OTHER_THREAD = 6,
  GIL_STATE_NULL = 7,
};

enum {
  THREAD_STATE_UNKNOWN = 0,
  THREAD_STATE_MATCH = 1,
  THREAD_STATE_MISMATCH = 2,
  THREAD_STATE_THIS_THREAD_NULL = 3,
  THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL = 4,
  THREAD_STATE_BOTH_NULL = 5,
};

enum {
  PTHREAD_ID_UNKNOWN = 0,
  PTHREAD_ID_MATCH = 1,
  PTHREAD_ID_MISMATCH = 2,
  PTHREAD_ID_THREAD_STATE_NULL = 3,
  PTHREAD_ID_NULL = 4,
  PTHREAD_ID_ERROR = 5,
};
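
// The status codes above are reported back to user space in the Event fields
// below (stack_status, gil_state, thread_state_match, pthread_id_match), so
// samples with inconsistent interpreter state can be identified.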

typedef struct {
  int64_t PyObject_type;
  int64_t PyTypeObject_name;
  int64_t PyThreadState_frame;
  int64_t PyThreadState_thread;
  int64_t PyFrameObject_back;
  int64_t PyFrameObject_code;
  int64_t PyFrameObject_lineno;
  int64_t PyFrameObject_localsplus;
  int64_t PyCodeObject_filename;
  int64_t PyCodeObject_name;
  int64_t PyCodeObject_varnames;
  int64_t PyTupleObject_item;
  int64_t String_data;
  int64_t String_size;
} OffsetConfig;
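// The fields above are byte offsets into CPython's internal structs
// (PyThreadState, PyFrameObject, PyCodeObject, ...). They are not hard-coded
// because they vary across Python versions; the user-space side is expected to
// supply them per process through the pid_config map (see PidData below).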

typedef struct {
  uintptr_t current_state_addr; // virtual address of _PyThreadState_Current
  uintptr_t tls_key_addr; // virtual address of autoTLSkey for pthreads TLS
  uintptr_t gil_locked_addr; // virtual address of gil_locked
  uintptr_t gil_last_holder_addr; // virtual address of gil_last_holder
  OffsetConfig offsets;
} PidData;
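// One PidData entry per profiled process, looked up in on_event() via the
// pid_config map; entries are expected to be populated from user space before
// samples for that pid can be processed.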

typedef struct {
  char classname[CLASS_NAME_LEN];
  char name[FUNCTION_NAME_LEN];
  char file[FILE_NAME_LEN];
  // NOTE: PyFrameObject also has a line number, but it is typically just the
  // first line of the function; PyCode_Addr2Line needs to be called
  // to get the actual line
} Symbol;

typedef struct {
  uint32_t pid;
  uint32_t tid;
  char comm[TASK_COMM_LEN];
  uint8_t thread_state_match;
  uint8_t gil_state;
  uint8_t pthread_id_match;
  uint8_t stack_status;
  // Instead of storing symbol names here directly, we add them to the separate
  // symbols hashmap and only store the ids here.
  int64_t stack_len;
  int32_t stack[STACK_MAX_LEN];
} Event;
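// Event is the record submitted to user space through the events perf buffer,
// one per sample; stack[] holds symbol ids that user space resolves against
// the symbols map.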

#define _STR_CONCAT(str1, str2) str1##str2
#define STR_CONCAT(str1, str2) _STR_CONCAT(str1, str2)
#define FAIL_COMPILATION_IF(condition)            \
  typedef struct {                                \
    char _condition_check[1 - 2 * !!(condition)]; \
  } STR_CONCAT(compile_time_condition_check, __COUNTER__);
// See the comments in get_names for why this size check is needed
FAIL_COMPILATION_IF(sizeof(Symbol) == sizeof(struct bpf_perf_event_value))
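// FAIL_COMPILATION_IF works by declaring an array whose size becomes negative
// (1 - 2 * 1 = -1) when the condition is true, which makes the build fail.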

typedef struct {
  OffsetConfig offsets;
  uint64_t cur_cpu;
  int64_t symbol_counter;
  void* frame_ptr;
  int64_t python_stack_prog_call_cnt;
  Event event;
} sample_state_t;
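
// Maps:
//   state_heap - per-CPU scratch slot for sample_state_t, which is far too big
//                for the 512-byte BPF stack
//   symbols    - Symbol -> id deduplication table (__SYMBOLS_SIZE__ is expected
//                to be substituted by the user-space side before compilation)
//   pid_config - per-pid PidData describing the target Python process
//   progs      - tail-call program array; the slot referenced as
//                PYTHON_STACK_PROG_IDX is expected to hold read_python_stack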
BPF_PERCPU_ARRAY(state_heap, sample_state_t, 1);
BPF_HASH(symbols, Symbol, int32_t, __SYMBOLS_SIZE__);
BPF_HASH(pid_config, pid_t, PidData);
BPF_PROG_ARRAY(progs, 1);

BPF_PERF_OUTPUT(events);

static inline __attribute__((__always_inline__)) void* get_thread_state(
    void* tls_base,
    PidData* pid_data) {
  // Python sets the thread_state using pthread_setspecific with the key
  // stored in the global variable autoTLSkey.
  // We read the value of the key from the global variable and then read
  // the value in the thread-local storage. This relies on the internals of
  // the pthread implementation.
  // This is basically the same as running the following in GDB:
  //   p *(PyThreadState*)((struct pthread*)pthread_self())->
  //       specific_1stblock[autoTLSkey]->data
  int key;
  bpf_probe_read_user(&key, sizeof(key), (void*)pid_data->tls_key_addr);
  // This assumes autoTLSkey < 32, which means that the TLS is stored in
  //   pthread->specific_1stblock[autoTLSkey]
  // 0x310 is offsetof(struct pthread, specific_1stblock),
  // 0x10 is sizeof(pthread_key_data)
  // 0x8 is offsetof(struct pthread_key_data, data)
  // 'struct pthread' is not in the public API so we have to hardcode
  // the offsets here
  void* thread_state;
  bpf_probe_read_user(
      &thread_state,
      sizeof(thread_state),
      tls_base + 0x310 + key * 0x10 + 0x08);
  return thread_state;
}

static inline __attribute__((__always_inline__)) int submit_sample(
    struct pt_regs* ctx,
    sample_state_t* state) {
  events.perf_submit(ctx, &state->event, sizeof(Event));
  return 0;
}

// This function is trivial, but we need to do the map lookup in a separate
// function because BCC doesn't allow direct map calls (including lookups) from
// inside a macro (which is what we want to do in the GET_STATE() macro below).
static inline __attribute__((__always_inline__)) sample_state_t* get_state() {
  int zero = 0;
  return state_heap.lookup(&zero);
}

#define GET_STATE()                       \
  sample_state_t* state = get_state();    \
  if (!state) {                           \
    return 0; /* should never happen */   \
  }

static inline __attribute__((__always_inline__)) int get_thread_state_match(
    void* this_thread_state,
    void* global_thread_state) {
  if (this_thread_state == 0 && global_thread_state == 0) {
    return THREAD_STATE_BOTH_NULL;
  }
  if (this_thread_state == 0) {
    return THREAD_STATE_THIS_THREAD_NULL;
  }
  if (global_thread_state == 0) {
    return THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL;
  }
  if (this_thread_state == global_thread_state) {
    return THREAD_STATE_MATCH;
  } else {
    return THREAD_STATE_MISMATCH;
  }
}

static inline __attribute__((__always_inline__)) int get_gil_state(
    void* this_thread_state,
    void* global_thread_state,
    PidData* pid_data) {
  // Gather information about the GIL state
  if (pid_data->gil_locked_addr == 0 || pid_data->gil_last_holder_addr == 0) {
    return GIL_STATE_NO_INFO;
  }

  int gil_locked = 0;
  void* gil_thread_state = 0;
  if (bpf_probe_read_user(
          &gil_locked, sizeof(gil_locked), (void*)pid_data->gil_locked_addr)) {
    return GIL_STATE_ERROR;
  }
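
  // gil_locked mirrors CPython's internal flag: -1 means the GIL has not been
  // initialized yet, 0 means it is not held, 1 means some thread holds it.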
  switch (gil_locked) {
    case -1:
      return GIL_STATE_UNINITIALIZED;
    case 0:
      return GIL_STATE_NOT_LOCKED;
    case 1:
      // GIL is held by some thread
      bpf_probe_read_user(
          &gil_thread_state,
          sizeof(void*),
          (void*)pid_data->gil_last_holder_addr);
      if (gil_thread_state == this_thread_state) {
        return GIL_STATE_THIS_THREAD;
      } else if (gil_thread_state == global_thread_state) {
        return GIL_STATE_GLOBAL_CURRENT_THREAD;
      } else if (gil_thread_state == 0) {
        return GIL_STATE_NULL;
      } else {
        return GIL_STATE_OTHER_THREAD;
      }
    default:
      return GIL_STATE_ERROR;
  }
}

static inline __attribute__((__always_inline__)) int
get_pthread_id_match(void* thread_state, void* tls_base, PidData* pid_data) {
  if (thread_state == 0) {
    return PTHREAD_ID_THREAD_STATE_NULL;
  }

  uint64_t pthread_self, pthread_created;

  bpf_probe_read_user(
      &pthread_created,
      sizeof(pthread_created),
      thread_state + pid_data->offsets.PyThreadState_thread);
  if (pthread_created == 0) {
    return PTHREAD_ID_NULL;
  }

  // 0x10 = offsetof(struct pthread, header.self)
  bpf_probe_read_user(&pthread_self, sizeof(pthread_self), tls_base + 0x10);
  if (pthread_self == 0) {
    return PTHREAD_ID_ERROR;
  }

  if (pthread_self == pthread_created) {
    return PTHREAD_ID_MATCH;
  } else {
    return PTHREAD_ID_MISMATCH;
  }
}

int on_event(struct pt_regs* ctx) {
  uint64_t pid_tgid = bpf_get_current_pid_tgid();
  pid_t pid = (pid_t)(pid_tgid >> 32);
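  // pid_tgid packs the tgid (process id) in the upper 32 bits and the thread
  // id in the lower 32 bits; the latter is stored as event->tid below.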
  PidData* pid_data = pid_config.lookup(&pid);
  if (!pid_data) {
    return 0;
  }

  GET_STATE();

  state->offsets = pid_data->offsets;
  state->cur_cpu = bpf_get_smp_processor_id();
  state->python_stack_prog_call_cnt = 0;

  Event* event = &state->event;
  event->pid = pid;
  event->tid = (pid_t)pid_tgid;
  bpf_get_current_comm(&event->comm, sizeof(event->comm));

  // Get a pointer to the global PyThreadState, which should belong to the
  // thread that currently holds the GIL
  void* global_current_thread = (void*)0;
  bpf_probe_read_user(
      &global_current_thread,
      sizeof(global_current_thread),
      (void*)pid_data->current_state_addr);

  struct task_struct* task = (struct task_struct*)bpf_get_current_task();
#if __x86_64__
  // thread_struct->fs was renamed to fsbase in
  // https://github.com/torvalds/linux/commit/296f781a4b7801ad9c1c0219f9e87b6c25e196fe
  // so depending on kernel version, we need to account for that
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0)
  void* tls_base = (void*)task->thread.fs;
#else
  void* tls_base = (void*)task->thread.fsbase;
#endif
#elif __aarch64__
  void* tls_base = (void*)task->thread.tp_value;
#else
#error "Unsupported platform"
#endif
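
  // tls_base is the user-space thread pointer of the current task;
  // get_thread_state and get_pthread_id_match treat it as the base of the
  // pthread implementation's thread control block (hence the hard-coded
  // struct pthread offsets above).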

  // Read the PyThreadState of this thread from TLS
  void* thread_state = get_thread_state(tls_base, pid_data);

  // Check whether the PyThreadState from TLS matches
  // the global _PyThreadState_Current
  event->thread_state_match =
      get_thread_state_match(thread_state, global_current_thread);

  // Read GIL state
  event->gil_state =
      get_gil_state(thread_state, global_current_thread, pid_data);

  // Check whether the pthread that created the current PyThreadState matches
  // the pthread we are actually running on
  event->pthread_id_match =
      get_pthread_id_match(thread_state, tls_base, pid_data);

  // pre-initialize the event struct in case any subprogram below fails
  event->stack_status = STACK_STATUS_COMPLETE;
  event->stack_len = 0;

  if (thread_state != 0) {
    // Get pointer to the top frame from PyThreadState
    bpf_probe_read_user(
        &state->frame_ptr,
        sizeof(void*),
        thread_state + pid_data->offsets.PyThreadState_frame);
    // jump to reading the first batch of Python frames
    progs.call(ctx, PYTHON_STACK_PROG_IDX);
    // a successful tail call does not return; if it fails we fall through and
    // submit the sample with an empty stack
  }

  return submit_sample(ctx, state);
}

static inline __attribute__((__always_inline__)) void get_names(
    void* cur_frame,
    void* code_ptr,
    OffsetConfig* offsets,
    Symbol* symbol,
    void* ctx) {
  // Figure out whether we want to parse a class name by checking the name of
  // the first argument,
  //   ((PyTupleObject*)$frame->f_code->co_varnames)->ob_item[0]
  // If it's 'self', we get the type and its name; if it's 'cls', we just get
  // the name. This is not perfect, but there is no better way to figure this
  // out from the code object.
  void* args_ptr;
  bpf_probe_read_user(
      &args_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_varnames);
  bpf_probe_read_user(
      &args_ptr, sizeof(void*), args_ptr + offsets->PyTupleObject_item);
  bpf_probe_read_user_str(
      &symbol->name, sizeof(symbol->name), args_ptr + offsets->String_data);

  // compare strings as ints to save instructions
  char self_str[4] = {'s', 'e', 'l', 'f'};
  char cls_str[4] = {'c', 'l', 's', '\0'};
  bool first_self = *(int32_t*)symbol->name == *(int32_t*)self_str;
  bool first_cls = *(int32_t*)symbol->name == *(int32_t*)cls_str;

  // We re-use the same Symbol instance across loop iterations, which means
  // we will have left-over data in the struct. Although this won't affect
  // correctness of the result because we have '\0' at the end of the strings
  // read, it would hurt the effectiveness of the deduplication.
  // The helper bpf_perf_prog_read_value clears the buffer on error, so here we
  // (ab)use this behavior to clear the memory. It requires the size of Symbol
  // to be different from struct bpf_perf_event_value, which we check at
  // compilation time using the FAIL_COMPILATION_IF macro.
  bpf_perf_prog_read_value(ctx, symbol, sizeof(Symbol));

  // Read the class name from $frame->f_localsplus[0]->ob_type->tp_name.
  if (first_self || first_cls) {
    void* ptr;
    bpf_probe_read_user(
        &ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_localsplus);
    if (first_self) {
      // we are working with an instance; first we need to get its type
      bpf_probe_read_user(&ptr, sizeof(void*), ptr + offsets->PyObject_type);
    }
    bpf_probe_read_user(&ptr, sizeof(void*), ptr + offsets->PyTypeObject_name);
    bpf_probe_read_user_str(&symbol->classname, sizeof(symbol->classname), ptr);
  }

  void* pystr_ptr;
  // read PyCodeObject's filename into symbol
  bpf_probe_read_user(
      &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_filename);
  bpf_probe_read_user_str(
      &symbol->file, sizeof(symbol->file), pystr_ptr + offsets->String_data);
  // read PyCodeObject's name into symbol
  bpf_probe_read_user(
      &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_name);
  bpf_probe_read_user_str(
      &symbol->name, sizeof(symbol->name), pystr_ptr + offsets->String_data);
}

// get_frame_data reads the current PyFrameObject's filename/name and updates
// *frame_ptr with a pointer to the next PyFrameObject
static inline __attribute__((__always_inline__)) bool get_frame_data(
    void** frame_ptr,
    OffsetConfig* offsets,
    Symbol* symbol,
    // ctx is only used to call the helper that clears symbol, see the comments
    // in get_names
    void* ctx) {
  void* cur_frame = *frame_ptr;
  if (!cur_frame) {
    return false;
  }
  void* code_ptr;
  // read the PyCodeObject first; if that fails, there is no point reading the
  // next frame
  bpf_probe_read_user(
      &code_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_code);
  if (!code_ptr) {
    return false;
  }

  get_names(cur_frame, code_ptr, offsets, symbol, ctx);

  // read the next PyFrameObject pointer, update in place
  bpf_probe_read_user(
      frame_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_back);

  return true;
}

// To avoid duplicate ids, every CPU needs to use different ids when inserting
// into the hashmap. NUM_CPUS is defined on the PyPerf backend side and passed
// through a CFlag.
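// For example, with NUM_CPUS == 4, CPU 1 hands out ids 1, 5, 9, ..., so ids
// generated on different CPUs never collide.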
static inline __attribute__((__always_inline__)) int64_t get_symbol_id(
    sample_state_t* state,
    Symbol* sym) {
  int32_t* symbol_id_ptr = symbols.lookup(sym);
  if (symbol_id_ptr) {
    return *symbol_id_ptr;
  }
  // the symbol is new, bump the counter
  int32_t symbol_id = state->symbol_counter * NUM_CPUS + state->cur_cpu;
  state->symbol_counter++;
  symbols.update(sym, &symbol_id);
  return symbol_id;
}

int read_python_stack(struct pt_regs* ctx) {
  GET_STATE();

  state->python_stack_prog_call_cnt++;
  Event* sample = &state->event;

  Symbol sym = {};
  bool last_res = false;
#pragma unroll
  for (int i = 0; i < PYTHON_STACK_FRAMES_PER_PROG; i++) {
    last_res = get_frame_data(&state->frame_ptr, &state->offsets, &sym, ctx);
    if (last_res) {
      uint32_t symbol_id = get_symbol_id(state, &sym);
      int64_t cur_len = sample->stack_len;
      if (cur_len >= 0 && cur_len < STACK_MAX_LEN) {
        sample->stack[cur_len] = symbol_id;
        sample->stack_len++;
      }
    }
  }
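
  // Determine the stack status: COMPLETE if we walked past the last frame,
  // ERROR if the last frame read failed, TRUNCATED if frames remain but this
  // program's budget is exhausted.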
  if (!state->frame_ptr) {
    sample->stack_status = STACK_STATUS_COMPLETE;
  } else {
    if (!last_res) {
      sample->stack_status = STACK_STATUS_ERROR;
    } else {
      sample->stack_status = STACK_STATUS_TRUNCATED;
    }
  }

  if (sample->stack_status == STACK_STATUS_TRUNCATED &&
      state->python_stack_prog_call_cnt < PYTHON_STACK_PROG_CNT) {
    // read the next batch of frames
    progs.call(ctx, PYTHON_STACK_PROG_IDX);
  }

  return submit_sample(ctx, state);
}
)";

} // namespace pyperf
} // namespace ebpf