mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-04 20:19:47 +08:00

commit 8b3b1bb3ea
The control knobs set before loading BPF programs should be declared as
'const volatile' so that they can be optimized by the BPF core.
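A 'const volatile' global lands in the skeleton's read-only data, so the
loader can only set it between opening and loading the skeleton; once
loaded, the verifier sees a fixed value and can drop the branches it
guards. A minimal sketch of that flow, using the usual bpftool-generated
skeleton names (illustrative, not a verbatim copy of perf's loader code):

  #include "off_cpu.skel.h"  /* skeleton generated from off_cpu.bpf.c */

  struct off_cpu_bpf *skel = off_cpu_bpf__open();

  if (!skel)
          return -1;

  /* set the 'const volatile' knobs while .rodata is still writable */
  skel->rodata->has_task = 1;
  skel->rodata->uses_tgid = 1;

  if (off_cpu_bpf__load(skel) || off_cpu_bpf__attach(skel))
          return -1;

  /* 'enabled' is a plain global in .bss, so it can still be toggled later */
  skel->bss->enabled = 1;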
Committer testing:
  root@x1:~# perf record --off-cpu
  ^C[ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 1.807 MB perf.data (5645 samples) ]
  root@x1:~# perf evlist
  cpu_atom/cycles/P
  cpu_core/cycles/P
  offcpu-time
  dummy:u
  root@x1:~# perf evlist -v
  cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0xa00000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x400000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  offcpu-time: type: 1 (software), size: 136, config: 0xa (PERF_COUNT_SW_BPF_OUTPUT), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1
  dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
  root@x1:~# perf trace -e bpf --max-events 5 perf record --off-cpu
       0.000 ( 0.015 ms): :2949124/2949124 bpf(cmd: 36, uattr: 0x7ffefc6dbe30, size: 8)          = -1 EOPNOTSUPP (Operation not supported)
       0.031 ( 0.115 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbb60, size: 148) = 14
       0.159 ( 0.037 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbc20, size: 148) = 14
      23.868 ( 0.144 ms): perf/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbad0, size: 148)     = 14
      24.027 ( 0.014 ms): perf/2949124 bpf(uattr: 0x7ffefc6dbc80, size: 80)                      = 14
  root@x1:~#
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20240902200515.2103769-6-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

285 lines · 6.0 KiB · C
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;

const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent to reuse the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";