2
0
mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-09-04 20:19:47 +08:00

tracing changes for 6.17

- Deprecate auto-mounting tracefs to /sys/kernel/debug/tracing
 
   When tracefs was first introduced back in 2014, the directory
   /sys/kernel/tracing was added and is the designated location to mount
   tracefs. To keep backward compatibility, tracefs was auto-mounted in
   /sys/kernel/debug/tracing as well.
 
   All distros now mount tracefs on /sys/kernel/tracing. Having it seen in two
   different locations has led to various issues and inconsistencies.
 
   The VFS folks have to also maintain debugfs_create_automount() for this
   single user.
 
   It's been over 10 years. Tooling and scripts should start replacing the
   debugfs location with the tracefs one. The reason tracefs was created in the
   first place was to allow access to the tracing facilities without the need
   to configure debugfs into the kernel. Using tracefs should now be more
   robust.
 
   A new config is created: CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
   which is default y, so that the kernel is still built with the automount.
   This config allows those that want to remove the automount from debugfs to
   do so.
 
   When tracefs is accessed from /sys/kernel/debug/tracing, the following
   printk is triggered:
 
    pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
 
   This gives users another 5 years to fix their scripts.
 
 - Use queue_rcu_work() instead of call_rcu() for freeing event filters
 
   The number of filters to be freed can be many depending on the number of
   events within an event system. Freeing them from softirq context can
   potentially cause undesired latency. Use the RCU workqueue to free them
   instead.
 
 - Remove pointless memory barriers in latency code
 
   Memory barriers were added to some of the latency code a long time ago with
   the idea of "making them visible", but that's not what memory barriers are
   for. They are to synchronize access between different variables. There was
   no synchronization here making them pointless.
 
 - Remove "__attribute__()" from the type field of event format
 
   When LLVM is used to compile the kernel with CONFIG_DEBUG_INFO_BTF=y and
   PAHOLE_HAS_BTF_TAG=y, some of the format fields get expanded with the
   following:
 
     field:const char * filename;      offset:24;      size:8; signed:0;
 
   Turns into:
 
     field:const char __attribute__((btf_type_tag("user"))) * filename;      offset:24;      size:8; signed:0;
 
   This confuses parsers. Add code to strip these tags from the strings.
 
 - Add eprobe config option CONFIG_EPROBE_EVENTS
 
   Eprobes were added back in 5.15 but were only enabled when another probe was
   enabled (kprobe, fprobe, uprobe, etc). The eprobes had no config option
   of their own. Add one as they should be a separate entity.
 
   It's default y to keep with the old kernels but still has dependencies on
   TRACING and HAVE_REGS_AND_STACK_ACCESS_API.
 
 - Add eprobe documentation
 
   When eprobes were added back in 5.15 no documentation was added to describe
   them. This needs to be rectified.
 
 - Replace open coded cpumask_next_wrap() in move_to_next_cpu()
 
 - Have preemptirq_delay_run() use off-stack CPU mask
 
 - Remove obsolete comment about pelt_cfs event
 
   DECLARE_TRACE() appends "_tp" to trace events now, but the comment above
   pelt_cfs still mentioned appending it manually.
 
 - Remove EVENT_FILE_FL_SOFT_MODE flag
 
   The SOFT_MODE flag was required when the soft enabling and disabling of
   trace events was first introduced. But there was a bug with this approach
   as it only worked for a single instance. When multiple users required soft
   disabling and enabling, the code was changed to have a ref count. The
   SOFT_MODE flag is now set iff the ref count is non zero. This is redundant
   and just reading the ref count is good enough.
 
 - Fix typo in comment
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaIt5ZRQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qvriAPsEbOEgMrPF1Tdj1mHLVajYTxI8ft5J
 aX5bfM2cDDRVcgEA57JHOXp4d05dj555/hgAUuCWuFp/E0Anp45EnFTedgQ=
 =wKZW
 -----END PGP SIGNATURE-----

Merge tag 'trace-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull tracing updates from Steven Rostedt:

 - Deprecate auto-mounting tracefs to /sys/kernel/debug/tracing

   When tracefs was first introduced back in 2014, the directory
   /sys/kernel/tracing was added and is the designated location to mount
   tracefs. To keep backward compatibility, tracefs was auto-mounted in
   /sys/kernel/debug/tracing as well.

   All distros now mount tracefs on /sys/kernel/tracing. Having it seen
   in two different locations has led to various issues and
   inconsistencies.

   The VFS folks have to also maintain debugfs_create_automount() for
   this single user.

   It's been over 10 years. Tooling and scripts should start replacing
   the debugfs location with the tracefs one. The reason tracefs was
   created in the first place was to allow access to the tracing
   facilities without the need to configure debugfs into the kernel.
   Using tracefs should now be more robust.

   A new config is created: CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED which is
   default y, so that the kernel is still built with the automount. This
   config allows those that want to remove the automount from debugfs to
   do so.

   When tracefs is accessed from /sys/kernel/debug/tracing, the
   following printk is triggered:

     pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");

   This gives users another 5 years to fix their scripts.

 - Use queue_rcu_work() instead of call_rcu() for freeing event filters

   The number of filters to be freed can be many depending on the number
   of events within an event system. Freeing them from softirq context
   can potentially cause undesired latency. Use the RCU workqueue to
   free them instead.

 - Remove pointless memory barriers in latency code

   Memory barriers were added to some of the latency code a long time
   ago with the idea of "making them visible", but that's not what
   memory barriers are for. They are to synchronize access between
   different variables. There was no synchronization here making them
   pointless.

 - Remove "__attribute__()" from the type field of event format

   When LLVM is used to compile the kernel with CONFIG_DEBUG_INFO_BTF=y
   and PAHOLE_HAS_BTF_TAG=y, some of the format fields get expanded with
   the following:

     field:const char * filename;      offset:24;      size:8; signed:0;

   Turns into:

     field:const char __attribute__((btf_type_tag("user"))) * filename;      offset:24;      size:8; signed:0;

   This confuses parsers. Add code to strip these tags from the strings.

 - Add eprobe config option CONFIG_EPROBE_EVENTS

   Eprobes were added back in 5.15 but were only enabled when another
   probe was enabled (kprobe, fprobe, uprobe, etc). The eprobes had no
   config option of their own. Add one as they should be a separate
   entity.

   It's default y to keep with the old kernels but still has
   dependencies on TRACING and HAVE_REGS_AND_STACK_ACCESS_API.

 - Add eprobe documentation

   When eprobes were added back in 5.15 no documentation was added to
   describe them. This needs to be rectified.

 - Replace open coded cpumask_next_wrap() in move_to_next_cpu()

 - Have preemptirq_delay_run() use off-stack CPU mask

 - Remove obsolete comment about pelt_cfs event

   DECLARE_TRACE() appends "_tp" to trace events now, but the comment
   above pelt_cfs still mentioned appending it manually.

 - Remove EVENT_FILE_FL_SOFT_MODE flag

   The SOFT_MODE flag was required when the soft enabling and disabling
   of trace events was first introduced. But there was a bug with this
   approach as it only worked for a single instance. When multiple users
   required soft disabling and enabling, the code was changed to have a
   ref count. The SOFT_MODE flag is now set iff the ref count is non
   zero. This is redundant and just reading the ref count is good
   enough.

 - Fix typo in comment

* tag 'trace-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  Documentation: tracing: Add documentation about eprobes
  tracing: Have eprobes have their own config option
  tracing: Remove "__attribute__()" from the type field of event format
  tracing: Deprecate auto-mounting tracefs in debugfs
  tracing: Fix comment in trace_module_remove_events()
  tracing: Remove EVENT_FILE_FL_SOFT_MODE flag
  tracing: Remove pointless memory barriers
  tracing/sched: Remove obsolete comment on suffixes
  kernel: trace: preemptirq_delay_test: use offstack cpu mask
  tracing: Use queue_rcu_work() to free filters
  tracing: Replace opencoded cpumask_next_wrap() in move_to_next_cpu()
This commit is contained in:
Linus Torvalds 2025-08-01 10:29:36 -07:00
commit d6f38c1239
14 changed files with 499 additions and 86 deletions

View File

@ -0,0 +1,20 @@
What: /sys/kernel/debug/tracing
Date: May 2008
KernelVersion: 2.6.27
Contact: linux-trace-kernel@vger.kernel.org
Description:
When ftrace was first added to the kernel, its interface was placed
into the debugfs file system under the "tracing" directory. Access
to the files was in /sys/kernel/debug/tracing. As systems wanted
access to the tracing interface without having to enable debugfs, a
new interface was created called "tracefs". This was a stand alone
file system and was usually mounted in /sys/kernel/tracing.
To allow older tooling to continue to operate, when mounting
debugfs, the tracefs file system would automatically get mounted in
the "tracing" directory of debugfs. The tracefs interface was added
in January 2015 in the v4.1 kernel.
All tooling should now be using tracefs directly and the "tracing"
directory in debugfs should be removed by January 2030.

View File

@ -0,0 +1,269 @@
.. SPDX-License-Identifier: GPL-2.0
==================================
Eprobe - Event-based Probe Tracing
==================================
:Author: Steven Rostedt <rostedt@goodmis.org>
- Written for v6.17
Overview
========
Eprobes are dynamic events that are placed on existing events to either
dereference a field that is a pointer, or simply to limit what fields are
recorded in the trace event.
Eprobes depend on kprobe events so to enable this feature, build your kernel
with CONFIG_EPROBE_EVENTS=y.
Eprobes are created via the /sys/kernel/tracing/dynamic_events file.
Synopsis of eprobe_events
-------------------------
::
e[:[EGRP/][EEVENT]] GRP.EVENT [FETCHARGS] : Set a probe
-:[EGRP/][EEVENT] : Clear a probe
EGRP : Group name of the new event. If omitted, use "eprobes" for it.
EEVENT : Event name. If omitted, the event name is generated and will
be the same event name as the event it attached to.
GRP : Group name of the event to attach to.
EVENT : Event name of the event to attach to.
FETCHARGS : Arguments. Each probe can have up to 128 args.
$FIELD : Fetch the value of the event field called FIELD.
@ADDR : Fetch memory at ADDR (ADDR should be in kernel)
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
$comm : Fetch current task comm.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
"string", "ustring", "symbol", "symstr" and "bitfield" are
supported.
Types
-----
The FETCHARGS above is very similar to the kprobe events as described in
Documentation/trace/kprobetrace.rst.
The difference between eprobes and kprobes FETCHARGS is that eprobes have a
$FIELD command that returns the content of the event field of the event
that is attached. Eprobes do not have access to registers, stacks and function
arguments that kprobes have.
If a field argument is a pointer, it may be dereferenced just like a memory
address using the FETCHARGS syntax.
Attaching to dynamic events
---------------------------
Eprobes may attach to dynamic events as well as to normal events. It may
attach to a kprobe event, a synthetic event or a fprobe event. This is useful
if the type of a field needs to be changed. See Example 2 below.
Usage examples
==============
Example 1
---------
The basic usage of eprobes is to limit the data that is being recorded into
the tracing buffer. For example, a common event to trace is the sched_switch
trace event. That has a format of::
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:char prev_comm[16]; offset:8; size:16; signed:0;
field:pid_t prev_pid; offset:24; size:4; signed:1;
field:int prev_prio; offset:28; size:4; signed:1;
field:long prev_state; offset:32; size:8; signed:1;
field:char next_comm[16]; offset:40; size:16; signed:0;
field:pid_t next_pid; offset:56; size:4; signed:1;
field:int next_prio; offset:60; size:4; signed:1;
The first four fields are common to all events and can not be limited. But the
rest of the event has 60 bytes of information. It records the names of the
previous and next tasks being scheduled out and in, as well as their pids and
priorities. It also records the state of the previous task. If only the pids
of the tasks are of interest, why waste the ring buffer with all the other
fields?
An eprobe can limit what gets recorded. Note, it does not help in performance,
as all the fields are recorded in a temporary buffer to process the eprobe.
::
# echo 'e:sched/switch sched.sched_switch prev=$prev_pid:u32 next=$next_pid:u32' >> /sys/kernel/tracing/dynamic_events
# echo 1 > /sys/kernel/tracing/events/sched/switch/enable
# cat /sys/kernel/tracing/trace
# tracer: nop
#
# entries-in-buffer/entries-written: 2721/2721 #P:8
#
# _-----=> irqs-off/BH-disabled
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / _-=> migrate-disable
# |||| / delay
# TASK-PID CPU# ||||| TIMESTAMP FUNCTION
# | | | ||||| | |
sshd-session-1082 [004] d..4. 5041.239906: switch: (sched.sched_switch) prev=1082 next=0
bash-1085 [001] d..4. 5041.240198: switch: (sched.sched_switch) prev=1085 next=141
kworker/u34:5-141 [001] d..4. 5041.240259: switch: (sched.sched_switch) prev=141 next=1085
<idle>-0 [004] d..4. 5041.240354: switch: (sched.sched_switch) prev=0 next=1082
bash-1085 [001] d..4. 5041.240385: switch: (sched.sched_switch) prev=1085 next=141
kworker/u34:5-141 [001] d..4. 5041.240410: switch: (sched.sched_switch) prev=141 next=1085
bash-1085 [001] d..4. 5041.240478: switch: (sched.sched_switch) prev=1085 next=0
sshd-session-1082 [004] d..4. 5041.240526: switch: (sched.sched_switch) prev=1082 next=0
<idle>-0 [001] d..4. 5041.247524: switch: (sched.sched_switch) prev=0 next=90
<idle>-0 [002] d..4. 5041.247545: switch: (sched.sched_switch) prev=0 next=16
kworker/1:1-90 [001] d..4. 5041.247580: switch: (sched.sched_switch) prev=90 next=0
rcu_sched-16 [002] d..4. 5041.247591: switch: (sched.sched_switch) prev=16 next=0
<idle>-0 [002] d..4. 5041.257536: switch: (sched.sched_switch) prev=0 next=16
rcu_sched-16 [002] d..4. 5041.257573: switch: (sched.sched_switch) prev=16 next=0
Note, without adding the "u32" after the prev_pid and next_pid, the values
would default to showing in hexadecimal.
Example 2
---------
If a specific system call is to be recorded but the syscalls events are not
enabled, the raw_syscalls events can still be used (syscall events are not
normal events, but are created from the raw_syscalls events
within the kernel). In order to trace the openat system call, one can create
an event probe on top of the raw_syscalls event:
::
# cd /sys/kernel/tracing
# cat events/raw_syscalls/sys_enter/format
name: sys_enter
ID: 395
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:long id; offset:8; size:8; signed:1;
field:unsigned long args[6]; offset:16; size:48; signed:0;
print fmt: "NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", REC->id, REC->args[0], REC->args[1], REC->args[2], REC->args[3], REC->args[4], REC->args[5]
From the source code, the sys_openat() has:
::
int sys_openat(int dirfd, const char *path, int flags, mode_t mode)
{
return my_syscall4(__NR_openat, dirfd, path, flags, mode);
}
The path is the second parameter, and that is what is wanted.
::
# echo 'e:openat raw_syscalls.sys_enter nr=$id filename=+8($args):ustring' >> dynamic_events
This is being run on x86_64 where the word size is 8 bytes and the openat
system call __NR_openat is set at 257.
::
# echo 'nr == 257' > events/eprobes/openat/filter
Now enable the event and look at the trace.
::
# echo 1 > events/eprobes/openat/enable
# cat trace
# tracer: nop
#
# entries-in-buffer/entries-written: 4/4 #P:8
#
# _-----=> irqs-off/BH-disabled
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / _-=> migrate-disable
# |||| / delay
# TASK-PID CPU# ||||| TIMESTAMP FUNCTION
# | | | ||||| | |
cat-1298 [003] ...2. 2060.875970: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault)
cat-1298 [003] ...2. 2060.876197: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault)
cat-1298 [003] ...2. 2060.879126: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault)
cat-1298 [003] ...2. 2060.879639: openat: (raw_syscalls.sys_enter) nr=0x101 filename=(fault)
The filename shows "(fault)". This is likely because the filename has not been
pulled into memory yet and currently trace events cannot fault in memory that
is not present. When an eprobe tries to read memory that has not been faulted
in yet, it will show the "(fault)" text.
To get around this, as the kernel will likely pull in this filename and make
it present, attaching it to a synthetic event that can pass the address of the
filename from the entry of the event to the end of the event, this can be used
to show the filename when the system call returns.
Remove the old eprobe::
# echo 1 > events/eprobes/openat/enable
# echo '-:openat' >> dynamic_events
This time make an eprobe where the address of the filename is saved::
# echo 'e:openat_start raw_syscalls.sys_enter nr=$id filename=+8($args):x64' >> dynamic_events
Create a synthetic event that passes the address of the filename to the
end of the event::
# echo 's:filename u64 file' >> dynamic_events
# echo 'hist:keys=common_pid:f=filename if nr == 257' > events/eprobes/openat_start/trigger
# echo 'hist:keys=common_pid:file=$f:onmatch(eprobes.openat_start).trace(filename,$file) if id == 257' > events/raw_syscalls/sys_exit/trigger
Now that the address of the filename has been passed to the end of the
system call, create another eprobe to attach to the exit event to show the
string::
# echo 'e:openat synthetic.filename filename=+0($file):ustring' >> dynamic_events
# echo 1 > events/eprobes/openat/enable
# cat trace
# tracer: nop
#
# entries-in-buffer/entries-written: 4/4 #P:8
#
# _-----=> irqs-off/BH-disabled
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / _-=> migrate-disable
# |||| / delay
# TASK-PID CPU# ||||| TIMESTAMP FUNCTION
# | | | ||||| | |
cat-1331 [001] ...5. 2944.787977: openat: (synthetic.filename) filename="/etc/ld.so.cache"
cat-1331 [001] ...5. 2944.788480: openat: (synthetic.filename) filename="/lib/x86_64-linux-gnu/libc.so.6"
cat-1331 [001] ...5. 2944.793426: openat: (synthetic.filename) filename="/usr/lib/locale/locale-archive"
cat-1331 [001] ...5. 2944.831362: openat: (synthetic.filename) filename="trace"
Example 3
---------
If syscall trace events are available, the above would not need the first
eprobe, but it would still need the last one::
# echo 's:filename u64 file' >> dynamic_events
# echo 'hist:keys=common_pid:f=filename' > events/syscalls/sys_enter_openat/trigger
# echo 'hist:keys=common_pid:file=$f:onmatch(syscalls.sys_enter_openat).trace(filename,$file)' > events/syscalls/sys_exit_openat/trigger
# echo 'e:openat synthetic.filename filename=+0($file):ustring' >> dynamic_events
# echo 1 > events/eprobes/openat/enable
And this would produce the same result as Example 2.

View File

@ -36,6 +36,7 @@ the Linux kernel.
kprobes
kprobetrace
fprobetrace
eprobetrace
fprobe
ring-buffer-design

View File

@ -480,7 +480,6 @@ enum {
EVENT_FILE_FL_RECORDED_TGID_BIT,
EVENT_FILE_FL_FILTERED_BIT,
EVENT_FILE_FL_NO_SET_FILTER_BIT,
EVENT_FILE_FL_SOFT_MODE_BIT,
EVENT_FILE_FL_SOFT_DISABLED_BIT,
EVENT_FILE_FL_TRIGGER_MODE_BIT,
EVENT_FILE_FL_TRIGGER_COND_BIT,
@ -618,7 +617,6 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...);
* RECORDED_TGID - The tgids should be recorded at sched_switch
* FILTERED - The event has a filter attached
* NO_SET_FILTER - Set when filter has error and is to be ignored
* SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED
* SOFT_DISABLED - When set, do not trace the event (even though its
* tracepoint may be enabled)
* TRIGGER_MODE - When set, invoke the triggers associated with the event
@ -633,7 +631,6 @@ enum {
EVENT_FILE_FL_RECORDED_TGID = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT),
EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT),
EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT),
EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT),
EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT),
EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT),
EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),

View File

@ -829,8 +829,6 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
/*
* Following tracepoints are not exported in tracefs and provide hooking
* mechanisms only for testing and debugging purposes.
*
* Postfixed with _tp to make them easily identifiable in the code.
*/
DECLARE_TRACE(pelt_cfs,
TP_PROTO(struct cfs_rq *cfs_rq),

View File

@ -200,6 +200,19 @@ menuconfig FTRACE
if FTRACE
config TRACEFS_AUTOMOUNT_DEPRECATED
bool "Automount tracefs on debugfs [DEPRECATED]"
depends on TRACING
default y
help
The tracing interface was moved from /sys/kernel/debug/tracing
to /sys/kernel/tracing in 2015, but the tracing file system
was still automounted in /sys/kernel/debug for backward
compatibility with tooling.
The new interface has been around for more than 10 years and
the old debug mount will soon be removed.
config BOOTTIME_TRACING
bool "Boot-time Tracing support"
depends on TRACING
@ -780,6 +793,20 @@ config UPROBE_EVENTS
This option is required if you plan to use perf-probe subcommand
of perf tools on user space applications.
config EPROBE_EVENTS
bool "Enable event-based dynamic events"
depends on TRACING
depends on HAVE_REGS_AND_STACK_ACCESS_API
select PROBE_EVENTS
select DYNAMIC_EVENTS
default y
help
Eprobes are dynamic events that can be placed on other existing
events. It can be used to limit what fields are recorded in
an event or even dereference a field of an event. It can
convert the type of an event field. For example, turn an
address into a string.
config BPF_EVENTS
depends on BPF_SYSCALL
depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS

View File

@ -82,7 +82,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o

View File

@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
struct cpumask cpu_mask;
cpumask_var_t cpu_mask;
if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
if (cpu_affinity > -1) {
cpumask_clear(&cpu_mask);
cpumask_set_cpu(cpu_affinity, &cpu_mask);
if (set_cpus_allowed_ptr(current, &cpu_mask))
cpumask_clear(cpu_mask);
cpumask_set_cpu(cpu_affinity, cpu_mask);
if (set_cpus_allowed_ptr(current, cpu_mask))
pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
}
@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data)
__set_current_state(TASK_RUNNING);
free_cpumask_var(cpu_mask);
return 0;
}

View File

@ -674,8 +674,6 @@ static bool __read_mostly monitoring_on;
*/
bool rv_monitoring_on(void)
{
/* Ensures that concurrent monitors read consistent monitoring_on */
smp_rmb();
return READ_ONCE(monitoring_on);
}
@ -695,8 +693,6 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
static void turn_monitoring_off(void)
{
WRITE_ONCE(monitoring_on, false);
/* Ensures that concurrent monitors read consistent monitoring_on */
smp_wmb();
}
static void reset_all_monitors(void)
@ -712,8 +708,6 @@ static void reset_all_monitors(void)
static void turn_monitoring_on(void)
{
WRITE_ONCE(monitoring_on, true);
/* Ensures that concurrent monitors read consistent monitoring_on */
smp_wmb();
}
static void turn_monitoring_on_with_reset(void)

View File

@ -936,7 +936,6 @@ int tracing_is_enabled(void)
* return the mirror variable of the state of the ring buffer.
* It's a little racy, but we don't really care.
*/
smp_rmb();
return !global_trace.buffer_disabled;
}
@ -1107,8 +1106,6 @@ void tracer_tracing_on(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 0;
/* Make the flag seen by readers */
smp_wmb();
}
/**
@ -1640,8 +1637,6 @@ void tracer_tracing_off(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 1;
/* Make the flag seen by readers */
smp_wmb();
}
/**
@ -2710,8 +2705,6 @@ void trace_buffered_event_enable(void)
static void enable_trace_buffered_event(void *data)
{
/* Probably not needed, but do it anyway */
smp_rmb();
this_cpu_dec(trace_buffered_event_cnt);
}
@ -5931,17 +5924,27 @@ static inline void trace_insert_eval_map_file(struct module *mod,
struct trace_eval_map **start, int len) { }
#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
static void trace_insert_eval_map(struct module *mod,
struct trace_eval_map **start, int len)
static void
trace_event_update_with_eval_map(struct module *mod,
struct trace_eval_map **start,
int len)
{
struct trace_eval_map **map;
if (len <= 0)
return;
/* Always run sanitizer only if btf_type_tag attr exists. */
if (len <= 0) {
if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) &&
IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) &&
__has_attribute(btf_type_tag)))
return;
}
map = start;
trace_event_eval_update(map, len);
trace_event_update_all(map, len);
if (len <= 0)
return;
trace_insert_eval_map_file(mod, start, len);
}
@ -6297,7 +6300,7 @@ static bool tracer_options_updated;
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
if (!tr->dir)
if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL))
return;
/* Only create trace option files after update_tracer_options finish */
@ -8978,13 +8981,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
if (WARN_ON(!tr->dir))
return ERR_PTR(-ENODEV);
/* Top directory uses NULL as the parent */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
return NULL;
if (WARN_ON(!tr->dir))
return ERR_PTR(-ENODEV);
/* All sub buffers have a descriptor */
return tr->dir;
}
@ -10250,6 +10253,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
ftrace_init_tracefs(tr, d_tracer);
}
#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
@ -10271,6 +10275,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
if (IS_ERR(fc))
return ERR_CAST(fc);
pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
ret = vfs_parse_fs_string(fc, "source",
"tracefs", strlen("tracefs"));
if (!ret)
@ -10281,6 +10287,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
put_fs_context(fc);
return mnt;
}
#endif
/**
* tracing_init_dentry - initialize top level trace array
@ -10305,6 +10312,7 @@ int tracing_init_dentry(void)
if (WARN_ON(!tracefs_initialized()))
return -ENODEV;
#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
/*
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
@ -10313,6 +10321,7 @@ int tracing_init_dentry(void)
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
#endif
return 0;
}
@ -10329,7 +10338,7 @@ static void __init eval_map_work_func(struct work_struct *work)
int len;
len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len);
}
static int __init trace_eval_init(void)
@ -10382,9 +10391,6 @@ bool module_exists(const char *module)
static void trace_module_add_evals(struct module *mod)
{
if (!mod->num_trace_evals)
return;
/*
* Modules with bad taint do not have events created, do
* not bother with enums either.
@ -10392,7 +10398,8 @@ static void trace_module_add_evals(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
/* Even if no trace_evals, this need to sanitize field types. */
trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
#ifdef CONFIG_TRACE_EVAL_MAP_FILE

View File

@ -2125,13 +2125,13 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
void trace_event_eval_update(struct trace_eval_map **map, int len);
void trace_event_update_all(struct trace_eval_map **map, int len);
/* Used from boot time tracer */
extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
static inline void trace_event_update_all(struct trace_eval_map **map, int len) { }
#endif
#ifdef CONFIG_TRACER_SNAPSHOT

View File

@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
bool soft_mode = atomic_read(&file->sm_ref) != 0;
int ret = 0;
int disable;
@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* is set we do not want the event to be enabled before we
* clear the bit.
*
* When soft_disable is not set but the SOFT_MODE flag is,
* When soft_disable is not set but the soft_mode is,
* we do nothing. Do not disable the tracepoint, otherwise
* "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
*/
@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_dec_return(&file->sm_ref) > 0)
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
soft_mode = false;
/* Disable use of trace_buffered_event */
trace_buffered_event_disable();
} else
disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
disable = !soft_mode;
if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
WARN_ON_ONCE(ret);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
if (file->flags & EVENT_FILE_FL_SOFT_MODE)
/* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */
if (soft_mode)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* When soft_disable is set and enable is set, we want to
* register the tracepoint for the event, but leave the event
* as is. That means, if the event was already enabled, we do
* nothing (but set SOFT_MODE). If the event is disabled, we
* nothing (but set soft_mode). If the event is disabled, we
* set SOFT_DISABLED before enabling the event tracepoint, so
* it still seems to be disabled.
*/
@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
else {
if (atomic_inc_return(&file->sm_ref) > 1)
break;
set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
soft_mode = true;
/* Enable use of trace_buffered_event */
trace_buffered_event_enable();
}
@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
bool cmd = false, tgid = false;
/* Keep the event disabled, when going to SOFT_MODE. */
/* Keep the event disabled, when going to soft mode. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
!(flags & EVENT_FILE_FL_SOFT_DISABLED))
strcpy(buf, "1");
if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
flags & EVENT_FILE_FL_SOFT_MODE)
if (atomic_read(&file->sm_ref) != 0)
strcat(buf, "*");
strcat(buf, "\n");
@ -3267,43 +3267,120 @@ static void add_str_to_module(struct module *module, char *str)
list_add(&modstr->next, &module_strings);
}
#define ATTRIBUTE_STR "__attribute__("
#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1)
/*
 * Remove all "__attribute__(...)" annotations from @type so that the
 * field type string contains only the plain C type.
 *
 * Returns @type itself if nothing had to be removed, a newly allocated
 * (kstrdup'd) string if an attribute was stripped, or NULL on allocation
 * failure.  Callers detect "allocated" by comparing the result to @type.
 */
static char *sanitize_field_type(const char *type)
{
	char *attr, *tmp, *next, *ret = (char *)type;
	int depth;

	next = (char *)type;
	while ((attr = strstr(next, ATTRIBUTE_STR))) {
		/* Retry if "__attribute__(" is a part of another word. */
		if (attr != next && !isspace(attr[-1])) {
			next = attr + ATTRIBUTE_STR_LEN;
			continue;
		}

		/* Copy-on-write: only duplicate @type on the first real match. */
		if (ret == type) {
			ret = kstrdup(type, GFP_KERNEL);
			if (WARN_ON_ONCE(!ret))
				return NULL;
			/* Re-base @attr into the duplicated buffer. */
			attr = ret + (attr - type);
		}

		/* the ATTRIBUTE_STR already has the first '(' */
		depth = 1;
		next = attr + ATTRIBUTE_STR_LEN;
		/* Scan forward until the attribute's parentheses balance out. */
		do {
			tmp = strpbrk(next, "()");
			/* There are unbalanced parentheses; give back the original. */
			if (WARN_ON_ONCE(!tmp)) {
				kfree(ret);
				return (char *)type;
			}
			if (*tmp == '(')
				depth++;
			else
				depth--;
			next = tmp + 1;
		} while (depth > 0);
		next = skip_spaces(next);
		/* Splice out the attribute (in-buffer shift toward @attr). */
		strcpy(attr, next);
		/* Rescan from the splice point for further attributes. */
		next = attr;
	}
	return ret;
}
/*
 * Locate an eval (enum/sizeof) name used as an array size inside @type.
 *
 * Looks for a '[' in @type whose following characters start like a C
 * identifier and match the first @len bytes of @eval_string.
 *
 * Returns a pointer to the character just after the '[' on a match,
 * or NULL when @eval_string is NULL, @type has no array suffix, the
 * size does not look like an identifier, or the names differ.
 */
static char *find_replacable_eval(const char *type, const char *eval_string,
				  int len)
{
	char *name = eval_string ? strchr(type, '[') : NULL;

	if (!name)
		return NULL;

	name++;
	/* The array size must begin like a C identifier. */
	if (isalpha(*name) || *name == '_')
		return strncmp(eval_string, name, len) ? NULL : name;

	return NULL;
}
static void update_event_fields(struct trace_event_call *call,
struct trace_eval_map *map)
{
struct ftrace_event_field *field;
const char *eval_string = NULL;
struct list_head *head;
int len = 0;
char *ptr;
char *str;
int len = strlen(map->eval_string);
/* Dynamic events should never have field maps */
if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
if (call->flags & TRACE_EVENT_FL_DYNAMIC)
return;
if (map) {
eval_string = map->eval_string;
len = strlen(map->eval_string);
}
head = trace_get_fields(call);
list_for_each_entry(field, head, link) {
ptr = strchr(field->type, '[');
if (!ptr)
continue;
ptr++;
if (!isalpha(*ptr) && *ptr != '_')
continue;
if (strncmp(map->eval_string, ptr, len) != 0)
continue;
str = kstrdup(field->type, GFP_KERNEL);
if (WARN_ON_ONCE(!str))
str = sanitize_field_type(field->type);
if (!str)
return;
ptr = str + (ptr - field->type);
ptr = eval_replace(ptr, map, len);
/* enum/sizeof string smaller than value */
if (WARN_ON_ONCE(!ptr)) {
kfree(str);
continue;
ptr = find_replacable_eval(str, eval_string, len);
if (ptr) {
if (str == field->type) {
str = kstrdup(field->type, GFP_KERNEL);
if (WARN_ON_ONCE(!str))
return;
ptr = str + (ptr - field->type);
}
ptr = eval_replace(ptr, map, len);
/* enum/sizeof string smaller than value */
if (WARN_ON_ONCE(!ptr)) {
kfree(str);
continue;
}
}
if (str == field->type)
continue;
/*
* If the event is part of a module, then we need to free the string
* when the module is removed. Otherwise, it will stay allocated
@ -3313,14 +3390,18 @@ static void update_event_fields(struct trace_event_call *call,
add_str_to_module(call->module, str);
field->type = str;
if (field->filter_type == FILTER_OTHER)
field->filter_type = filter_assign_type(field->type);
}
}
void trace_event_eval_update(struct trace_eval_map **map, int len)
/* Update all events for replacing eval and sanitizing */
void trace_event_update_all(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
bool first = false;
bool updated;
int last_i;
int i;
@ -3333,6 +3414,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
last_system = call->class->system;
}
updated = false;
/*
* Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
@ -3352,8 +3434,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
update_event_printk(call, map[i]);
update_event_fields(call, map[i]);
updated = true;
}
}
/* If not updated yet, update field for sanitizing. */
if (!updated)
update_event_fields(call, NULL);
cond_resched();
}
up_write(&trace_event_sem);
@ -3587,7 +3673,7 @@ static int probe_remove_event_call(struct trace_event_call *call)
continue;
/*
* We can't rely on ftrace_event_enable_disable(enable => 0)
* we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
* we are going to do, soft mode can suppress
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
@ -3698,7 +3784,7 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
/* Check for any strings allocade for this module */
/* Check for any strings allocated for this module */
list_for_each_entry_safe(modstr, m, &module_strings, next) {
if (modstr->module != mod)
continue;
@ -4002,7 +4088,7 @@ static int free_probe_data(void *data)
edata->ref--;
if (!edata->ref) {
/* Remove the SOFT_MODE flag */
/* Remove soft mode */
__ftrace_event_enable_disable(edata->file, 0, 1);
trace_event_put_ref(edata->file->event_call);
kfree(edata);

View File

@ -1344,13 +1344,14 @@ struct filter_list {
struct filter_head {
struct list_head list;
struct rcu_head rcu;
union {
struct rcu_head rcu;
struct rcu_work rwork;
};
};
static void free_filter_list(struct rcu_head *rhp)
static void free_filter_list(struct filter_head *filter_list)
{
struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
struct filter_list *filter_item, *tmp;
list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp)
kfree(filter_list);
}
/*
 * Workqueue callback: free a filter_head (and all filters on its list)
 * after the RCU grace period has elapsed.
 */
static void free_filter_list_work(struct work_struct *work)
{
	struct rcu_work *rwork = to_rcu_work(work);

	free_filter_list(container_of(rwork, struct filter_head, rwork));
}
static void free_filter_list_tasks(struct rcu_head *rhp)
{
call_rcu(rhp, free_filter_list);
struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
queue_rcu_work(system_wq, &filter_list->rwork);
}
/*
@ -1460,7 +1472,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
tracepoint_synchronize_unregister();
if (head)
free_filter_list(&head->rcu);
free_filter_list(head);
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir || !file->filter)
@ -2305,7 +2317,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
return 0;
fail:
/* No call succeeded */
free_filter_list(&filter_list->rcu);
free_filter_list(filter_list);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
@ -2315,7 +2327,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
if (!fail)
delay_free_filter(filter_list);
else
free_filter_list(&filter_list->rcu);
free_filter_list(filter_list);
return -ENOMEM;
}

View File

@ -325,12 +325,9 @@ static void move_to_next_cpu(void)
cpus_read_lock();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask);
cpus_read_unlock();
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(current_mask);
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto change_mode;