mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 2b5067a814
			
		
	
	
		2b5067a814
		
	
	
	
	
		
			
			The goal of these tracepoints is to be able to debug lock contention issues. This lock is acquired on most (all?) mmap / munmap / page fault operations, so a multi-threaded process which does a lot of these can experience significant contention. We trace just before we start acquisition, when the acquisition returns (whether it succeeded or not), and when the lock is released (or downgraded). The events are broken out by lock type (read / write). The events are also broken out by memcg path. For container-based workloads, users often think of several processes in a memcg as a single logical "task", so collecting statistics at this level is useful. The end goal is to get latency information. This isn't directly included in the trace events. Instead, users are expected to compute the time between "start locking" and "acquire returned", using e.g. synthetic events or BPF. The benefit we get from this is simpler code. Because we use tracepoint_enabled() to decide whether or not to trace, this patch has effectively no overhead unless tracepoints are enabled at runtime. If tracepoints are enabled, there is a performance impact, but how much depends on exactly what e.g. the BPF program does. 
[axelrasmussen@google.com: fix use-after-free race and css ref leak in tracepoints] Link: https://lkml.kernel.org/r/20201130233504.3725241-1-axelrasmussen@google.com [axelrasmussen@google.com: v3] Link: https://lkml.kernel.org/r/20201207213358.573750-1-axelrasmussen@google.com [rostedt@goodmis.org: in-depth examples of tracepoint_enabled() usage, and per-cpu-per-context buffer design] Link: https://lkml.kernel.org/r/20201105211739.568279-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michel Lespinasse <walken@google.com> Cc: Daniel Jordan <daniel.m.jordan@oracle.com> Cc: Jann Horn <jannh@google.com> Cc: Chinwen Chang <chinwen.chang@mediatek.com> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Yafang Shao <laoar.shao@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
			
				
	
	
		
			231 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			231 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0
 | |
| #define CREATE_TRACE_POINTS
 | |
| #include <trace/events/mmap_lock.h>
 | |
| 
 | |
| #include <linux/mm.h>
 | |
| #include <linux/cgroup.h>
 | |
| #include <linux/memcontrol.h>
 | |
| #include <linux/mmap_lock.h>
 | |
| #include <linux/mutex.h>
 | |
| #include <linux/percpu.h>
 | |
| #include <linux/rcupdate.h>
 | |
| #include <linux/smp.h>
 | |
| #include <linux/trace_events.h>
 | |
| 
 | |
| EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
 | |
| EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
 | |
| EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
 | |
| 
 | |
| #ifdef CONFIG_MEMCG
 | |
| 
 | |
| /*
 | |
|  * Our various events all share the same buffer (because we don't want or need
 | |
|  * to allocate a set of buffers *per event type*), so we need to protect against
 | |
|  * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
 | |
|  * been made.
 | |
|  */
 | |
| static DEFINE_MUTEX(reg_lock);
 | |
| static int reg_refcount; /* Protected by reg_lock. */
 | |
| 
 | |
| /*
 | |
|  * Size of the buffer for memcg path names. Ignoring stack trace support,
 | |
|  * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
 | |
|  */
 | |
| #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
 | |
| 
 | |
| /*
 | |
|  * How many contexts our trace events might be called in: normal, softirq, irq,
 | |
|  * and NMI.
 | |
|  */
 | |
| #define CONTEXT_COUNT 4
 | |
| 
 | |
| static DEFINE_PER_CPU(char __rcu *, memcg_path_buf);
 | |
| static char **tmp_bufs;
 | |
| static DEFINE_PER_CPU(int, memcg_path_buf_idx);
 | |
| 
 | |
| /* Called with reg_lock held. */
 | |
| static void free_memcg_path_bufs(void)
 | |
| {
 | |
| 	int cpu;
 | |
| 	char **old = tmp_bufs;
 | |
| 
 | |
| 	for_each_possible_cpu(cpu) {
 | |
| 		*(old++) = rcu_dereference_protected(
 | |
| 			per_cpu(memcg_path_buf, cpu),
 | |
| 			lockdep_is_held(®_lock));
 | |
| 		rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL);
 | |
| 	}
 | |
| 
 | |
| 	/* Wait for inflight memcg_path_buf users to finish. */
 | |
| 	synchronize_rcu();
 | |
| 
 | |
| 	old = tmp_bufs;
 | |
| 	for_each_possible_cpu(cpu) {
 | |
| 		kfree(*(old++));
 | |
| 	}
 | |
| 
 | |
| 	kfree(tmp_bufs);
 | |
| 	tmp_bufs = NULL;
 | |
| }
 | |
| 
 | |
| int trace_mmap_lock_reg(void)
 | |
| {
 | |
| 	int cpu;
 | |
| 	char *new;
 | |
| 
 | |
| 	mutex_lock(®_lock);
 | |
| 
 | |
| 	/* If the refcount is going 0->1, proceed with allocating buffers. */
 | |
| 	if (reg_refcount++)
 | |
| 		goto out;
 | |
| 
 | |
| 	tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
 | |
| 				 GFP_KERNEL);
 | |
| 	if (tmp_bufs == NULL)
 | |
| 		goto out_fail;
 | |
| 
 | |
| 	for_each_possible_cpu(cpu) {
 | |
| 		new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
 | |
| 		if (new == NULL)
 | |
| 			goto out_fail_free;
 | |
| 		rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new);
 | |
| 		/* Don't need to wait for inflights, they'd have gotten NULL. */
 | |
| 	}
 | |
| 
 | |
| out:
 | |
| 	mutex_unlock(®_lock);
 | |
| 	return 0;
 | |
| 
 | |
| out_fail_free:
 | |
| 	free_memcg_path_bufs();
 | |
| out_fail:
 | |
| 	/* Since we failed, undo the earlier ref increment. */
 | |
| 	--reg_refcount;
 | |
| 
 | |
| 	mutex_unlock(®_lock);
 | |
| 	return -ENOMEM;
 | |
| }
 | |
| 
 | |
| void trace_mmap_lock_unreg(void)
 | |
| {
 | |
| 	mutex_lock(®_lock);
 | |
| 
 | |
| 	/* If the refcount is going 1->0, proceed with freeing buffers. */
 | |
| 	if (--reg_refcount)
 | |
| 		goto out;
 | |
| 
 | |
| 	free_memcg_path_bufs();
 | |
| 
 | |
| out:
 | |
| 	mutex_unlock(®_lock);
 | |
| }
 | |
| 
 | |
| static inline char *get_memcg_path_buf(void)
 | |
| {
 | |
| 	char *buf;
 | |
| 	int idx;
 | |
| 
 | |
| 	rcu_read_lock();
 | |
| 	buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf));
 | |
| 	if (buf == NULL) {
 | |
| 		rcu_read_unlock();
 | |
| 		return NULL;
 | |
| 	}
 | |
| 	idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) -
 | |
| 	      MEMCG_PATH_BUF_SIZE;
 | |
| 	return &buf[idx];
 | |
| }
 | |
| 
 | |
| static inline void put_memcg_path_buf(void)
 | |
| {
 | |
| 	this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE);
 | |
| 	rcu_read_unlock();
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Write the given mm_struct's memcg path to a percpu buffer, and return a
 | |
|  * pointer to it. If the path cannot be determined, or no buffer was available
 | |
|  * (because the trace event is being unregistered), NULL is returned.
 | |
|  *
 | |
|  * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
 | |
|  * disabled by the caller before calling us, and re-enabled only after the
 | |
|  * caller is done with the pointer.
 | |
|  *
 | |
|  * The caller must call put_memcg_path_buf() once the buffer is no longer
 | |
|  * needed. This must be done while preemption is still disabled.
 | |
|  */
 | |
| static const char *get_mm_memcg_path(struct mm_struct *mm)
 | |
| {
 | |
| 	char *buf = NULL;
 | |
| 	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
 | |
| 
 | |
| 	if (memcg == NULL)
 | |
| 		goto out;
 | |
| 	if (unlikely(memcg->css.cgroup == NULL))
 | |
| 		goto out_put;
 | |
| 
 | |
| 	buf = get_memcg_path_buf();
 | |
| 	if (buf == NULL)
 | |
| 		goto out_put;
 | |
| 
 | |
| 	cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
 | |
| 
 | |
| out_put:
 | |
| 	css_put(&memcg->css);
 | |
| out:
 | |
| 	return buf;
 | |
| }
 | |
| 
 | |
| #define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
 | |
| 	do {                                                                   \
 | |
| 		const char *memcg_path;                                        \
 | |
| 		preempt_disable();                                             \
 | |
| 		memcg_path = get_mm_memcg_path(mm);                            \
 | |
| 		trace_mmap_lock_##type(mm,                                     \
 | |
| 				       memcg_path != NULL ? memcg_path : "",   \
 | |
| 				       ##__VA_ARGS__);                         \
 | |
| 		if (likely(memcg_path != NULL))                                \
 | |
| 			put_memcg_path_buf();                                  \
 | |
| 		preempt_enable();                                              \
 | |
| 	} while (0)
 | |
| 
 | |
| #else /* !CONFIG_MEMCG */
 | |
| 
 | |
/*
 * Without CONFIG_MEMCG there are no memcg path buffers to allocate, so
 * registration always succeeds and returns 0.
 */
int trace_mmap_lock_reg(void)
{
	return 0;
}

/* Without CONFIG_MEMCG there are no memcg path buffers to free. */
void trace_mmap_lock_unreg(void)
{
}

| #define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
 | |
| 	trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
 | |
| 
 | |
| #endif /* CONFIG_MEMCG */
 | |
| 
 | |
| /*
 | |
|  * Trace calls must be in a separate file, as otherwise there's a circular
 | |
|  * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 | |
|  */
 | |
| 
 | |
| void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
 | |
| {
 | |
| 	TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
 | |
| }
 | |
| EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
 | |
| 
 | |
| void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
 | |
| 					   bool success)
 | |
| {
 | |
| 	TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
 | |
| }
 | |
| EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
 | |
| 
 | |
| void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
 | |
| {
 | |
| 	TRACE_MMAP_LOCK_EVENT(released, mm, write);
 | |
| }
 | |
| EXPORT_SYMBOL(__mmap_lock_do_trace_released);
 |