mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-22 07:27:12 +08:00
Problem
=======
Commit 658eb5ab91 ("delayacct: add delay max to record delay peak")
introduced the delay max for getdelays, which records abnormal latency
peaks and helps us understand the magnitude of such delays. However, the
peak latency value alone is insufficient for effective root cause
analysis. Without the precise timestamp of when the peak occurred, we
still lack the critical context needed to correlate it with other system
events.
Solution
========
To address this, we need to additionally record a precise timestamp when
the maximum latency occurs. By correlating this timestamp with system
logs and monitoring metrics, we can identify processes with abnormal
resource usage at the same moment, which can help us to pinpoint root
causes.
Use Case
========
bash-4.4# ./getdelays -d -t 227
print delayacct stats ON
TGID 227
CPU count real total virtual total delay total delay average delay max delay min delay max timestamp
46 188000000 192348334 4098012 0.089ms 0.429260ms 0.051205ms 2026-01-15T15:06:58
IO count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
SWAP count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
RECLAIM count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
THRASHING count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
COMPACT count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
WPCOPY count delay total delay average delay max delay min delay max timestamp
182 19413338 0.107ms 0.547353ms 0.022462ms 2026-01-15T15:05:24
IRQ count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
Link: https://lkml.kernel.org/r/20260119100241520gWubW8-5QfhSf9gjqcc_E@zte.com.cn
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
319 lines
8.1 KiB
C
319 lines
8.1 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
/* delayacct.c - per-task delay accounting
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 */
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/task.h>
|
|
#include <linux/sched/cputime.h>
|
|
#include <linux/sched/clock.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/taskstats.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/delayacct.h>
|
|
#include <linux/module.h>
|
|
|
|
/*
 * UPDATE_DELAY(type) - fold one delay category of a task into a stats record.
 *
 * Expects three names in the expanding scope: @d (destination record),
 * @tsk (task whose ->delays is read) and @tmp (scratch integer).
 *
 * The max, min and max-timestamp fields of @d are overwritten with the
 * task's values. The task's delay is added to @d's running total; if the
 * sum compares lower than the previous total (i.e. it wrapped), the total
 * is reset to 0. The task's count is added to @d's count.
 */
#define UPDATE_DELAY(type) \
do { \
	d->type##_delay_max = tsk->delays->type##_delay_max; \
	d->type##_delay_min = tsk->delays->type##_delay_min; \
	d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \
	tmp = d->type##_delay_total + tsk->delays->type##_delay; \
	d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
	d->type##_count += tsk->delays->type##_count; \
} while (0)
|
|
|
|
DEFINE_STATIC_KEY_FALSE(delayacct_key);
|
|
int delayacct_on __read_mostly; /* Delay accounting turned on/off */
|
|
struct kmem_cache *delayacct_cache;
|
|
|
|
/*
 * Switch delay accounting on or off.
 *
 * When enabling, the static key is flipped before delayacct_on is set;
 * when disabling, delayacct_on is cleared before the key is flipped.
 */
static void set_delayacct(bool enabled)
{
	if (!enabled) {
		delayacct_on = 0;
		static_branch_disable(&delayacct_key);
		return;
	}

	static_branch_enable(&delayacct_key);
	delayacct_on = 1;
}
|
|
|
|
static int __init delayacct_setup_enable(char *str)
|
|
{
|
|
delayacct_on = 1;
|
|
return 1;
|
|
}
|
|
__setup("delayacct", delayacct_setup_enable);
|
|
|
|
void delayacct_init(void)
|
|
{
|
|
delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
|
|
delayacct_tsk_init(&init_task);
|
|
set_delayacct(delayacct_on);
|
|
}
|
|
|
|
#ifdef CONFIG_PROC_SYSCTL
|
|
static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer,
|
|
size_t *lenp, loff_t *ppos)
|
|
{
|
|
int state = delayacct_on;
|
|
struct ctl_table t;
|
|
int err;
|
|
|
|
if (write && !capable(CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
|
|
t = *table;
|
|
t.data = &state;
|
|
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
|
|
if (err < 0)
|
|
return err;
|
|
if (write)
|
|
set_delayacct(state);
|
|
return err;
|
|
}
|
|
|
|
static const struct ctl_table kern_delayacct_table[] = {
|
|
{
|
|
.procname = "task_delayacct",
|
|
.data = NULL,
|
|
.maxlen = sizeof(unsigned int),
|
|
.mode = 0644,
|
|
.proc_handler = sysctl_delayacct,
|
|
.extra1 = SYSCTL_ZERO,
|
|
.extra2 = SYSCTL_ONE,
|
|
},
|
|
};
|
|
|
|
static __init int kernel_delayacct_sysctls_init(void)
|
|
{
|
|
register_sysctl_init("kernel", kern_delayacct_table);
|
|
return 0;
|
|
}
|
|
late_initcall(kernel_delayacct_sysctls_init);
|
|
#endif
|
|
|
|
void __delayacct_tsk_init(struct task_struct *tsk)
|
|
{
|
|
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
|
|
if (tsk->delays)
|
|
raw_spin_lock_init(&tsk->delays->lock);
|
|
}
|
|
|
|
/*
|
|
* Finish delay accounting for a statistic using its timestamps (@start),
|
|
* accumulator (@total) and @count
|
|
*/
|
|
static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count,
|
|
u64 *max, u64 *min, struct timespec64 *ts)
|
|
{
|
|
s64 ns = local_clock() - *start;
|
|
unsigned long flags;
|
|
|
|
if (ns > 0) {
|
|
raw_spin_lock_irqsave(lock, flags);
|
|
*total += ns;
|
|
(*count)++;
|
|
if (ns > *max) {
|
|
*max = ns;
|
|
ktime_get_real_ts64(ts);
|
|
}
|
|
if (*min == 0 || ns < *min)
|
|
*min = ns;
|
|
raw_spin_unlock_irqrestore(lock, flags);
|
|
}
|
|
}
|
|
|
|
void __delayacct_blkio_start(void)
|
|
{
|
|
current->delays->blkio_start = local_clock();
|
|
}
|
|
|
|
/*
|
|
* We cannot rely on the `current` macro, as we haven't yet switched back to
|
|
* the process being woken.
|
|
*/
|
|
void __delayacct_blkio_end(struct task_struct *p)
|
|
{
|
|
delayacct_end(&p->delays->lock,
|
|
&p->delays->blkio_start,
|
|
&p->delays->blkio_delay,
|
|
&p->delays->blkio_count,
|
|
&p->delays->blkio_delay_max,
|
|
&p->delays->blkio_delay_min,
|
|
&p->delays->blkio_delay_max_ts);
|
|
}
|
|
|
|
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
|
|
{
|
|
u64 utime, stime, stimescaled, utimescaled;
|
|
unsigned long long t2, t3;
|
|
unsigned long flags, t1;
|
|
s64 tmp;
|
|
|
|
task_cputime(tsk, &utime, &stime);
|
|
tmp = (s64)d->cpu_run_real_total;
|
|
tmp += utime + stime;
|
|
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
|
|
|
|
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
|
|
tmp = (s64)d->cpu_scaled_run_real_total;
|
|
tmp += utimescaled + stimescaled;
|
|
d->cpu_scaled_run_real_total =
|
|
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
|
|
|
|
/*
|
|
* No locking available for sched_info (and too expensive to add one)
|
|
* Mitigate by taking snapshot of values
|
|
*/
|
|
t1 = tsk->sched_info.pcount;
|
|
t2 = tsk->sched_info.run_delay;
|
|
t3 = tsk->se.sum_exec_runtime;
|
|
|
|
d->cpu_count += t1;
|
|
|
|
d->cpu_delay_max = tsk->sched_info.max_run_delay;
|
|
d->cpu_delay_min = tsk->sched_info.min_run_delay;
|
|
d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts;
|
|
tmp = (s64)d->cpu_delay_total + t2;
|
|
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
|
|
tmp = (s64)d->cpu_run_virtual_total + t3;
|
|
|
|
d->cpu_run_virtual_total =
|
|
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
|
|
|
|
if (!tsk->delays)
|
|
return 0;
|
|
|
|
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
|
|
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
|
|
UPDATE_DELAY(blkio);
|
|
UPDATE_DELAY(swapin);
|
|
UPDATE_DELAY(freepages);
|
|
UPDATE_DELAY(thrashing);
|
|
UPDATE_DELAY(compact);
|
|
UPDATE_DELAY(wpcopy);
|
|
UPDATE_DELAY(irq);
|
|
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
|
|
|
|
return 0;
|
|
}
|
|
|
|
__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
|
|
{
|
|
__u64 ret;
|
|
unsigned long flags;
|
|
|
|
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
|
|
ret = nsec_to_clock_t(tsk->delays->blkio_delay);
|
|
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
|
|
return ret;
|
|
}
|
|
|
|
void __delayacct_freepages_start(void)
|
|
{
|
|
current->delays->freepages_start = local_clock();
|
|
}
|
|
|
|
void __delayacct_freepages_end(void)
|
|
{
|
|
delayacct_end(¤t->delays->lock,
|
|
¤t->delays->freepages_start,
|
|
¤t->delays->freepages_delay,
|
|
¤t->delays->freepages_count,
|
|
¤t->delays->freepages_delay_max,
|
|
¤t->delays->freepages_delay_min,
|
|
¤t->delays->freepages_delay_max_ts);
|
|
}
|
|
|
|
/*
 * Begin a thrashing delay on the current task.
 *
 * @in_thrashing receives whether the task was already marked as
 * thrashing. In that case nothing else happens; otherwise the task is
 * marked and the start time is stamped.
 */
void __delayacct_thrashing_start(bool *in_thrashing)
{
	*in_thrashing = !!current->in_thrashing;

	if (!*in_thrashing) {
		current->in_thrashing = 1;
		current->delays->thrashing_start = local_clock();
	}
}
|
|
|
|
/*
 * End a thrashing delay on the current task.
 *
 * @in_thrashing is the flag filled in by __delayacct_thrashing_start().
 * If it is set, the task was already thrashing when the matching start
 * was called, so this call does nothing. Otherwise the task's thrashing
 * mark is cleared and the interval is folded into the thrashing
 * statistics via delayacct_end().
 */
void __delayacct_thrashing_end(bool *in_thrashing)
{
	if (*in_thrashing)
		return;

	current->in_thrashing = 0;
	delayacct_end(&current->delays->lock,
		      &current->delays->thrashing_start,
		      &current->delays->thrashing_delay,
		      &current->delays->thrashing_count,
		      &current->delays->thrashing_delay_max,
		      &current->delays->thrashing_delay_min,
		      &current->delays->thrashing_delay_max_ts);
}
|
|
|
|
void __delayacct_swapin_start(void)
|
|
{
|
|
current->delays->swapin_start = local_clock();
|
|
}
|
|
|
|
void __delayacct_swapin_end(void)
|
|
{
|
|
delayacct_end(¤t->delays->lock,
|
|
¤t->delays->swapin_start,
|
|
¤t->delays->swapin_delay,
|
|
¤t->delays->swapin_count,
|
|
¤t->delays->swapin_delay_max,
|
|
¤t->delays->swapin_delay_min,
|
|
¤t->delays->swapin_delay_max_ts);
|
|
}
|
|
|
|
void __delayacct_compact_start(void)
|
|
{
|
|
current->delays->compact_start = local_clock();
|
|
}
|
|
|
|
void __delayacct_compact_end(void)
|
|
{
|
|
delayacct_end(¤t->delays->lock,
|
|
¤t->delays->compact_start,
|
|
¤t->delays->compact_delay,
|
|
¤t->delays->compact_count,
|
|
¤t->delays->compact_delay_max,
|
|
¤t->delays->compact_delay_min,
|
|
¤t->delays->compact_delay_max_ts);
|
|
}
|
|
|
|
void __delayacct_wpcopy_start(void)
|
|
{
|
|
current->delays->wpcopy_start = local_clock();
|
|
}
|
|
|
|
void __delayacct_wpcopy_end(void)
|
|
{
|
|
delayacct_end(¤t->delays->lock,
|
|
¤t->delays->wpcopy_start,
|
|
¤t->delays->wpcopy_delay,
|
|
¤t->delays->wpcopy_count,
|
|
¤t->delays->wpcopy_delay_max,
|
|
¤t->delays->wpcopy_delay_min,
|
|
¤t->delays->wpcopy_delay_max_ts);
|
|
}
|
|
|
|
/*
 * Account an IRQ delay of @delta against @task.
 *
 * With the task's delay lock held: the IRQ delay total grows by @delta
 * and the IRQ count by one (even when @delta is 0). A new maximum also
 * records the current real time. The minimum only tracks non-zero
 * deltas, with a stored 0 treated as "not set yet".
 */
void __delayacct_irq(struct task_struct *task, u32 delta)
{
	struct task_delay_info *info = task->delays;
	unsigned long flags;

	raw_spin_lock_irqsave(&info->lock, flags);

	info->irq_delay += delta;
	info->irq_count++;

	if (delta > info->irq_delay_max) {
		info->irq_delay_max = delta;
		ktime_get_real_ts64(&info->irq_delay_max_ts);
	}

	if (delta && (!info->irq_delay_min || delta < info->irq_delay_min))
		info->irq_delay_min = delta;

	raw_spin_unlock_irqrestore(&info->lock, flags);
}
|
|
|