Merge tag 'wq-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue fixes from Tejun Heo:

 - Improve workqueue stall diagnostics: dump all busy workers (not just
   running ones), show wall-clock duration of in-flight work items, and
   add a sample module for reproducing stalls

 - Fix POOL_BH vs WQ_BH flag namespace mismatch in pr_cont_worker_id()

 - Rename pool->watchdog_ts to pool->last_progress_ts and related
   functions for clarity

* tag 'wq-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: Rename show_cpu_pool{s,}_hog{s,}() to reflect broadened scope
  workqueue: Add stall detector sample module
  workqueue: Show all busy workers in stall diagnostics
  workqueue: Show in-flight work item duration in stall diagnostics
  workqueue: Rename pool->watchdog_ts to pool->last_progress_ts
  workqueue: Use POOL_BH instead of WQ_BH when checking pool flags
This commit is contained in:
Linus Torvalds
2026-03-13 15:11:05 -07:00
4 changed files with 128 additions and 27 deletions

View File

@@ -190,7 +190,7 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* L: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
unsigned long last_progress_ts; /* L: last forward progress timestamp */
bool cpu_stall; /* WD: stalled cpu bound pool */
/*
@@ -1697,7 +1697,7 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
pwq->pool->last_progress_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}
@@ -2348,7 +2348,7 @@ retry:
*/
if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
if (list_empty(&pool->worklist))
pool->watchdog_ts = jiffies;
pool->last_progress_ts = jiffies;
trace_workqueue_activate_work(work);
insert_work(pwq, work, &pool->worklist, work_flags);
@@ -3204,6 +3204,7 @@ __acquires(&pool->lock)
worker->current_pwq = pwq;
if (worker->task)
worker->current_at = worker->task->se.sum_exec_runtime;
worker->current_start = jiffies;
work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data);
@@ -3352,7 +3353,7 @@ static void process_scheduled_works(struct worker *worker)
while ((work = list_first_entry_or_null(&worker->scheduled,
struct work_struct, entry))) {
if (first) {
worker->pool->watchdog_ts = jiffies;
worker->pool->last_progress_ts = jiffies;
first = false;
}
process_one_work(worker, work);
@@ -4850,7 +4851,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
pool->watchdog_ts = jiffies;
pool->last_progress_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -6274,7 +6275,7 @@ static void pr_cont_worker_id(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
if (pool->flags & WQ_BH)
if (pool->flags & POOL_BH)
pr_cont("bh%s",
pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
else
@@ -6359,6 +6360,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_cont(" %s", comma ? "," : "");
pr_cont_worker_id(worker);
pr_cont(":%ps", worker->current_func);
pr_cont(" for %us",
jiffies_to_msecs(jiffies - worker->current_start) / 1000);
list_for_each_entry(work, &worker->scheduled, entry)
pr_cont_work(false, work, &pcws);
pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
@@ -6462,7 +6465,7 @@ static void show_one_worker_pool(struct worker_pool *pool)
/* How long the first pending work is waiting for a worker. */
if (!list_empty(&pool->worklist))
hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
hung = jiffies_to_msecs(jiffies - pool->last_progress_ts) / 1000;
/*
* Defer printing to avoid deadlocks in console drivers that
@@ -7580,11 +7583,11 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds
/*
* Show workers that might prevent the processing of pending work items.
* The only candidates are CPU-bound workers in the running state.
* Pending work items should be handled by another idle worker
* in all other situations.
* A busy worker that is not running on the CPU (e.g. sleeping in
* wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as
* effectively as a CPU-bound one, so dump every in-flight worker.
*/
static void show_cpu_pool_hog(struct worker_pool *pool)
static void show_cpu_pool_busy_workers(struct worker_pool *pool)
{
struct worker *worker;
unsigned long irq_flags;
@@ -7593,36 +7596,34 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
raw_spin_lock_irqsave(&pool->lock, irq_flags);
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
if (task_is_running(worker->task)) {
/*
* Defer printing to avoid deadlocks in console
* drivers that queue work while holding locks
* also taken in their write paths.
*/
printk_deferred_enter();
/*
* Defer printing to avoid deadlocks in console
* drivers that queue work while holding locks
* also taken in their write paths.
*/
printk_deferred_enter();
pr_info("pool %d:\n", pool->id);
sched_show_task(worker->task);
pr_info("pool %d:\n", pool->id);
sched_show_task(worker->task);
printk_deferred_exit();
}
printk_deferred_exit();
}
raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}
static void show_cpu_pools_hogs(void)
static void show_cpu_pools_busy_workers(void)
{
struct worker_pool *pool;
int pi;
pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
pr_info("Showing backtraces of busy workers in stalled worker pools:\n");
rcu_read_lock();
for_each_pool(pool, pi) {
if (pool->cpu_stall)
show_cpu_pool_hog(pool);
show_cpu_pool_busy_workers(pool);
}
@@ -7691,7 +7692,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
else
touched = READ_ONCE(wq_watchdog_touched);
pool_ts = READ_ONCE(pool->watchdog_ts);
pool_ts = READ_ONCE(pool->last_progress_ts);
if (time_after(pool_ts, touched))
ts = pool_ts;
@@ -7719,7 +7720,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
show_all_workqueues();
if (cpu_pool_stall)
show_cpu_pools_hogs();
show_cpu_pools_busy_workers();
if (lockup_detected)
panic_on_wq_watchdog(max_stall_time);

View File

@@ -32,6 +32,7 @@ struct worker {
work_func_t current_func; /* K: function */
struct pool_workqueue *current_pwq; /* K: pwq */
u64 current_at; /* K: runtime at start or last wakeup */
unsigned long current_start; /* K: start time of current work item */
unsigned int current_color; /* K: color */
int sleeping; /* S: is worker sleeping? */

View File

@@ -0,0 +1 @@
obj-m += wq_stall.o

View File

@@ -0,0 +1,98 @@
// SPDX-License-Identifier: GPL-2.0
/*
* wq_stall - Test module for the workqueue stall detector.
*
* Deliberately creates a workqueue stall so the watchdog fires and
* prints diagnostic output. Useful for verifying that the stall
* detector correctly identifies stuck workers and produces useful
* backtraces.
*
* The stall is triggered by clearing PF_WQ_WORKER before sleeping,
* which hides the worker from the concurrency manager. A second
* work item queued on the same pool then sits in the worklist with
* no worker available to process it.
*
* After ~30s the workqueue watchdog fires:
* BUG: workqueue lockup - pool cpus=N ...
*
* Build:
* make -C <kernel tree> M=samples/workqueue/stall_detector modules
*
* Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2026 Breno Leitao <leitao@debian.org>
*/
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/sched.h>
static DECLARE_WAIT_QUEUE_HEAD(stall_wq_head);
static atomic_t wake_condition = ATOMIC_INIT(0);
static struct work_struct stall_work1;
static struct work_struct stall_work2;
/*
 * Victim work item. It is queued on the same per-CPU pool as the stalling
 * work item and sits in pool->worklist with no worker available, which is
 * exactly the condition the watchdog should report. It only runs after the
 * stalled worker is woken at module unload.
 */
static void stall_work2_fn(struct work_struct *work)
{
	pr_info("wq_stall: second work item finally ran\n");
}
/*
 * Stalling work item. Queues a second item on the same pool, then sleeps
 * with PF_WQ_WORKER cleared so the concurrency manager never learns this
 * worker went to sleep and never creates a replacement. The statement
 * order below is load-bearing: queue first (while still counted running),
 * then hide, then sleep; restore the flag before returning so
 * process_one_work()'s accounting stays consistent.
 */
static void stall_work1_fn(struct work_struct *work)
{
	pr_info("wq_stall: first work item running on cpu %d\n",
		raw_smp_processor_id());
	/*
	 * Queue second item while we're still counted as running
	 * (pool->nr_running > 0). Since schedule_work() on a per-CPU
	 * workqueue targets raw_smp_processor_id(), item 2 lands on the
	 * same pool. __queue_work -> kick_pool -> need_more_worker()
	 * sees nr_running > 0 and does NOT wake a new worker.
	 */
	schedule_work(&stall_work2);
	/*
	 * Hide from the workqueue concurrency manager. Without
	 * PF_WQ_WORKER, schedule() won't call wq_worker_sleeping(),
	 * so nr_running is never decremented and no replacement
	 * worker is created. Item 2 stays stuck in pool->worklist.
	 */
	current->flags &= ~PF_WQ_WORKER;
	pr_info("wq_stall: entering wait_event_idle (PF_WQ_WORKER cleared)\n");
	pr_info("wq_stall: expect 'BUG: workqueue lockup' in ~30-60s\n");
	/* Sleeps until wq_stall_exit() sets wake_condition and wakes us. */
	wait_event_idle(stall_wq_head, atomic_read(&wake_condition) != 0);
	/* Restore so process_one_work() cleanup works correctly */
	current->flags |= PF_WQ_WORKER;
	pr_info("wq_stall: woke up, PF_WQ_WORKER restored\n");
}
/*
 * Module init: arm the stall. Only the first item is queued here; the
 * second is queued from inside stall_work1_fn() so both land on the same
 * per-CPU pool while the first worker is still counted as running.
 * Always returns 0; the stall develops asynchronously after load.
 */
static int __init wq_stall_init(void)
{
	pr_info("wq_stall: loading\n");
	/* INIT_WORK must precede any queueing of either item. */
	INIT_WORK(&stall_work1, stall_work1_fn);
	INIT_WORK(&stall_work2, stall_work2_fn);
	schedule_work(&stall_work1);
	return 0;
}
/*
 * Module exit: resolve the stall, then wait for both items to finish.
 * Ordering matters: the condition must be set and the waiter woken
 * *before* flush_work(), otherwise flush_work(&stall_work1) would block
 * forever on the still-sleeping worker.
 */
static void __exit wq_stall_exit(void)
{
	pr_info("wq_stall: unloading\n");
	atomic_set(&wake_condition, 1);
	wake_up(&stall_wq_head);
	/* Item 1 completes after waking; item 2 can then be processed. */
	flush_work(&stall_work1);
	flush_work(&stall_work2);
	pr_info("wq_stall: all work flushed, module unloaded\n");
}
module_init(wq_stall_init);
module_exit(wq_stall_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Reproduce workqueue stall caused by PF_WQ_WORKER misuse");
MODULE_AUTHOR("Breno Leitao <leitao@debian.org>");