
rcu: Document concurrent quiescent state reporting for offline CPUs

The synchronization of CPU offlining with GP initialization is confusing,
to put it mildly (rightfully so, as the issue it deals with is complex).
Recent discussions brought up a question: what prevents
rcu_implicit_dyntick_qs() from warning about QS reports for offline
CPUs? (Missing QS reports for offline CPUs would cause indefinite hangs.)

QS reporting for now-offline CPUs should only happen from:
- gp_init()
- rcutree_report_cpu_dead()

Add some documentation on this and refer to it from comments in the code
explaining how QS reporting is not missed when these functions are
concurrently running.

I referred heavily to this post [1] about the need for the ofl_lock.
[1] https://lore.kernel.org/all/20180924164443.GF4222@linux.ibm.com/

[ Applied paulmck feedback on moving documentation to Requirements.rst ]

Link: https://lore.kernel.org/all/01b4d228-9416-43f8-a62e-124b92e8741a@paulmck-laptop/
Co-developed-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.upadhyay@kernel.org>
Joel Fernandes 2025-07-15 16:01:55 -04:00, committed by Neeraj Upadhyay
parent 186779c036
commit 5d71c2b53f
2 changed files with 105 additions and 1 deletion

Documentation/RCU/Design/Requirements/Requirements.rst

@@ -2011,6 +2011,93 @@ after the CPU hotplug scanning.
By incrementing gp_seq first, CPU1's RCU read-side critical section
is guaranteed to not be missed by CPU2.
**Concurrent Quiescent State Reporting for Offline CPUs**

RCU must ensure that CPUs going offline report quiescent states to avoid
blocking grace periods. This requires careful synchronization to handle
race conditions.

**Race condition causing Offline CPU to hang GP**

A race between CPU offlining and new GP initialization (gp_init) may occur
because `rcu_report_qs_rnp()` in `rcutree_report_cpu_dead()` must temporarily
release the `rcu_node` lock to wake the RCU grace-period kthread:

.. code-block:: none

   CPU1 (going offline)          CPU0 (GP kthread)
   --------------------          -----------------
   rcutree_report_cpu_dead()
     rcu_report_qs_rnp()
       // Must release rnp->lock to wake GP kthread
       raw_spin_unlock_irqrestore_rcu_node()
                                  // Wakes up and starts new GP
                                  rcu_gp_init()
                                    // First loop:
                                    rnp->qsmaskinit = rnp->qsmaskinitnext
                                    // CPU1 still in qsmaskinitnext!

                                    // Second loop:
                                    rnp->qsmask = rnp->qsmaskinit
                                    mask = rnp->qsmask & ~rnp->qsmaskinitnext
                                    // mask is 0! CPU1 still in both masks
       // Reacquire lock (but too late)
     rnp->qsmaskinitnext &= ~mask  // Finally clears bit

Without `ofl_lock`, the new grace period includes the offline CPU and waits
forever for its quiescent state, causing a GP hang.
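
The failure reduces to simple mask arithmetic. As a userspace-only
illustration (the variable names merely mirror the `rcu_node` fields;
none of this is kernel code), the racy ordering leaves the offline CPU's
bit set in `qsmask` with nobody left to clear it:

.. code-block:: c

   #include <stdio.h>

   int main(void)
   {
           unsigned long qsmaskinitnext = 0x3;     /* CPU0 and CPU1 online */
           unsigned long qsmaskinit, qsmask, mask;

           /* Racy ordering: both rcu_gp_init() loops run before
            * rcutree_report_cpu_dead() clears CPU1's bit. */
           qsmaskinit = qsmaskinitnext;            /* first loop: snapshot */
           qsmask = qsmaskinit;                    /* second loop: apply */
           mask = qsmask & ~qsmaskinitnext;
           printf("mask = %#lx\n", mask);          /* 0: CPU1 not detected */

           qsmaskinitnext &= ~0x2UL;               /* CPU1 clears its bit, too late */
           /* qsmask still has CPU1's bit; nobody reports its QS, so the
            * grace period waits forever. */
           printf("stuck bit = %#lx\n", qsmask & 0x2UL);
           return 0;
   }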
**A solution with ofl_lock**

The `ofl_lock` (offline lock) prevents `rcu_gp_init()` from running during
the vulnerable window when `rcu_report_qs_rnp()` has released `rnp->lock`:

.. code-block:: none

   CPU0 (rcu_gp_init)                CPU1 (rcutree_report_cpu_dead)
   ------------------                ------------------------------
   rcu_for_each_leaf_node(rnp) {
     arch_spin_lock(&ofl_lock) -----> arch_spin_lock(&ofl_lock) [BLOCKED]
     // Safe: CPU1 can't interfere
     rnp->qsmaskinit = rnp->qsmaskinitnext
     arch_spin_unlock(&ofl_lock) ---> // Now CPU1 can proceed
   } // But snapshot already taken
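
In code, the first loop of `rcu_gp_init()` takes this snapshot under both
locks. A condensed sketch (adapted from the `rcu_gp_init()` hunk later in
this commit; the skip-if-unchanged fast path, blocked-tasks handling, and
tracing are omitted):

.. code-block:: c

   /* First loop of rcu_gp_init(), condensed. */
   rcu_for_each_leaf_node(rnp) {
           local_irq_disable();
           /* Serialize with the CPU-offline path. */
           arch_spin_lock(&rcu_state.ofl_lock);
           raw_spin_lock_rcu_node(rnp);
           /* Atomic snapshot of the online-CPU mask. */
           rnp->qsmaskinit = rnp->qsmaskinitnext;
           raw_spin_unlock_rcu_node(rnp);
           arch_spin_unlock(&rcu_state.ofl_lock);
           local_irq_enable();
   }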
**Another race causing GP hangs in rcu_gp_init(): Reporting QS for Now-offline CPUs**

After the first loop takes an atomic snapshot of online CPUs, as shown above,
the second loop in `rcu_gp_init()` detects CPUs that went offline between
releasing `ofl_lock` and acquiring the per-node `rnp->lock`. This detection is
crucial because:

1. The CPU might have gone offline after the snapshot but before the second loop.
2. The offline CPU cannot report its own QS if it is already dead.
3. Without this detection, the grace period would wait forever for CPUs that
   are now offline.

The second loop performs this detection safely:

.. code-block:: none

   rcu_for_each_node_breadth_first(rnp) {
       raw_spin_lock_irqsave_rcu_node(rnp, flags);
       rnp->qsmask = rnp->qsmaskinit;  // Apply the snapshot
       // Detect CPUs offline after snapshot
       mask = rnp->qsmask & ~rnp->qsmaskinitnext;
       if (mask && rcu_is_leaf_node(rnp))
           rcu_report_qs_rnp(mask, ...)  // Report QS for offline CPUs
   }

This approach ensures atomicity: quiescent state reporting for offline CPUs
happens either in `rcu_gp_init()` (second loop) or in
`rcutree_report_cpu_dead()`, never both and never neither. The `rnp->lock`
held throughout the sequence prevents races: `rcutree_report_cpu_dead()` also
acquires this lock when clearing `qsmaskinitnext`, ensuring mutual exclusion.
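
The offline side is the matching half of that either/or. A condensed
sketch (adapted from the `rcutree_report_cpu_dead()` hunk later in this
commit; other bookkeeping omitted):

.. code-block:: c

   /* rcutree_report_cpu_dead(), condensed. */
   mask = rdp->grpmask;
   arch_spin_lock(&rcu_state.ofl_lock);
   raw_spin_lock_irqsave_rcu_node(rnp, flags);
   if (rnp->qsmask & mask) {
           /* The current GP is waiting on this CPU: report its QS now.
            * This releases rnp->lock, hence the reacquire below. */
           rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
           raw_spin_lock_irqsave_rcu_node(rnp, flags);
   }
   /* Clear the bit so future GPs exclude this CPU. */
   WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
   raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   arch_spin_unlock(&rcu_state.ofl_lock);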
Scheduler and RCU
~~~~~~~~~~~~~~~~~

kernel/rcu/tree.c

@@ -1886,6 +1886,10 @@ static noinline_for_stack bool rcu_gp_init(void)
        /* Exclude CPU hotplug operations. */
        rcu_for_each_leaf_node(rnp) {
                local_irq_disable();
                /*
                 * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
                 * Concurrent Quiescent State Reporting for Offline CPUs.
                 */
                arch_spin_lock(&rcu_state.ofl_lock);
                raw_spin_lock_rcu_node(rnp);
                if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1960,7 +1964,12 @@ static noinline_for_stack bool rcu_gp_init(void)
                trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
                                            rnp->level, rnp->grplo,
                                            rnp->grphi, rnp->qsmask);
                /*
                 * Quiescent states for tasks on any now-offline CPUs. Since we
                 * released the ofl and rnp lock before this loop, CPUs might
                 * have gone offline and we have to report QS on their behalf.
                 * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
                 */
                mask = rnp->qsmask & ~rnp->qsmaskinitnext;
                rnp->rcu_gp_init_mask = mask;
                if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -4386,6 +4395,13 @@ void rcutree_report_cpu_dead(void)
        /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
        mask = rdp->grpmask;
        /*
         * Hold the ofl_lock and rnp lock to avoid races between CPU going
         * offline and doing a QS report (as below), versus rcu_gp_init().
         * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
         * for more details.
         */
        arch_spin_lock(&rcu_state.ofl_lock);
        raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
        rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4396,6 +4412,7 @@ void rcutree_report_cpu_dead(void)
                rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
        }
        /* Clear from ->qsmaskinitnext to mark offline. */
        WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        arch_spin_unlock(&rcu_state.ofl_lock);