Merge tag 'sched_ext-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix data races flagged by KCSAN: add missing READ_ONCE()/WRITE_ONCE()
   annotations for lock-free accesses to module parameters and dsq->seq

 - Fix silent truncation of the upper 32 bits of the enqueue flags
   (SCX_ENQ_PREEMPT and above) when passed through the int sched_class
   interface

 - Documentation updates: scheduling class precedence, task ownership
   state machine, example scheduler descriptions, config list cleanup

 - Selftest fix for format specifier and buffer length in
   file_write_long()

* tag 'sched_ext-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Use WRITE_ONCE() for the write side of scx_enable helper pointer
  sched_ext: Fix enqueue_task_scx() truncation of upper enqueue flags
  sched_ext: Documentation: Update sched-ext.rst
  sched_ext: Use READ_ONCE() for scx_slice_bypass_us in scx_bypass()
  sched_ext: Documentation: Mention scheduling class precedence
  sched_ext: Document task ownership state machine
  sched_ext: Use READ_ONCE() for lock-free reads of module param variables
  sched_ext/selftests: Fix format specifier and buffer length in file_write_long()
  sched_ext: Use WRITE_ONCE() for the write side of dsq->seq update
This commit is contained in:
Linus Torvalds
2026-03-13 14:54:56 -07:00
4 changed files with 138 additions and 32 deletions

View File

@@ -43,7 +43,6 @@ options should be enabled to use sched_ext:
CONFIG_DEBUG_INFO_BTF=y
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_BPF_JIT_DEFAULT_ON=y
CONFIG_PAHOLE_HAS_BTF_TAG=y
sched_ext is used only when the BPF scheduler is loaded and running.
@@ -58,7 +57,8 @@ in ``ops->flags``, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE``, and
However, when the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is
set in ``ops->flags``, only tasks with the ``SCHED_EXT`` policy are scheduled
by sched_ext, while tasks with ``SCHED_NORMAL``, ``SCHED_BATCH`` and
``SCHED_IDLE`` policies are scheduled by the fair-class scheduler.
``SCHED_IDLE`` policies are scheduled by the fair-class scheduler, which has
higher sched_class precedence than ``SCHED_EXT``.
Terminating the sched_ext scheduler program, triggering `SysRq-S`, or
detection of any internal error including stalled runnable tasks aborts the
@@ -345,6 +345,8 @@ Where to Look
The functions prefixed with ``scx_bpf_`` can be called from the BPF
scheduler.
* ``kernel/sched/ext_idle.c`` contains the built-in idle CPU selection policy.
* ``tools/sched_ext/`` hosts example BPF scheduler implementations.
* ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a
@@ -353,13 +355,35 @@ Where to Look
* ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.
* ``scx_central[.bpf].c``: A central FIFO scheduler where all scheduling
decisions are made on one CPU, demonstrating ``LOCAL_ON`` dispatching,
tickless operation, and kthread preemption.
* ``scx_cpu0[.bpf].c``: A scheduler that queues all tasks to a shared DSQ
and only dispatches them on CPU0 in FIFO order. Useful for testing bypass
behavior.
* ``scx_flatcg[.bpf].c``: A flattened cgroup hierarchy scheduler
implementing hierarchical weight-based cgroup CPU control by compounding
each cgroup's share at every level into a single flat scheduling layer.
* ``scx_pair[.bpf].c``: A core-scheduling example that always makes
sibling CPU pairs execute tasks from the same CPU cgroup.
* ``scx_sdt[.bpf].c``: A variation of ``scx_simple`` demonstrating BPF
arena memory management for per-task data.
* ``scx_userland[.bpf].c``: A minimal scheduler demonstrating user space
scheduling. Tasks with CPU affinity are direct-dispatched in FIFO order;
all others are scheduled in user space by a simple vruntime scheduler.
ABI Instability
===============
The APIs provided by sched_ext to BPF scheduler programs have no stability
guarantees. This includes the ops table callbacks and constants defined in
``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
``kernel/sched/ext.c``.
``kernel/sched/ext.c`` and ``kernel/sched/ext_idle.c``.
While we will attempt to provide a relatively stable API surface when
possible, they are subject to change without warning between kernel

View File

@@ -1103,7 +1103,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
dsq->seq++;
WRITE_ONCE(dsq->seq, dsq->seq + 1);
p->scx.dsq_seq = dsq->seq;
dsq_mod_nr(dsq, 1);
@@ -1470,16 +1470,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
if (enq_flags & ENQUEUE_WAKEUP)
rq->scx.flags |= SCX_RQ_IN_WAKEUP;
enq_flags |= rq->scx.extra_enq_flags;
if (sticky_cpu >= 0)
p->scx.sticky_cpu = -1;
@@ -3908,8 +3907,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
* consider offloading iff the total queued duration is over the
* threshold.
*/
min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
return 0;
raw_spin_rq_lock_irq(rq);
@@ -4137,7 +4136,7 @@ static void scx_bypass(bool bypass)
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -5259,13 +5258,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!READ_ONCE(helper)) {
mutex_lock(&helper_mutex);
if (!helper) {
helper = kthread_run_worker(0, "scx_enable_helper");
if (IS_ERR_OR_NULL(helper)) {
helper = NULL;
struct kthread_worker *w =
kthread_run_worker(0, "scx_enable_helper");
if (IS_ERR_OR_NULL(w)) {
mutex_unlock(&helper_mutex);
return -ENOMEM;
}
sched_set_fifo(helper->task);
sched_set_fifo(w->task);
WRITE_ONCE(helper, w);
}
mutex_unlock(&helper_mutex);
}

View File

@@ -1035,26 +1035,108 @@ static const char *scx_enable_state_str[] = {
};
/*
* sched_ext_entity->ops_state
* Task Ownership State Machine (sched_ext_entity->ops_state)
*
* Used to track the task ownership between the SCX core and the BPF scheduler.
* State transitions look as follows:
* The sched_ext core uses this state machine to track task ownership
* between the SCX core and the BPF scheduler. This allows the BPF
* scheduler to dispatch tasks without strict ordering requirements, while
* the SCX core safely rejects invalid dispatches.
*
* NONE -> QUEUEING -> QUEUED -> DISPATCHING
* ^ | |
* | v v
* \-------------------------------/
* State Transitions
*
* QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
* sites for explanations on the conditions being waited upon and why they are
* safe. Transitions out of them into NONE or QUEUED must store_release and the
* waiters should load_acquire.
* .------------> NONE (owned by SCX core)
* | | ^
* | enqueue | | direct dispatch
* | v |
* | QUEUEING -------'
* | |
* | enqueue |
* | completes |
* | v
* | QUEUED (owned by BPF scheduler)
* | |
* | dispatch |
* | |
* | v
* | DISPATCHING
* | |
* | dispatch |
* | completes |
* `---------------'
*
* Tracking scx_ops_state enables sched_ext core to reliably determine whether
* any given task can be dispatched by the BPF scheduler at all times and thus
* relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
* to try to dispatch any task anytime regardless of its state as the SCX core
* can safely reject invalid dispatches.
* State Descriptions
*
* - %SCX_OPSS_NONE:
* Task is owned by the SCX core. It's either on a run queue, running,
* or being manipulated by the core scheduler. The BPF scheduler has no
* claim on this task.
*
* - %SCX_OPSS_QUEUEING:
* Transitional state while transferring a task from the SCX core to
* the BPF scheduler. The task's rq lock is held during this state.
* Since QUEUEING is both entered and exited under the rq lock, dequeue
* can never observe this state (it would be a BUG). When finishing a
* dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
* path busy-waits for it to leave this state (via wait_ops_state())
* before retrying.
*
* - %SCX_OPSS_QUEUED:
* Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
* and the BPF scheduler is responsible for dispatching it. A QSEQ
* (queue sequence number) is embedded in this state to detect
* dispatch/dequeue races: if a task is dequeued and re-enqueued, the
* QSEQ changes and any in-flight dispatch operations targeting the old
* QSEQ are safely ignored.
*
* - %SCX_OPSS_DISPATCHING:
* Transitional state while transferring a task from the BPF scheduler
* back to the SCX core. This state indicates the BPF scheduler has
* selected the task for execution. When dequeue needs to take the task
* off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
* busy-waits for it to leave this state (via wait_ops_state()) before
* proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
*
* Memory Ordering
*
* Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
* %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
* and waiters must use atomic_long_read_acquire(). This ensures proper
* synchronization between concurrent operations.
*
* Cross-CPU Task Migration
*
* When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
* grab the target CPU's rq lock because a concurrent dequeue might be
* waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
* (deadlock).
*
* The sched_ext core uses a "lock dancing" protocol coordinated by
* p->scx.holding_cpu. When moving a task to a different rq:
*
* 1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
* 2. Set p->scx.holding_cpu to the current CPU
* 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
* is set, so clearing DISPATCHING first prevents the circular wait
* (safe to lock the rq we need)
* 4. Unlock the current CPU's rq
* 5. Lock src_rq (where the task currently lives)
* 6. Verify p->scx.holding_cpu == current CPU. If not, dequeue won the
* race (dequeue clears holding_cpu to -1 when it takes the task) and
* the migration is aborted.
* 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
* into dst_rq's local DSQ (no lock swap needed)
* 8. Otherwise: call move_remote_task_to_local_dsq(), which releases
* src_rq, locks dst_rq, and performs the deactivate/activate
* migration cycle (dst_rq is held on return)
* 9. Unlock dst_rq and re-lock the current CPU's rq to restore
* the lock state expected by the caller
*
* If any verification fails, abort the migration.
*
* This state tracking allows the BPF scheduler to try to dispatch any task
* at any time regardless of its state. The SCX core can safely
* reject/ignore invalid dispatches, simplifying the BPF scheduler
* implementation.
*/
enum scx_ops_state {
SCX_OPSS_NONE, /* owned by the SCX core */

View File

@@ -60,11 +60,11 @@ int file_write_long(const char *path, long val)
char buf[64];
int ret;
ret = sprintf(buf, "%lu", val);
ret = sprintf(buf, "%ld", val);
if (ret < 0)
return ret;
if (write_text(path, buf, sizeof(buf)) <= 0)
if (write_text(path, buf, ret) <= 0)
return -1;
return 0;