mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-21 23:16:50 +08:00
Merge tag 'sched_ext-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo:

 - Fix data races flagged by KCSAN: add missing READ_ONCE()/WRITE_ONCE()
   annotations for lock-free accesses to module parameters and dsq->seq

 - Fix silent truncation of upper 32 enqueue flags (SCX_ENQ_PREEMPT and
   above) when passed through the int sched_class interface

 - Documentation updates: scheduling class precedence, task ownership
   state machine, example scheduler descriptions, config list cleanup

 - Selftest fix for format specifier and buffer length in
   file_write_long()

* tag 'sched_ext-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Use WRITE_ONCE() for the write side of scx_enable helper pointer
  sched_ext: Fix enqueue_task_scx() truncation of upper enqueue flags
  sched_ext: Documentation: Update sched-ext.rst
  sched_ext: Use READ_ONCE() for scx_slice_bypass_us in scx_bypass()
  sched_ext: Documentation: Mention scheduling class precedence
  sched_ext: Document task ownership state machine
  sched_ext: Use READ_ONCE() for lock-free reads of module param variables
  sched_ext/selftests: Fix format specifier and buffer length in file_write_long()
  sched_ext: Use WRITE_ONCE() for the write side of dsq->seq update
This commit is contained in:
@@ -43,7 +43,6 @@ options should be enabled to use sched_ext:
|
||||
CONFIG_DEBUG_INFO_BTF=y
|
||||
CONFIG_BPF_JIT_ALWAYS_ON=y
|
||||
CONFIG_BPF_JIT_DEFAULT_ON=y
|
||||
CONFIG_PAHOLE_HAS_BTF_TAG=y
|
||||
|
||||
sched_ext is used only when the BPF scheduler is loaded and running.
|
||||
|
||||
@@ -58,7 +57,8 @@ in ``ops->flags``, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE``, and
|
||||
However, when the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is
|
||||
set in ``ops->flags``, only tasks with the ``SCHED_EXT`` policy are scheduled
|
||||
by sched_ext, while tasks with ``SCHED_NORMAL``, ``SCHED_BATCH`` and
|
||||
``SCHED_IDLE`` policies are scheduled by the fair-class scheduler.
|
||||
``SCHED_IDLE`` policies are scheduled by the fair-class scheduler, which has
|
||||
higher sched_class precedence than ``SCHED_EXT``.
|
||||
|
||||
Terminating the sched_ext scheduler program, triggering `SysRq-S`, or
|
||||
detection of any internal error including stalled runnable tasks aborts the
|
||||
@@ -345,6 +345,8 @@ Where to Look
|
||||
The functions prefixed with ``scx_bpf_`` can be called from the BPF
|
||||
scheduler.
|
||||
|
||||
* ``kernel/sched/ext_idle.c`` contains the built-in idle CPU selection policy.
|
||||
|
||||
* ``tools/sched_ext/`` hosts example BPF scheduler implementations.
|
||||
|
||||
* ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a
|
||||
@@ -353,13 +355,35 @@ Where to Look
|
||||
* ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
|
||||
levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.
|
||||
|
||||
* ``scx_central[.bpf].c``: A central FIFO scheduler where all scheduling
|
||||
decisions are made on one CPU, demonstrating ``LOCAL_ON`` dispatching,
|
||||
tickless operation, and kthread preemption.
|
||||
|
||||
* ``scx_cpu0[.bpf].c``: A scheduler that queues all tasks to a shared DSQ
|
||||
and only dispatches them on CPU0 in FIFO order. Useful for testing bypass
|
||||
behavior.
|
||||
|
||||
* ``scx_flatcg[.bpf].c``: A flattened cgroup hierarchy scheduler
|
||||
implementing hierarchical weight-based cgroup CPU control by compounding
|
||||
each cgroup's share at every level into a single flat scheduling layer.
|
||||
|
||||
* ``scx_pair[.bpf].c``: A core-scheduling example that always makes
|
||||
sibling CPU pairs execute tasks from the same CPU cgroup.
|
||||
|
||||
* ``scx_sdt[.bpf].c``: A variation of ``scx_simple`` demonstrating BPF
|
||||
arena memory management for per-task data.
|
||||
|
||||
* ``scx_userland[.bpf].c``: A minimal scheduler demonstrating user space
|
||||
scheduling. Tasks with CPU affinity are direct-dispatched in FIFO order;
|
||||
all others are scheduled in user space by a simple vruntime scheduler.
|
||||
|
||||
ABI Instability
|
||||
===============
|
||||
|
||||
The APIs provided by sched_ext to BPF scheduler programs have no stability
|
||||
guarantees. This includes the ops table callbacks and constants defined in
|
||||
``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
|
||||
``kernel/sched/ext.c``.
|
||||
``kernel/sched/ext.c`` and ``kernel/sched/ext_idle.c``.
|
||||
|
||||
While we will attempt to provide a relatively stable API surface when
|
||||
possible, they are subject to change without warning between kernel
|
||||
|
||||
@@ -1103,7 +1103,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
|
||||
}
|
||||
|
||||
/* seq records the order tasks are queued, used by BPF DSQ iterator */
|
||||
dsq->seq++;
|
||||
WRITE_ONCE(dsq->seq, dsq->seq + 1);
|
||||
p->scx.dsq_seq = dsq->seq;
|
||||
|
||||
dsq_mod_nr(dsq, 1);
|
||||
@@ -1470,16 +1470,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
|
||||
p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
|
||||
}
|
||||
|
||||
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
|
||||
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
|
||||
{
|
||||
struct scx_sched *sch = scx_root;
|
||||
int sticky_cpu = p->scx.sticky_cpu;
|
||||
u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
|
||||
|
||||
if (enq_flags & ENQUEUE_WAKEUP)
|
||||
rq->scx.flags |= SCX_RQ_IN_WAKEUP;
|
||||
|
||||
enq_flags |= rq->scx.extra_enq_flags;
|
||||
|
||||
if (sticky_cpu >= 0)
|
||||
p->scx.sticky_cpu = -1;
|
||||
|
||||
@@ -3908,8 +3907,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
|
||||
* consider offloading iff the total queued duration is over the
|
||||
* threshold.
|
||||
*/
|
||||
min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
|
||||
if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
|
||||
min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
|
||||
if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
|
||||
return 0;
|
||||
|
||||
raw_spin_rq_lock_irq(rq);
|
||||
@@ -4137,7 +4136,7 @@ static void scx_bypass(bool bypass)
|
||||
WARN_ON_ONCE(scx_bypass_depth <= 0);
|
||||
if (scx_bypass_depth != 1)
|
||||
goto unlock;
|
||||
WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
|
||||
WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
|
||||
bypass_timestamp = ktime_get_ns();
|
||||
if (sch)
|
||||
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
|
||||
@@ -5259,13 +5258,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
||||
if (!READ_ONCE(helper)) {
|
||||
mutex_lock(&helper_mutex);
|
||||
if (!helper) {
|
||||
helper = kthread_run_worker(0, "scx_enable_helper");
|
||||
if (IS_ERR_OR_NULL(helper)) {
|
||||
helper = NULL;
|
||||
struct kthread_worker *w =
|
||||
kthread_run_worker(0, "scx_enable_helper");
|
||||
if (IS_ERR_OR_NULL(w)) {
|
||||
mutex_unlock(&helper_mutex);
|
||||
return -ENOMEM;
|
||||
}
|
||||
sched_set_fifo(helper->task);
|
||||
sched_set_fifo(w->task);
|
||||
WRITE_ONCE(helper, w);
|
||||
}
|
||||
mutex_unlock(&helper_mutex);
|
||||
}
|
||||
|
||||
@@ -1035,26 +1035,108 @@ static const char *scx_enable_state_str[] = {
|
||||
};
|
||||
|
||||
/*
|
||||
* sched_ext_entity->ops_state
|
||||
* Task Ownership State Machine (sched_ext_entity->ops_state)
|
||||
*
|
||||
* Used to track the task ownership between the SCX core and the BPF scheduler.
|
||||
* State transitions look as follows:
|
||||
* The sched_ext core uses this state machine to track task ownership
|
||||
* between the SCX core and the BPF scheduler. This allows the BPF
|
||||
* scheduler to dispatch tasks without strict ordering requirements, while
|
||||
* the SCX core safely rejects invalid dispatches.
|
||||
*
|
||||
* NONE -> QUEUEING -> QUEUED -> DISPATCHING
|
||||
* ^ | |
|
||||
* | v v
|
||||
* \-------------------------------/
|
||||
* State Transitions
|
||||
*
|
||||
* QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
|
||||
* sites for explanations on the conditions being waited upon and why they are
|
||||
* safe. Transitions out of them into NONE or QUEUED must store_release and the
|
||||
* waiters should load_acquire.
|
||||
* .------------> NONE (owned by SCX core)
|
||||
* | | ^
|
||||
* | enqueue | | direct dispatch
|
||||
* | v |
|
||||
* | QUEUEING -------'
|
||||
* | |
|
||||
* | enqueue |
|
||||
* | completes |
|
||||
* | v
|
||||
* | QUEUED (owned by BPF scheduler)
|
||||
* | |
|
||||
* | dispatch |
|
||||
* | |
|
||||
* | v
|
||||
* | DISPATCHING
|
||||
* | |
|
||||
* | dispatch |
|
||||
* | completes |
|
||||
* `---------------'
|
||||
*
|
||||
* Tracking scx_ops_state enables sched_ext core to reliably determine whether
|
||||
* any given task can be dispatched by the BPF scheduler at all times and thus
|
||||
* relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
|
||||
* to try to dispatch any task anytime regardless of its state as the SCX core
|
||||
* can safely reject invalid dispatches.
|
||||
* State Descriptions
|
||||
*
|
||||
* - %SCX_OPSS_NONE:
|
||||
* Task is owned by the SCX core. It's either on a run queue, running,
|
||||
* or being manipulated by the core scheduler. The BPF scheduler has no
|
||||
* claim on this task.
|
||||
*
|
||||
* - %SCX_OPSS_QUEUEING:
|
||||
* Transitional state while transferring a task from the SCX core to
|
||||
* the BPF scheduler. The task's rq lock is held during this state.
|
||||
* Since QUEUEING is both entered and exited under the rq lock, dequeue
|
||||
* can never observe this state (it would be a BUG). When finishing a
|
||||
* dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
|
||||
* path busy-waits for it to leave this state (via wait_ops_state())
|
||||
* before retrying.
|
||||
*
|
||||
* - %SCX_OPSS_QUEUED:
|
||||
* Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
|
||||
* and the BPF scheduler is responsible for dispatching it. A QSEQ
|
||||
* (queue sequence number) is embedded in this state to detect
|
||||
* dispatch/dequeue races: if a task is dequeued and re-enqueued, the
|
||||
* QSEQ changes and any in-flight dispatch operations targeting the old
|
||||
* QSEQ are safely ignored.
|
||||
*
|
||||
* - %SCX_OPSS_DISPATCHING:
|
||||
* Transitional state while transferring a task from the BPF scheduler
|
||||
* back to the SCX core. This state indicates the BPF scheduler has
|
||||
* selected the task for execution. When dequeue needs to take the task
|
||||
* off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
|
||||
* busy-waits for it to leave this state (via wait_ops_state()) before
|
||||
* proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
|
||||
*
|
||||
* Memory Ordering
|
||||
*
|
||||
* Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
|
||||
* %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
|
||||
* and waiters must use atomic_long_read_acquire(). This ensures proper
|
||||
* synchronization between concurrent operations.
|
||||
*
|
||||
* Cross-CPU Task Migration
|
||||
*
|
||||
* When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
|
||||
* grab the target CPU's rq lock because a concurrent dequeue might be
|
||||
* waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
|
||||
* (deadlock).
|
||||
*
|
||||
* The sched_ext core uses a "lock dancing" protocol coordinated by
|
||||
* p->scx.holding_cpu. When moving a task to a different rq:
|
||||
*
|
||||
* 1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
|
||||
* 2. Set p->scx.holding_cpu to the current CPU
|
||||
* 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
|
||||
* is set, so clearing DISPATCHING first prevents the circular wait
|
||||
* (safe to lock the rq we need)
|
||||
* 4. Unlock the current CPU's rq
|
||||
* 5. Lock src_rq (where the task currently lives)
|
||||
* 6. Verify p->scx.holding_cpu == current CPU; if not, dequeue won the
|
||||
* race (dequeue clears holding_cpu to -1 when it takes the task); in
|
||||
* this case migration is aborted
|
||||
* 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
|
||||
* into dst_rq's local DSQ (no lock swap needed)
|
||||
* 8. Otherwise: call move_remote_task_to_local_dsq(), which releases
|
||||
* src_rq, locks dst_rq, and performs the deactivate/activate
|
||||
* migration cycle (dst_rq is held on return)
|
||||
* 9. Unlock dst_rq and re-lock the current CPU's rq to restore
|
||||
* the lock state expected by the caller
|
||||
*
|
||||
* If any verification fails, abort the migration.
|
||||
*
|
||||
* This state tracking allows the BPF scheduler to try to dispatch any task
|
||||
* at any time regardless of its state. The SCX core can safely
|
||||
* reject/ignore invalid dispatches, simplifying the BPF scheduler
|
||||
* implementation.
|
||||
*/
|
||||
enum scx_ops_state {
|
||||
SCX_OPSS_NONE, /* owned by the SCX core */
|
||||
|
||||
@@ -60,11 +60,11 @@ int file_write_long(const char *path, long val)
|
||||
char buf[64];
|
||||
int ret;
|
||||
|
||||
ret = sprintf(buf, "%lu", val);
|
||||
ret = sprintf(buf, "%ld", val);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (write_text(path, buf, sizeof(buf)) <= 0)
|
||||
if (write_text(path, buf, ret) <= 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
|
||||
Reference in New Issue
Block a user