From 3ca59da7aa5c7f569b04a511dc8670861d58b509 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:16 +0100 Subject: [PATCH 01/54] rseq: Avoid pointless evaluation in __rseq_notify_resume() The RSEQ critical section mechanism only clears the event mask when a critical section is registered, otherwise it is stale and collects bits. That means once a critical section is installed the first invocation of that code when TIF_NOTIFY_RESUME is set will abort the critical section, even when the TIF bit was not raised by the rseq preempt/migrate/signal helpers. This also has a performance implication because TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is utilized by quite some infrastructure. That means every invocation of __rseq_notify_resume() goes unconditionally through the heavy lifting of user space access and consistency checks even if there is no reason to do so. Keeping the stale event mask around when exiting to user space also prevents it from being utilized by the upcoming time slice extension mechanism. Avoid this by reading and clearing the event mask before doing the user space critical section access with interrupts or preemption disabled, which ensures that the read and clear operation is CPU local atomic versus scheduling and the membarrier IPI. This is correct as after re-enabling interrupts/preemption any relevant event will set the bit again and raise TIF_NOTIFY_RESUME, which makes the user space exit code take another round of TIF bit clearing. If the event mask was non-zero, invoke the slow path. On debug kernels the slow path is invoked unconditionally and the result of the event mask evaluation is handed in. Add a exit path check after the TIF bit loop, which validates on debug kernels that the event mask is zero before exiting to user space. While at it reword the convoluted comment why the pt_regs pointer can be NULL under certain circumstances. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.022571576@linutronix.de --- include/linux/irq-entry-common.h | 7 +++- include/linux/rseq.h | 10 ++++- kernel/rseq.c | 66 +++++++++++++++++++++----------- 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d643c7c87822..e5941df13901 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -2,11 +2,12 @@ #ifndef __LINUX_IRQENTRYCOMMON_H #define __LINUX_IRQENTRYCOMMON_H +#include +#include +#include #include #include -#include #include -#include #include #include @@ -226,6 +227,8 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) arch_exit_to_user_mode_prepare(regs, ti_work); + rseq_exit_to_user_mode(); + /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 69553e7c14c1..7622b733a508 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -66,6 +66,14 @@ static inline void rseq_migrate(struct task_struct *t) rseq_set_notify_resume(t); } +static __always_inline void rseq_exit_to_user_mode(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { + if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) + current->rseq_event_mask = 0; + } +} + /* * If parent process has a registered restartable sequences area, the * child inherits. 
Unregister rseq for a clone with CLONE_VM set. @@ -118,7 +126,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_execve(struct task_struct *t) { } - +static inline void rseq_exit_to_user_mode(void) { } #endif #ifdef CONFIG_DEBUG_RSEQ diff --git a/kernel/rseq.c b/kernel/rseq.c index 2452b7366b00..246319d7cb0c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -324,9 +324,9 @@ static bool rseq_warn_flags(const char *str, u32 flags) return true; } -static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +static int rseq_check_flags(struct task_struct *t, u32 cs_flags) { - u32 flags, event_mask; + u32 flags; int ret; if (rseq_warn_flags("rseq_cs", cs_flags)) @@ -339,17 +339,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) if (rseq_warn_flags("rseq", flags)) return -EINVAL; - - /* - * Load and clear event mask atomically with respect to - * scheduler preemption and membarrier IPIs. - */ - scoped_guard(RSEQ_EVENT_GUARD) { - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - } - - return !!event_mask; + return 0; } static int clear_rseq_cs(struct rseq __user *rseq) @@ -380,7 +370,7 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } -static int rseq_ip_fixup(struct pt_regs *regs) +static int rseq_ip_fixup(struct pt_regs *regs, bool abort) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; @@ -398,9 +388,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) */ if (!in_rseq_cs(ip, &rseq_cs)) return clear_rseq_cs(t->rseq); - ret = rseq_need_restart(t, rseq_cs.flags); - if (ret <= 0) + ret = rseq_check_flags(t, rseq_cs.flags); + if (ret < 0) return ret; + if (!abort) + return 0; ret = clear_rseq_cs(t->rseq); if (ret) return ret; @@ -430,14 +422,44 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) return; /* - * regs is NULL if and only if the caller is in a syscall path. Skip - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and - * kill a misbehaving userspace on debug kernels. + * If invoked from hypervisors or IO-URING, then @regs is a NULL + * pointer, so fixup cannot be done. If the syscall which led to + * this invocation was invoked inside a critical section, then it + * will either end up in this code again or a possible violation of + * a syscall inside a critical region can only be detected by the + * debug code in rseq_syscall() in a debug enabled kernel. */ if (regs) { - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + /* + * Read and clear the event mask first. If the task was not + * preempted or migrated or a signal is on the way, there + * is no point in doing any of the heavy lifting here on + * production kernels. In that case TIF_NOTIFY_RESUME was + * raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. 
+ */ + u32 event_mask; + + scoped_guard(RSEQ_EVENT_GUARD) { + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + } + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) { + ret = rseq_ip_fixup(regs, !!event_mask); + if (unlikely(ret < 0)) + goto error; + } } if (unlikely(rseq_update_cpu_node_id(t))) goto error; From fdc0f39d289ebcf46ef44f43460207ef24c94ed7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:18 +0100 Subject: [PATCH 02/54] rseq: Condense the inline stubs Scrolling over tons of pointless { } lines to find the actual code is annoying at best. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.085971048@linutronix.de --- include/linux/rseq.h | 47 +++++++++++--------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7622b733a508..21f875af0e96 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -101,44 +101,21 @@ static inline void rseq_execve(struct task_struct *t) t->rseq_event_mask = 0; } -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, u64 clone_flags) -{ -} -static inline void rseq_execve(struct task_struct *t) -{ -} +#else /* CONFIG_RSEQ */ +static inline void rseq_set_notify_resume(struct task_struct *t) { } +static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_preempt(struct task_struct *t) { } +static inline void rseq_migrate(struct task_struct *t) { } +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } +static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } -#endif +#endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ - void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif +#else /* CONFIG_DEBUG_RSEQ */ +static inline void rseq_syscall(struct pt_regs *regs) { } +#endif /* !CONFIG_DEBUG_RSEQ */ #endif /* _LINUX_RSEQ_H */ From 77f19e4d4fc90a9364f5055a4daf8b98a76cb303 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:20 +0100 Subject: [PATCH 03/54] rseq: Move algorithm comment to top Move the comment which documents the RSEQ algorithm to the top of the file, so it does not create horrible diffs later when the actual implementation is fed into the mincer. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.149519580@linutronix.de --- kernel/rseq.c | 119 +++++++++++++++++++++++++------------------------- 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 246319d7cb0c..51fafc4528b0 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -8,6 +8,65 @@ * Mathieu Desnoyers */ +/* + * Restartable sequences are a lightweight interface that allows + * user-level code to be executed atomically relative to scheduler + * preemption and signal delivery. Typically used for implementing + * per-cpu operations. + * + * It allows user-space to perform update operations on per-cpu data + * without requiring heavy-weight atomic operations. + * + * Detailed algorithm of rseq user-space assembly sequences: + * + * init(rseq_cs) + * cpu = TLS->rseq::cpu_id_start + * [1] TLS->rseq::rseq_cs = rseq_cs + * [start_ip] ---------------------------- + * [2] if (cpu != TLS->rseq::cpu_id) + * goto abort_ip; + * [3] + * [post_commit_ip] ---------------------------- + * + * The address of jump target abort_ip must be outside the critical + * region, i.e.: + * + * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] + * + * Steps [2]-[3] (inclusive) need to be a sequence of instructions in + * userspace that can handle being interrupted between any of those + * instructions, and then resumed to the abort_ip. + * + * 1. Userspace stores the address of the struct rseq_cs assembly + * block descriptor into the rseq_cs field of the registered + * struct rseq TLS area. This update is performed through a single + * store within the inline assembly instruction sequence. + * [start_ip] + * + * 2. Userspace tests to check whether the current cpu_id field match + * the cpu number loaded before start_ip, branching to abort_ip + * in case of a mismatch. + * + * If the sequence is preempted or interrupted by a signal + * at or after start_ip and before post_commit_ip, then the kernel + * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return + * ip to abort_ip before returning to user-space, so the preempted + * execution resumes at abort_ip. + * + * 3. Userspace critical section final instruction before + * post_commit_ip is the commit. The critical section is + * self-terminating. + * [post_commit_ip] + * + * 4. + * + * On failure at [2], or if interrupted by preempt or signal delivery + * between [1] and [3]: + * + * [abort_ip] + * F1. + */ + #include #include #include @@ -98,66 +157,6 @@ static int rseq_validate_ro_fields(struct task_struct *t) unsafe_put_user(value, &t->rseq->field, error_label) #endif -/* - * - * Restartable sequences are a lightweight interface that allows - * user-level code to be executed atomically relative to scheduler - * preemption and signal delivery. Typically used for implementing - * per-cpu operations. - * - * It allows user-space to perform update operations on per-cpu data - * without requiring heavy-weight atomic operations. 
- * - * Detailed algorithm of rseq user-space assembly sequences: - * - * init(rseq_cs) - * cpu = TLS->rseq::cpu_id_start - * [1] TLS->rseq::rseq_cs = rseq_cs - * [start_ip] ---------------------------- - * [2] if (cpu != TLS->rseq::cpu_id) - * goto abort_ip; - * [3] - * [post_commit_ip] ---------------------------- - * - * The address of jump target abort_ip must be outside the critical - * region, i.e.: - * - * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] - * - * Steps [2]-[3] (inclusive) need to be a sequence of instructions in - * userspace that can handle being interrupted between any of those - * instructions, and then resumed to the abort_ip. - * - * 1. Userspace stores the address of the struct rseq_cs assembly - * block descriptor into the rseq_cs field of the registered - * struct rseq TLS area. This update is performed through a single - * store within the inline assembly instruction sequence. - * [start_ip] - * - * 2. Userspace tests to check whether the current cpu_id field match - * the cpu number loaded before start_ip, branching to abort_ip - * in case of a mismatch. - * - * If the sequence is preempted or interrupted by a signal - * at or after start_ip and before post_commit_ip, then the kernel - * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return - * ip to abort_ip before returning to user-space, so the preempted - * execution resumes at abort_ip. - * - * 3. Userspace critical section final instruction before - * post_commit_ip is the commit. The critical section is - * self-terminating. - * [post_commit_ip] - * - * 4. - * - * On failure at [2], or if interrupted by preempt or signal delivery - * between [1] and [3]: - * - * [abort_ip] - * F1. - */ - static int rseq_update_cpu_node_id(struct task_struct *t) { struct rseq __user *rseq = t->rseq; From 41b43a6ba3848be8ceec77b8b2a56ddeca6167ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:22 +0100 Subject: [PATCH 04/54] rseq: Remove the ksig argument from rseq_handle_notify_resume() There is no point for this being visible in the resume_to_user_mode() handling. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.211520245@linutronix.de --- include/linux/resume_user_mode.h | 2 +- include/linux/rseq.h | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index e0135e0adae0..dd3bf7da90a8 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(NULL, regs); + rseq_handle_notify_resume(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 21f875af0e96..d72ddf7ce903 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -37,19 +37,20 @@ static inline void rseq_set_notify_resume(struct task_struct *t) void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +static inline void rseq_handle_notify_resume(struct pt_regs *regs) { if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); + __rseq_handle_notify_resume(NULL, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - rseq_handle_notify_resume(ksig, regs); + if (current->rseq) { + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + __rseq_handle_notify_resume(ksig, regs); + } } /* rseq_preempt() requires preemption to be disabled. */ @@ -103,7 +104,7 @@ static inline void rseq_execve(struct task_struct *t) #else /* CONFIG_RSEQ */ static inline void rseq_set_notify_resume(struct task_struct *t) { } -static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } From 067b3b41b4dd5bf51d6874206f5c1f72e0684eeb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:24 +0100 Subject: [PATCH 05/54] rseq: Simplify registration There is no point to read the critical section element in the newly registered user space RSEQ struct first in order to clear it. Just clear it and be done with it. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.274661227@linutronix.de --- kernel/rseq.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 51fafc4528b0..80af48a972f0 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -492,11 +492,9 @@ void rseq_syscall(struct pt_regs *regs) /* * sys_rseq - setup restartable sequences for caller thread. 
*/ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, - int, flags, u32, sig) +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; - u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -557,11 +555,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * avoid a potential segfault on return to user-space. The proper thing * to do would have been to fail the registration but this would break * older libcs that reuse the rseq area for new threads without - * clearing the fields. + * clearing the fields. Don't bother reading it, just reset it. */ - if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) - return -EFAULT; - if (rseq_cs && clear_rseq_cs(rseq)) + if (put_user(0UL, &rseq->rseq_cs)) return -EFAULT; #ifdef CONFIG_DEBUG_RSEQ From d923739e2e356424cc566143a3323c62cd6ed067 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:26 +0100 Subject: [PATCH 06/54] rseq: Simplify the event notification Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags") the bits in task::rseq_event_mask are meaningless and just extra work in terms of setting them individually. Aside of that the only relevant point where an event has to be raised is context switch. Neither the CPU nor MM CID can change without going through a context switch. Collapse them all into a single boolean which simplifies the code a lot and remove the pointless invocations which have been sprinkled all over the place for no value. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.336978188@linutronix.de --- fs/exec.c | 2 +- include/linux/rseq.h | 66 ++++++++------------------------------- include/linux/sched.h | 10 +++--- include/uapi/linux/rseq.h | 21 +++++-------- kernel/rseq.c | 28 ++++++++++------- kernel/sched/core.c | 5 +-- kernel/sched/membarrier.c | 8 ++--- 7 files changed, 48 insertions(+), 92 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 4298e7e08d5d..e45b29890269 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1775,7 +1775,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - rseq_set_notify_resume(current); + rseq_sched_switch_event(current); current->in_execve = 0; return retval; diff --git a/include/linux/rseq.h b/include/linux/rseq.h index d72ddf7ce903..241067bf20db 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -3,38 +3,8 @@ #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ - -#include #include -#ifdef CONFIG_MEMBARRIER -# define RSEQ_EVENT_GUARD irq -#else -# define RSEQ_EVENT_GUARD preempt -#endif - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. 
- */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} - void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) @@ -43,35 +13,27 @@ static inline void rseq_handle_notify_resume(struct pt_regs *regs) __rseq_handle_notify_resume(NULL, regs); } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) { - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + current->rseq_event_pending = true; __rseq_handle_notify_resume(ksig, regs); } } -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) +static inline void rseq_sched_switch_event(struct task_struct *t) { - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + if (t->rseq) { + t->rseq_event_pending = true; + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + } } static __always_inline void rseq_exit_to_user_mode(void) { if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) - current->rseq_event_mask = 0; + if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) + current->rseq_event_pending = false; } } @@ -85,12 +47,12 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq_event_pending = false; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; + t->rseq_event_pending = current->rseq_event_pending; } } @@ -99,15 +61,13 @@ static inline void rseq_execve(struct task_struct *t) t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq_event_pending = false; } #else /* CONFIG_RSEQ */ -static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } -static inline void rseq_preempt(struct task_struct *t) { } -static inline void rseq_migrate(struct task_struct *t) { } +static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c..6627c527c2c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,14 +1407,14 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ #ifdef 
CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; + struct rseq __user *rseq; + u32 rseq_len; + u32 rseq_sig; /* - * RmW on rseq_event_mask must be performed atomically + * RmW on rseq_event_pending must be performed atomically * with respect to preemption. */ - unsigned long rseq_event_mask; + bool rseq_event_pending; # ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index c233aae5eac9..1b76d508400c 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -114,20 +114,13 @@ struct rseq { /* * Restartable sequences flags field. * - * This field should only be updated by the thread which - * registered this data structure. Read by the kernel. - * Mainly used for single-stepping through rseq critical sections - * with debuggers. - * - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart on preemption - * for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart on signal - * delivery for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart on migration for - * this thread. + * This field was initially intended to allow event masking for + * single-stepping through rseq critical sections with debuggers. + * The kernel does not support this anymore and the relevant bits + * are checked for being always false: + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE */ __u32 flags; diff --git a/kernel/rseq.c b/kernel/rseq.c index 80af48a972f0..59adc1a7183b 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -78,6 +78,12 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 @@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) */ if (regs) { /* - * Read and clear the event mask first. If the task was not - * preempted or migrated or a signal is on the way, there - * is no point in doing any of the heavy lifting here on - * production kernels. In that case TIF_NOTIFY_RESUME was - * raised by some other functionality. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. * * This is correct because the read/clear operation is * guarded against scheduler preemption, which makes it CPU @@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) * with the result handed in to allow the detection of * inconsistencies. 
*/ - u32 event_mask; + bool event; scoped_guard(RSEQ_EVENT_GUARD) { - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; + event = t->rseq_event_pending; + t->rseq_event_pending = false; } - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) { - ret = rseq_ip_fixup(regs, !!event_mask); + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { + ret = rseq_ip_fixup(regs, event); if (unlikely(ret < 0)) goto error; } @@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ - rseq_set_notify_resume(current); + rseq_sched_switch_event(current); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ebf67b48e2..b75e8e1eca4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3329,7 +3329,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - rseq_migrate(p); sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -4763,7 +4762,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_task_group = tg; } #endif - rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -4827,7 +4825,6 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); - rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -5121,7 +5118,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_preempt(prev); + rseq_sched_switch_event(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 62fba83b7bb1..623445603725 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,7 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_preempt(current); + rseq_sched_switch_event(current); } static void ipi_sync_rq_state(void *info) @@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id) * membarrier, we will end up with some thread in the mm * running without a core sync. * - * For RSEQ, don't rseq_preempt() the caller. User code - * is not supposed to issue syscalls at all from inside an - * rseq critical section. + * For RSEQ, don't invoke rseq_sched_switch_event() on the + * caller. User code is not supposed to issue syscalls at + * all from inside an rseq critical section. */ if (flags != MEMBARRIER_FLAG_SYNC_CORE) { preempt_disable(); From 83409986f49f17b14a675f9c598ad50d4c60191b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:28 +0100 Subject: [PATCH 07/54] rseq, virt: Retrigger RSEQ after vcpu_run() Hypervisors invoke resume_user_mode_work() before entering the guest, which clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user space context available to them, so the rseq notify handler skips inspecting the critical section, but updates the CPU/MM CID values unconditionally so that the eventual pending rseq event is not lost on the way to user space. 
This is a pointless exercise as the task might be rescheduled before actually returning to user space and it creates unnecessary work in the vcpu_run() loops. It's way more efficient to ignore that invocation based on @regs == NULL and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the vcpu_run() loop before returning from the ioctl(). This ensures that a pending RSEQ update is not lost and the IDs are updated before returning to user space. Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into a NOOP. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Acked-by: Sean Christopherson Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de --- drivers/hv/mshv_root_main.c | 3 ++ include/linux/rseq.h | 17 ++++++++ kernel/rseq.c | 78 +++++++++++++++++++------------------ virt/kvm/kvm_main.c | 7 ++++ 4 files changed, 68 insertions(+), 37 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e3b2bd417c46..a21a0eb0f5be 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "mshv_eventfd.h" #include "mshv.h" @@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) } } while (!vp->run.flags.intercept_suspend); + rseq_virt_userspace_exit(); + return ret; } diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 241067bf20db..c6267f70c746 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -37,6 +37,22 @@ static __always_inline void rseq_exit_to_user_mode(void) } } +/* + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, + * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in + * that case just to do it eventually again before returning to user space, + * the entry resume_user_mode_work() invocation is ignored as the register + * argument is NULL. + * + * After returning from guest mode, they have to invoke this function to + * re-raise TIF_NOTIFY_RESUME if necessary. + */ +static inline void rseq_virt_userspace_exit(void) +{ + if (current->rseq_event_pending) + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. @@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t) static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } diff --git a/kernel/rseq.c b/kernel/rseq.c index 59adc1a7183b..01e711383e05 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; + bool event; + + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. 
+ * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) + return; if (unlikely(t->flags & PF_EXITING)) return; /* - * If invoked from hypervisors or IO-URING, then @regs is a NULL - * pointer, so fixup cannot be done. If the syscall which led to - * this invocation was invoked inside a critical section, then it - * will either end up in this code again or a possible violation of - * a syscall inside a critical region can only be detected by the - * debug code in rseq_syscall() in a debug enabled kernel. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (regs) { - /* - * Read and clear the event pending bit first. If the task - * was not preempted or migrated or a signal is on the way, - * there is no point in doing any of the heavy lifting here - * on production kernels. In that case TIF_NOTIFY_RESUME - * was raised by some other functionality. - * - * This is correct because the read/clear operation is - * guarded against scheduler preemption, which makes it CPU - * local atomic. If the task is preempted right after - * re-enabling preemption then TIF_NOTIFY_RESUME is set - * again and this function is invoked another time _before_ - * the task is able to return to user mode. - * - * On a debug kernel, invoke the fixup code unconditionally - * with the result handed in to allow the detection of - * inconsistencies. - */ - bool event; - - scoped_guard(RSEQ_EVENT_GUARD) { - event = t->rseq_event_pending; - t->rseq_event_pending = false; - } - - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { - ret = rseq_ip_fixup(regs, event); - if (unlikely(ret < 0)) - goto error; - } + scoped_guard(RSEQ_EVENT_GUARD) { + event = t->rseq_event_pending; + t->rseq_event_pending = false; } + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { + ret = rseq_ip_fixup(regs, event); + if (unlikely(ret < 0)) + goto error; + } + if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..4255fcf9c6e5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_run(vcpu); vcpu->wants_to_run = false; + /* + * FIXME: Remove this hack once all KVM architectures + * support the generic TIF bits, i.e. a dedicated TIF_RSEQ. + */ + rseq_virt_userspace_exit(); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } From 566d8015f7eef11d82cd63dc4e1f620fcfc2a394 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:31 +0100 Subject: [PATCH 08/54] rseq: Avoid CPU/MM CID updates when no event pending There is no need to update these values unconditionally if there is no event pending. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.462964916@linutronix.de --- kernel/rseq.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 01e711383e05..81dddafa2f2e 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -464,11 +464,12 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) t->rseq_event_pending = false; } - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { - ret = rseq_ip_fixup(regs, event); - if (unlikely(ret < 0)) - goto error; - } + if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) + return; + + ret = rseq_ip_fixup(regs, event); + if (unlikely(ret < 0)) + goto error; if (unlikely(rseq_update_cpu_node_id(t))) goto error; From faba9d250eaec7afa248bba71531a08ccc497aab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:33 +0100 Subject: [PATCH 09/54] rseq: Introduce struct rseq_data In preparation for a major rewrite of this code, provide a data structure for rseq management. Put all the rseq related data into it (except for the debug part), which allows fork/execve to be simplified by using memset() and memcpy() instead of adding new fields to initialize over and over. Create a storage struct for event management as well and put the sched_switch event and an indicator for RSEQ on a task into it as a start. That uses a union, which allows the whole lot to be masked and cleared efficiently. The indicators are explicitly not a bit field. Bit fields generate abysmal code. The boolean members are defined as u8 as that actually guarantees that it fits. There seem to be strange architecture ABIs which need more than 8 bits for a boolean. The has_rseq member is redundant vs. task::rseq, but it turns out that boolean operations and quick checks on the union generate better code than fiddling with separate entities and data types. This struct will be extended over time to carry more information.
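As a minimal sketch of what the compound union members buy compared to per-member accesses (the helper names below are illustrative and not part of the series; the layout of struct rseq_event is the one introduced by this patch):

static inline bool rseq_any_event_pending(struct task_struct *t)
{
	/* One load covers sched_switch and any future event bytes */
	return t->rseq.event.events != 0;
}

static inline void rseq_clear_events(struct task_struct *t)
{
	/* One store clears all event bytes, leaving has_rseq intact */
	t->rseq.event.events = 0;
}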
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.527086690@linutronix.de --- include/linux/rseq.h | 48 +++++++++++++---------------- include/linux/rseq_types.h | 51 ++++++++++++++++++++++++++++++ include/linux/sched.h | 14 ++------- kernel/ptrace.c | 6 ++-- kernel/rseq.c | 63 +++++++++++++++++++------------------- 5 files changed, 110 insertions(+), 72 deletions(-) create mode 100644 include/linux/rseq_types.h diff --git a/include/linux/rseq.h b/include/linux/rseq.h index c6267f70c746..ab91b1e6bb4a 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,22 +9,22 @@ void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { - if (current->rseq) + if (current->rseq.event.has_rseq) __rseq_handle_notify_resume(NULL, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (current->rseq) { - current->rseq_event_pending = true; + if (current->rseq.event.has_rseq) { + current->rseq.event.sched_switch = true; __rseq_handle_notify_resume(ksig, regs); } } static inline void rseq_sched_switch_event(struct task_struct *t) { - if (t->rseq) { - t->rseq_event_pending = true; + if (t->rseq.event.has_rseq) { + t->rseq.event.sched_switch = true; set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } } @@ -32,8 +32,9 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static __always_inline void rseq_exit_to_user_mode(void) { if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) - current->rseq_event_pending = false; + if (WARN_ON_ONCE(current->rseq.event.has_rseq && + current->rseq.event.events)) + current->rseq.event.events = 0; } } @@ -49,35 +50,30 @@ static __always_inline void rseq_exit_to_user_mode(void) */ static inline void rseq_virt_userspace_exit(void) { - if (current->rseq_event_pending) + if (current->rseq.event.sched_switch) set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } +static inline void rseq_reset(struct task_struct *t) +{ + memset(&t->rseq, 0, sizeof(t->rseq)); +} + +static inline void rseq_execve(struct task_struct *t) +{ + rseq_reset(t); +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. 
*/ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_pending = false; - } else { + if (clone_flags & CLONE_VM) + rseq_reset(t); + else t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_pending = current->rseq_event_pending; - } -} - -static inline void rseq_execve(struct task_struct *t) -{ - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_pending = false; } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h new file mode 100644 index 000000000000..f7a60c8eddc9 --- /dev/null +++ b/include/linux/rseq_types.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_TYPES_H +#define _LINUX_RSEQ_TYPES_H + +#include + +#ifdef CONFIG_RSEQ +struct rseq; + +/** + * struct rseq_event - Storage for rseq related event management + * @all: Compound to initialize and clear the data efficiently + * @events: Compound to access events with a single load/store + * @sched_switch: True if the task was scheduled out + * @has_rseq: True if the task has a rseq pointer installed + */ +struct rseq_event { + union { + u32 all; + struct { + union { + u16 events; + struct { + u8 sched_switch; + }; + }; + + u8 has_rseq; + }; + }; +}; + +/** + * struct rseq_data - Storage for all rseq related data + * @usrptr: Pointer to the registered user space RSEQ memory + * @len: Length of the RSEQ region + * @sig: Signature of critial section abort IPs + * @event: Storage for event management + */ +struct rseq_data { + struct rseq __user *usrptr; + u32 len; + u32 sig; + struct rseq_event event; +}; + +#else /* CONFIG_RSEQ */ +struct rseq_data { }; +#endif /* !CONFIG_RSEQ */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 6627c527c2c7..15627769409d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1406,16 +1407,8 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; - /* - * RmW on rseq_event_pending must be performed atomically - * with respect to preemption. - */ - bool rseq_event_pending; -# ifdef CONFIG_DEBUG_RSEQ + struct rseq_data rseq; +#ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for * validation of read-only fields. The struct rseq has a @@ -1423,7 +1416,6 @@ struct task_struct { * directly. Reserve a size large enough for the known fields. 
*/ char rseq_fields[sizeof(struct rseq)]; -# endif #endif #ifdef CONFIG_SCHED_MM_CID diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 75a84efad40f..392ec2f75f01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, unsigned long size, void __user *data) { struct ptrace_rseq_configuration conf = { - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = task->rseq_len, - .signature = task->rseq_sig, + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, + .rseq_abi_size = task->rseq.len, + .signature = task->rseq.sig, .flags = 0, }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 81dddafa2f2e..aae62661e6bb 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -103,13 +103,13 @@ static int rseq_validate_ro_fields(struct task_struct *t) DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq; + struct rseq __user *rseq = t->rseq.usrptr; /* * Validate fields which are required to be read-only by * user-space. */ - if (!user_read_access_begin(rseq, t->rseq_len)) + if (!user_read_access_begin(rseq, t->rseq.len)) goto efault; unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); @@ -147,10 +147,10 @@ efault: * Update an rseq field and its in-kernel copy in lock-step to keep a coherent * state. */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ +#define rseq_unsafe_put_user(t, value, field, error_label) \ + do { \ + unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \ + rseq_kernel_fields(t)->field = value; \ } while (0) #else @@ -160,12 +160,12 @@ static int rseq_validate_ro_fields(struct task_struct *t) } #define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq->field, error_label) + unsafe_put_user(value, &t->rseq.usrptr->field, error_label) #endif static int rseq_update_cpu_node_id(struct task_struct *t) { - struct rseq __user *rseq = t->rseq; + struct rseq __user *rseq = t->rseq.usrptr; u32 cpu_id = raw_smp_processor_id(); u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); @@ -176,7 +176,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t) if (rseq_validate_ro_fields(t)) goto efault; WARN_ON_ONCE((int) mm_cid < 0); - if (!user_write_access_begin(rseq, t->rseq_len)) + if (!user_write_access_begin(rseq, t->rseq.len)) goto efault; rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); @@ -201,7 +201,7 @@ efault: static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { - struct rseq __user *rseq = t->rseq; + struct rseq __user *rseq = t->rseq.usrptr; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; @@ -211,7 +211,7 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) if (rseq_validate_ro_fields(t)) goto efault; - if (!user_write_access_begin(rseq, t->rseq_len)) + if (!user_write_access_begin(rseq, t->rseq.len)) goto efault; /* @@ -272,7 +272,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr); if (ret) return ret; @@ -305,10 +305,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) if (ret) return ret; - if (current->rseq_sig != sig) { + 
if (current->rseq.sig != sig) { printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq_sig, current->pid, usig); + sig, current->rseq.sig, current->pid, usig); return -EINVAL; } return 0; @@ -338,7 +338,7 @@ static int rseq_check_flags(struct task_struct *t, u32 cs_flags) return -EINVAL; /* Get thread flags. */ - ret = get_user(flags, &t->rseq->flags); + ret = get_user(flags, &t->rseq.usrptr->flags); if (ret) return ret; @@ -392,13 +392,13 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t->rseq); + return clear_rseq_cs(t->rseq.usrptr); ret = rseq_check_flags(t, rseq_cs.flags); if (ret < 0) return ret; if (!abort) return 0; - ret = clear_rseq_cs(t->rseq); + ret = clear_rseq_cs(t->rseq.usrptr); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -460,8 +460,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) * inconsistencies. */ scoped_guard(RSEQ_EVENT_GUARD) { - event = t->rseq_event_pending; - t->rseq_event_pending = false; + event = t->rseq.event.sched_switch; + t->rseq.event.sched_switch = false; } if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) @@ -492,7 +492,7 @@ void rseq_syscall(struct pt_regs *regs) struct task_struct *t = current; struct rseq_cs rseq_cs; - if (!t->rseq) + if (!t->rseq.usrptr) return; if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); @@ -511,33 +511,31 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ - if (current->rseq != rseq || !current->rseq) + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) return -EINVAL; - if (rseq_len != current->rseq_len) + if (rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; - current->rseq = NULL; - current->rseq_sig = 0; - current->rseq_len = 0; + rseq_reset(current); return 0; } if (unlikely(flags)) return -EINVAL; - if (current->rseq) { + if (current->rseq.usrptr) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != current->rseq_len) + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; @@ -586,15 +584,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * Activate the registration by setting the rseq area address, length * and signature in the task struct. */ - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + current->rseq.usrptr = rseq; + current->rseq.len = rseq_len; + current->rseq.sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. 
*/ + current->rseq.event.has_rseq = true; rseq_sched_switch_event(current); return 0; From 5204be16790f305febbf331d0ec2cead7978b3c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:36 +0100 Subject: [PATCH 10/54] entry: Clean up header Clean up the include ordering, kernel-doc and other trivialities before making further changes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.590338411@linutronix.de --- include/linux/entry-common.h | 8 ++++---- include/linux/irq-entry-common.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 7177436f0f9e..c585221ff16b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -3,11 +3,11 @@ #define __LINUX_ENTRYCOMMON_H #include +#include #include +#include #include #include -#include -#include #include #include @@ -37,6 +37,7 @@ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) + #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ @@ -61,8 +62,7 @@ */ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work); +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** * syscall_enter_from_user_mode_work - Check and handle work before invoking diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index e5941df13901..9b1f386ffeb1 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -68,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; } /** * enter_from_user_mode - Establish state when coming from user mode + * @regs: Pointer to currents pt_regs * * Syscall/interrupt entry disables interrupts, but user mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. @@ -357,6 +358,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); * Conditional reschedule with additional sanity checks. */ void raw_irqentry_exit_cond_resched(void); + #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched From 54a5ab56242f96555999aaa41228f77b4a76e386 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:38 +0100 Subject: [PATCH 11/54] entry: Remove syscall_enter_from_user_mode_prepare() Open code the only user in the x86 syscall code and reduce the zoo of functions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.652839989@linutronix.de --- arch/x86/entry/syscall_32.c | 3 ++- include/linux/entry-common.h | 26 +++++--------------------- kernel/entry/syscall-common.c | 8 -------- 3 files changed, 7 insertions(+), 30 deletions(-) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 2b15ea17bb7c..a67a644d0cfe 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) * fetch EBP before invoking any of the syscall entry work * functions. 
*/ - syscall_enter_from_user_mode_prepare(regs); + enter_from_user_mode(regs); instrumentation_begin(); + local_irq_enable(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { /* diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index c585221ff16b..75b194c34e18 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -45,23 +45,6 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) -/** - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts - * @regs: Pointer to currents pt_regs - * - * Invoked from architecture specific syscall entry code with interrupts - * disabled. The calling code has to be non-instrumentable. When the - * function returns all state is correct, interrupts are enabled and the - * subsequent functions can be instrumented. - * - * This handles lockdep, RCU (context tracking) and tracing state, i.e. - * the functionality provided by enter_from_user_mode(). - * - * This is invoked when there is extra architecture specific functionality - * to be done between establishing state and handling user mode entry work. - */ -void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); - long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** @@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) * @syscall: The syscall number * * Invoked from architecture specific syscall entry code with interrupts - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra - * architecture specific work. + * enabled after invoking enter_from_user_mode(), enabling interrupts and + * extra architecture specific work. * * Returns: The original or a modified syscall number * @@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This is combination of syscall_enter_from_user_mode_prepare() and - * syscall_enter_from_user_mode_work(). + * This is the combination of enter_from_user_mode() and + * syscall_enter_from_user_mode_work() to be used when there is no + * architecture specific work to be done between the two. * * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 66e6ba7fa80c..940a597ded40 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall; } -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall From 7702a9c2856794b6bf961b408eba3bacb753bd5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:40 +0100 Subject: [PATCH 12/54] entry: Inline irqentry_enter/exit_from/to_user_mode() There is no point to have this as a function which just inlines enter_from_user_mode(). The function call overhead is larger than the function itself. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.715309918@linutronix.de --- include/linux/irq-entry-common.h | 13 +++++++++++-- kernel/entry/common.c | 13 ------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 9b1f386ffeb1..83c9d841d9e1 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -278,7 +278,10 @@ static __always_inline void exit_to_user_mode(void) * * The function establishes state (lockdep, RCU (context tracking), tracing) */ -void irqentry_enter_from_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); +} /** * irqentry_exit_to_user_mode - Interrupt exit work @@ -293,7 +296,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); * Interrupt exit is not invoking #1 which is the syscall specific one time * work. */ -void irqentry_exit_to_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} #ifndef irqentry_state /** diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f62e1d1b2063..70a16db4cc0a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -62,19 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) -{ - enter_from_user_mode(regs); -} - -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - exit_to_user_mode(); -} - noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { irqentry_state_t ret = { From 4fc9225d19ad6289c03340a520d35e3a6d1aebed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:42 +0100 Subject: [PATCH 13/54] sched: Move MM CID related functions to sched.h There is nothing mm specific in that and including mm.h can cause header recursion hell. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.778457951@linutronix.de --- include/linux/mm.h | 25 ------------------------- include/linux/sched.h | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..17cfbba9914c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2401,31 +2401,6 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. 
Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) -#ifdef CONFIG_SCHED_MM_CID -void sched_mm_cid_before_execve(struct task_struct *t); -void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) -{ - return t->mm_cid; -} -#else -static inline void sched_mm_cid_before_execve(struct task_struct *t) { } -static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) -{ - /* - * Use the processor id as a fall-back when the mm cid feature is - * disabled. This provides functional per-cpu data structure accesses - * in user-space, althrough it won't provide the memory usage benefits. - */ - return raw_smp_processor_id(); -} -#endif - #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 15627769409d..24a9da7ca3e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2310,6 +2310,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); +} +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS From 4b7de6df20d43dd651031aef8d818fa5da981dbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:45 +0100 Subject: [PATCH 14/54] rseq: Cache CPU ID and MM CID values In preparation for rewriting RSEQ exit to user space handling provide storage to cache the CPU ID and MM CID values which were written to user space. That prepares for a quick check, which avoids the update when nothing changed. 
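As an illustration of where this is heading (a rough sketch only, not the
fast path added later in this series; the helper name below is made up for
the example), the cached compound value allows a single 64-bit compare to
decide whether the user space update can be skipped:

	/*
	 * Sketch: rebuild the compound value from the current CPU ID and
	 * MM CID and compare it against what was last written to user
	 * space. Relies on the rseq_ids union introduced by this patch.
	 */
	static bool rseq_ids_need_update(struct task_struct *t, u32 cpu_id, u32 mm_cid)
	{
		struct rseq_ids cur = { .cpu_id = cpu_id, .mm_cid = mm_cid };

		return cur.cpu_cid != t->rseq.ids.cpu_cid;
	}

If the values match, neither the user space struct rseq nor the cache
needs to be touched.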
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.841964081@linutronix.de --- include/linux/rseq.h | 7 +++++-- include/linux/rseq_types.h | 21 +++++++++++++++++++++ include/trace/events/rseq.h | 4 ++-- kernel/rseq.c | 4 ++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index ab91b1e6bb4a..d315a92afb36 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -57,6 +57,7 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { memset(&t->rseq, 0, sizeof(t->rseq)); + t->rseq.ids.cpu_cid = ~0ULL; } static inline void rseq_execve(struct task_struct *t) @@ -70,10 +71,12 @@ static inline void rseq_execve(struct task_struct *t) */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) + if (clone_flags & CLONE_VM) { rseq_reset(t); - else + } else { t->rseq = current->rseq; + t->rseq.ids.cpu_cid = ~0ULL; + } } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index f7a60c8eddc9..40901b033b92 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -30,18 +30,39 @@ struct rseq_event { }; }; +/** + * struct rseq_ids - Cache for ids, which need to be updated + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the + * compiler emit a single compare on 64-bit + * @cpu_id: The CPU ID which was written last to user space + * @mm_cid: The MM CID which was written last to user space + * + * @cpu_id and @mm_cid are updated when the data is written to user space. + */ +struct rseq_ids { + union { + u64 cpu_cid; + struct { + u32 cpu_id; + u32 mm_cid; + }; + }; +}; + /** * struct rseq_data - Storage for all rseq related data * @usrptr: Pointer to the registered user space RSEQ memory * @len: Length of the RSEQ region * @sig: Signature of critial section abort IPs * @event: Storage for event management + * @ids: Storage for cached CPU ID and MM CID */ struct rseq_data { struct rseq __user *usrptr; u32 len; u32 sig; struct rseq_event event; + struct rseq_ids ids; }; #else /* CONFIG_RSEQ */ diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h index 823b47d1ba1e..ce85d650bf4b 100644 --- a/include/trace/events/rseq.h +++ b/include/trace/events/rseq.h @@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update, ), TP_fast_assign( - __entry->cpu_id = raw_smp_processor_id(); + __entry->cpu_id = t->rseq.ids.cpu_id; __entry->node_id = cpu_to_node(__entry->cpu_id); - __entry->mm_cid = task_mm_cid(t); + __entry->mm_cid = t->rseq.ids.mm_cid; ), TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id, diff --git a/kernel/rseq.c b/kernel/rseq.c index aae62661e6bb..ad1e7cecd527 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -184,6 +184,10 @@ static int rseq_update_cpu_node_id(struct task_struct *t) rseq_unsafe_put_user(t, node_id, node_id, efault_end); rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* Cache the user space values */ + t->rseq.ids.cpu_id = cpu_id; + t->rseq.ids.mm_cid = mm_cid; + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if From 2fc0e4b4126caadfa5772ba69276b350609584dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:48 +0100 Subject: [PATCH 15/54] rseq: Record interrupt from user space For RSEQ the only relevant reason to inspect and eventually fixup (abort) user space 
critical sections is when user space was interrupted and the task was scheduled out. If the user to kernel entry was from a syscall no fixup is required. If user space invokes a syscall from a critical section it can keep the pieces as documented. This is only supported on architectures which utilize the generic entry code. If your architecture does not use it, bad luck. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de --- include/linux/irq-entry-common.h | 3 ++- include/linux/rseq.h | 16 +++++++++++----- include/linux/rseq_entry.h | 18 ++++++++++++++++++ include/linux/rseq_types.h | 2 ++ 4 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 include/linux/rseq_entry.h diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 83c9d841d9e1..cb31fb84d7b4 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include @@ -281,6 +281,7 @@ static __always_inline void exit_to_user_mode(void) static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); + rseq_note_user_irq_entry(); } /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h index d315a92afb36..a200836a6fe3 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -31,11 +31,17 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static __always_inline void rseq_exit_to_user_mode(void) { - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq.event.has_rseq && - current->rseq.event.events)) - current->rseq.event.events = 0; - } + struct rseq_event *ev = ¤t->rseq.event; + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it. 
+ */ + ev->events = 0; } /* diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h new file mode 100644 index 000000000000..ce30e87ce1f5 --- /dev/null +++ b/include/linux/rseq_entry.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_ENTRY_H +#define _LINUX_RSEQ_ENTRY_H + +#ifdef CONFIG_RSEQ +#include + +static __always_inline void rseq_note_user_irq_entry(void) +{ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) + current->rseq.event.user_irq = true; +} + +#else /* CONFIG_RSEQ */ +static inline void rseq_note_user_irq_entry(void) { } +#endif /* !CONFIG_RSEQ */ + +#endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 40901b033b92..80f6c398ef0f 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -12,6 +12,7 @@ struct rseq; * @all: Compound to initialize and clear the data efficiently * @events: Compound to access events with a single load/store * @sched_switch: True if the task was scheduled out + * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed */ struct rseq_event { @@ -22,6 +23,7 @@ struct rseq_event { u16 events; struct { u8 sched_switch; + u8 user_irq; }; }; From dab344753e021fe84c24f9d8b0b63cb5bcf463d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:50 +0100 Subject: [PATCH 16/54] rseq: Provide tracepoint wrappers for inline code Provide tracepoint wrappers for the upcoming RSEQ exit to user space inline fast path, so that the header can be safely included by code which defines actual trace points. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.967114316@linutronix.de --- include/linux/rseq_entry.h | 28 ++++++++++++++++++++++++++++ kernel/rseq.c | 19 ++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ce30e87ce1f5..5be507a127eb 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -5,6 +5,34 @@ #ifdef CONFIG_RSEQ #include +#include + +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(rseq_update); +DECLARE_TRACEPOINT(rseq_ip_fixup); +void __rseq_trace_update(struct task_struct *t); +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip); + +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) +{ + if (tracepoint_enabled(rseq_update) && ids) + __rseq_trace_update(t); +} + +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + if (tracepoint_enabled(rseq_ip_fixup)) + __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); +} + +#else /* CONFIG_TRACEPOINT */ +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { } +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) { } +#endif /* !CONFIG_TRACEPOINT */ + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) diff --git a/kernel/rseq.c b/kernel/rseq.c index ad1e7cecd527..f49d3118c3e0 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include #include #include @@ -91,6 +91,23 @@ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ 
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +#ifdef CONFIG_TRACEPOINTS +/* + * Out of line, so the actual update functions can be in a header to be + * inlined into the exit to user code. + */ +void __rseq_trace_update(struct task_struct *t) +{ + trace_rseq_update(t); +} + +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); +} +#endif /* CONFIG_TRACEPOINTS */ + #ifdef CONFIG_DEBUG_RSEQ static struct rseq *rseq_kernel_fields(struct task_struct *t) { From 5412910487d0839111e4f2f3a6f33f6c9af9b007 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:52 +0100 Subject: [PATCH 17/54] rseq: Expose lightweight statistics in debugfs Analyzing the call frequency without actually using tracing is helpful for analysis of this infrastructure. The overhead is minimal as it just increments a per CPU counter associated to each operation. The debugfs readout provides a racy sum of all counters. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de --- include/linux/rseq.h | 16 -------- include/linux/rseq_entry.h | 49 ++++++++++++++++++++++ init/Kconfig | 12 ++++++ kernel/rseq.c | 83 +++++++++++++++++++++++++++++++++----- 4 files changed, 135 insertions(+), 25 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index a200836a6fe3..7f347c3a4af8 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -29,21 +29,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) } } -static __always_inline void rseq_exit_to_user_mode(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - /* * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in @@ -92,7 +77,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } -static inline void rseq_exit_to_user_mode(void) { } #endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 5be507a127eb..ff9080b89be3 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -2,6 +2,37 @@ #ifndef _LINUX_RSEQ_ENTRY_H #define _LINUX_RSEQ_ENTRY_H +/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ +#ifdef CONFIG_RSEQ_STATS +#include + +struct rseq_stats { + unsigned long exit; + unsigned long signal; + unsigned long slowpath; + unsigned long ids; + unsigned long cs; + unsigned long clear; + unsigned long fixup; +}; + +DECLARE_PER_CPU(struct rseq_stats, rseq_stats); + +/* + * Slow path has interrupts and preemption enabled, but the fast path + * runs with interrupts disabled so there is no point in having the + * preemption checks implied in __this_cpu_inc() for every operation. 
+ */ +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_stat_inc(which) this_cpu_inc((which)) +#else +#define rseq_stat_inc(which) raw_cpu_inc((which)) +#endif + +#else /* CONFIG_RSEQ_STATS */ +#define rseq_stat_inc(x) do { } while (0) +#endif /* !CONFIG_RSEQ_STATS */ + #ifdef CONFIG_RSEQ #include @@ -39,8 +70,26 @@ static __always_inline void rseq_note_user_irq_entry(void) current->rseq.event.user_irq = true; } +static __always_inline void rseq_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it. + */ + ev->events = 0; +} + #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } +static inline void rseq_exit_to_user_mode(void) { } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49..f39fdfb28797 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1913,6 +1913,18 @@ config RSEQ If unsure, say Y. +config RSEQ_STATS + default n + bool "Enable lightweight statistics of restartable sequences" if EXPERT + depends on RSEQ && DEBUG_FS + help + Enable lightweight counters which expose information about the + frequency of RSEQ operations via debugfs. Mostly interesting for + kernel debugging or performance analysis. While lightweight it's + still adding code into the user/kernel mode transitions. + + If unsure, say N. + config DEBUG_RSEQ default n bool "Enable debugging of rseq() system call" if EXPERT diff --git a/kernel/rseq.c b/kernel/rseq.c index f49d3118c3e0..c0dbe2ed26a8 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -67,12 +67,16 @@ * F1. 
*/ -#include -#include -#include -#include -#include +/* Required to select the proper per_cpu ops for rseq_stats_inc() */ +#define RSEQ_BUILD_SLOW_PATH + +#include #include +#include +#include +#include +#include +#include #include #define CREATE_TRACE_POINTS @@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ +#ifdef CONFIG_RSEQ_STATS +DEFINE_PER_CPU(struct rseq_stats, rseq_stats); + +static int rseq_debug_show(struct seq_file *m, void *p) +{ + struct rseq_stats stats = { }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); + } + + seq_printf(m, "exit: %16lu\n", stats.exit); + seq_printf(m, "signal: %16lu\n", stats.signal); + seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "ids: %16lu\n", stats.ids); + seq_printf(m, "cs: %16lu\n", stats.cs); + seq_printf(m, "clear: %16lu\n", stats.clear); + seq_printf(m, "fixup: %16lu\n", stats.fixup); + return 0; +} + +static int rseq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = rseq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_debugfs_init(void) +{ + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); + + debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops); + return 0; +} +__initcall(rseq_debugfs_init); +#endif /* CONFIG_RSEQ_STATS */ + #ifdef CONFIG_DEBUG_RSEQ static struct rseq *rseq_kernel_fields(struct task_struct *t) { @@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struct task_struct *t) u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); - /* - * Validate read-only rseq fields. - */ + rseq_stat_inc(rseq_stats.ids); + + /* Validate read-only rseq fields on debug kernels */ if (rseq_validate_ro_fields(t)) goto efault; WARN_ON_ONCE((int) mm_cid < 0); + if (!user_write_access_begin(rseq, t->rseq.len)) goto efault; @@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) struct rseq_cs rseq_cs; int ret; + rseq_stat_inc(rseq_stats.cs); + ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) return ret; @@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. 
*/ - if (!in_rseq_cs(ip, &rseq_cs)) + if (!in_rseq_cs(ip, &rseq_cs)) { + rseq_stat_inc(rseq_stats.clear); return clear_rseq_cs(t->rseq.usrptr); + } ret = rseq_check_flags(t, rseq_cs.flags); if (ret < 0) return ret; @@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) ret = clear_rseq_cs(t->rseq.usrptr); if (ret) return ret; + rseq_stat_inc(rseq_stats.fixup); trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); @@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; + if (ksig) + rseq_stat_inc(rseq_stats.signal); + else + rseq_stat_inc(rseq_stats.slowpath); + /* * Read and clear the event pending bit first. If the task * was not preempted or migrated or a signal is on the way, From 9c37cb6e80b8fcdddc1236ba42ffd438f511192b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:55 +0100 Subject: [PATCH 18/54] rseq: Provide static branch for runtime debugging Config based debug is rarely turned on and is not available easily when things go wrong. Provide a static branch to allow permanent integration of debug mechanisms along with the usual toggles in Kconfig, command line and debugfs. Requested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.089270547@linutronix.de --- .../admin-guide/kernel-parameters.txt | 4 + include/linux/rseq_entry.h | 3 + init/Kconfig | 14 ++++ kernel/rseq.c | 73 ++++++++++++++++++- 4 files changed, 90 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6c42061ca20e..e63827475792 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6500,6 +6500,10 @@ Memory area to be used by remote processor image, managed by CMA. + rseq_debug= [KNL] Enable or disable restartable sequence + debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE. + Format: + rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling when CONFIG_RT_GROUP_SCHED=y. Defaults to !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED. diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ff9080b89be3..ed8e5f89499b 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -34,6 +34,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #endif /* !CONFIG_RSEQ_STATS */ #ifdef CONFIG_RSEQ +#include #include #include @@ -64,6 +65,8 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, unsigned long offset, unsigned long abort_ip) { } #endif /* !CONFIG_TRACEPOINT */ +DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) diff --git a/init/Kconfig b/init/Kconfig index f39fdfb28797..bde40ab664e2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1925,10 +1925,24 @@ config RSEQ_STATS If unsure, say N. +config RSEQ_DEBUG_DEFAULT_ENABLE + default n + bool "Enable restartable sequences debug mode by default" if EXPERT + depends on RSEQ + help + This enables the static branch for debug mode of restartable + sequences. 
+ + This also can be controlled on the kernel command line via the + command line parameter "rseq_debug=0/1" and through debugfs. + + If unsure, say N. + config DEBUG_RSEQ default n bool "Enable debugging of rseq() system call" if EXPERT depends on RSEQ && DEBUG_KERNEL + select RSEQ_DEBUG_DEFAULT_ENABLE help Enable extra debugging checks for the rseq system call. diff --git a/kernel/rseq.c b/kernel/rseq.c index c0dbe2ed26a8..679ab8ebdfd3 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -95,6 +95,27 @@ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + +static inline void rseq_control_debug(bool on) +{ + if (on) + static_branch_enable(&rseq_debug_enabled); + else + static_branch_disable(&rseq_debug_enabled); +} + +static int __init rseq_setup_debug(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return -EINVAL; + rseq_control_debug(on); + return 1; +} +__setup("rseq_debug=", rseq_setup_debug); + #ifdef CONFIG_TRACEPOINTS /* * Out of line, so the actual update functions can be in a header to be @@ -112,10 +133,11 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ +#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_RSEQ_STATS DEFINE_PER_CPU(struct rseq_stats, rseq_stats); -static int rseq_debug_show(struct seq_file *m, void *p) +static int rseq_stats_show(struct seq_file *m, void *p) { struct rseq_stats stats = { }; unsigned int cpu; @@ -140,14 +162,56 @@ static int rseq_debug_show(struct seq_file *m, void *p) return 0; } +static int rseq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_stats_show, inode->i_private); +} + +static const struct file_operations stat_ops = { + .open = rseq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_stats_init(struct dentry *root_dir) +{ + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); + return 0; +} +#else +static inline void rseq_stats_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_STATS */ + +static int rseq_debug_show(struct seq_file *m, void *p) +{ + bool on = static_branch_unlikely(&rseq_debug_enabled); + + seq_printf(m, "%d\n", on); + return 0; +} + +static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + bool on; + + if (kstrtobool_from_user(ubuf, count, &on)) + return -EINVAL; + + rseq_control_debug(on); + return count; +} + static int rseq_debug_open(struct inode *inode, struct file *file) { return single_open(file, rseq_debug_show, inode->i_private); } -static const struct file_operations dfs_ops = { +static const struct file_operations debug_ops = { .open = rseq_debug_open, .read = seq_read, + .write = rseq_debug_write, .llseek = seq_lseek, .release = single_release, }; @@ -156,11 +220,12 @@ static int __init rseq_debugfs_init(void) { struct dentry *root_dir = debugfs_create_dir("rseq", NULL); - debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops); + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); + rseq_stats_init(root_dir); return 0; } __initcall(rseq_debugfs_init); -#endif /* CONFIG_RSEQ_STATS */ +#endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_DEBUG_RSEQ static struct rseq *rseq_kernel_fields(struct task_struct *t) From abc850e7616c91ebaa3f5ba3617ab0a104d45039 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:57 +0100 Subject: [PATCH 19/54] rseq: Provide and use 
rseq_update_user_cs() Provide a straight forward implementation to check for and eventually clear/fixup critical sections in user space. The non-debug version does only the minimal sanity checks and aims for efficiency. There are two attack vectors, which are checked for: 1) An abort IP which is in the kernel address space. That would cause at least x86 to return to kernel space via IRET. 2) A rogue critical section descriptor with an abort IP pointing to some arbitrary address, which is not preceded by the RSEQ signature. If the section descriptors are invalid then the resulting misbehaviour of the user space application is not the kernels problem. The kernel provides a run-time switchable debug slow path, which implements the full zoo of checks including termination of the task when one of the gazillion conditions is not met. Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME handler. Move the remainders into the CONFIG_DEBUG_RSEQ section, which will be replaced and removed in a subsequent step. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.151465632@linutronix.de --- include/linux/rseq_entry.h | 206 +++++++++++++++++++++++++++++++ include/linux/rseq_types.h | 11 +- kernel/rseq.c | 244 +++++++++++-------------------------- 3 files changed, 290 insertions(+), 171 deletions(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ed8e5f89499b..f9510ce72211 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -36,6 +36,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #ifdef CONFIG_RSEQ #include #include +#include #include @@ -67,12 +68,217 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_inline +#else +#define rseq_inline __always_inline +#endif + +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) current->rseq.event.user_irq = true; } +/* + * Check whether there is a valid critical section and whether the + * instruction pointer in @regs is inside the critical section. + * + * - If the critical section is invalid, terminate the task. + * + * - If valid and the instruction pointer is inside, set it to the abort IP. + * + * - If valid and the instruction pointer is outside, clear the critical + * section address. + * + * Returns true, if the section was valid and either fixup or clear was + * done, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when a invalid + * section was found. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ + +#ifdef RSEQ_BUILD_SLOW_PATH +/* + * The debug version is put out of line, but kept here so the code stays + * together. 
+ * + * @csaddr has already been checked by the caller to be in user space + */ +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, + unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; + unsigned long ip = instruction_pointer(regs); + u64 __user *uc_head = (u64 __user *) ucs; + u32 usig, __user *uc_sig; + + scoped_user_rw_access(ucs, efault) { + /* + * Evaluate the user pile and exit if one of the conditions + * is not fulfilled. + */ + unsafe_get_user(start_ip, &ucs->start_ip, efault); + if (unlikely(start_ip >= tasksize)) + goto die; + /* If outside, just clear the critical section. */ + if (ip < start_ip) + goto clear; + + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + cs_end = start_ip + offset; + /* Check for overflow and wraparound */ + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) + goto die; + + /* If not inside, clear it. */ + if (ip >= cs_end) + goto clear; + + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + /* Ensure it's "valid" */ + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) + goto die; + /* Validate that the abort IP is not in the critical section */ + if (unlikely(abort_ip - start_ip < offset)) + goto die; + + /* + * Check version and flags for 0. No point in emitting + * deprecated warnings before dying. That could be done in + * the slow path eventually, but *shrug*. + */ + unsafe_get_user(head, uc_head, efault); + if (unlikely(head)) + goto die; + + /* abort_ip - 4 is >= 0. See abort_ip check above */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* If not in interrupt from user context, let it die */ + if (unlikely(!t->rseq.event.user_irq)) + goto die; + } + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +#endif /* RSEQ_BUILD_SLOW_PATH */ + +/* + * This only ensures that abort_ip is in the user address space and + * validates that it is preceded by the signature. + * + * No other sanity checks are done here, that's what the debug code is for. + */ +static rseq_inline bool +rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + unsigned long ip = instruction_pointer(regs); + u64 start_ip, abort_ip, offset; + u32 usig, __user *uc_sig; + + rseq_stat_inc(rseq_stats.cs); + + if (unlikely(csaddr >= TASK_SIZE)) { + t->rseq.event.fatal = true; + return false; + } + + if (static_branch_unlikely(&rseq_debug_enabled)) + return rseq_debug_update_user_cs(t, regs, csaddr); + + scoped_user_rw_access(ucs, efault) { + unsafe_get_user(start_ip, &ucs->start_ip, efault); + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + + /* + * No sanity checks. 
If user space screwed it up, it can + * keep the pieces. That's what debug code is for. + * + * If outside, just clear the critical section. + */ + if (ip - start_ip >= offset) + goto clear; + + /* + * Two requirements for @abort_ip: + * - Must be in user space as x86 IRET would happily return to + * the kernel. + * - The four bytes preceding the instruction at @abort_ip must + * contain the signature. + * + * The latter protects against the following attack vector: + * + * An attacker with limited abilities to write, creates a critical + * section descriptor, sets the abort IP to a library function or + * some other ROP gadget and stores the address of the descriptor + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP + * protection. + */ + if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) + goto die; + + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* Invalidate the critical section */ + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + /* Update the instruction pointer */ + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 80f6c398ef0f..7c123947bb98 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -14,10 +14,12 @@ struct rseq; * @sched_switch: True if the task was scheduled out * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed + * @error: Compound error code for the slow path to analyze + * @fatal: User space data corrupted or invalid */ struct rseq_event { union { - u32 all; + u64 all; struct { union { u16 events; @@ -28,6 +30,13 @@ struct rseq_event { }; u8 has_rseq; + u8 __pad; + union { + u16 error; + struct { + u8 fatal; + }; + }; }; }; }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 679ab8ebdfd3..12a9b6ab2cef 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -382,175 +382,18 @@ efault: return -EFAULT; } -/* - * Get the user-space pointer value stored in the 'rseq_cs' field. - */ -static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { - if (!rseq_cs) - return -EFAULT; + struct rseq __user *urseq = t->rseq.usrptr; + u64 csaddr; -#ifdef CONFIG_64BIT - if (get_user(*rseq_cs, &rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) - return -EFAULT; -#endif - - return 0; -} - -/* - * If the rseq_cs field of 'struct rseq' contains a valid pointer to - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. - */ -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) -{ - struct rseq_cs __user *urseq_cs; - u64 ptr; - u32 __user *usig; - u32 sig; - int ret; - - ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr); - if (ret) - return ret; - - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. 
*/ - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; - } - /* Check that the pointer value fits in the user-space process space. */ - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) - return -EFAULT; - - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; - - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; - - if (current->rseq.sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq.sig, current->pid, usig); - return -EINVAL; - } - return 0; -} - -static bool rseq_warn_flags(const char *str, u32 flags) -{ - u32 test_flags; - - if (!flags) - return false; - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); - return true; -} - -static int rseq_check_flags(struct task_struct *t, u32 cs_flags) -{ - u32 flags; - int ret; - - if (rseq_warn_flags("rseq_cs", cs_flags)) - return -EINVAL; - - /* Get thread flags. */ - ret = get_user(flags, &t->rseq.usrptr->flags); - if (ret) - return ret; - - if (rseq_warn_flags("rseq", flags)) - return -EINVAL; - return 0; -} - -static int clear_rseq_cs(struct rseq __user *rseq) -{ - /* - * The rseq_cs field is set to NULL on preemption or signal - * delivery on top of rseq assembly block, as well as on top - * of code outside of the rseq assembly block. This performs - * a lazy clear of the rseq_cs field. - * - * Set rseq_cs to NULL. - */ -#ifdef CONFIG_64BIT - return put_user(0UL, &rseq->rseq_cs); -#else - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) - return -EFAULT; - return 0; -#endif -} - -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. - */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) -{ - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; -} - -static int rseq_ip_fixup(struct pt_regs *regs, bool abort) -{ - unsigned long ip = instruction_pointer(regs); - struct task_struct *t = current; - struct rseq_cs rseq_cs; - int ret; - - rseq_stat_inc(rseq_stats.cs); - - ret = rseq_get_rseq_cs(t, &rseq_cs); - if (ret) - return ret; - - /* - * Handle potentially not being within a critical section. - * If not nested over a rseq critical section, restart is useless. - * Clear the rseq_cs pointer and return. 
- */ - if (!in_rseq_cs(ip, &rseq_cs)) { - rseq_stat_inc(rseq_stats.clear); - return clear_rseq_cs(t->rseq.usrptr); - } - ret = rseq_check_flags(t, rseq_cs.flags); - if (ret < 0) - return ret; - if (!abort) - return 0; - ret = clear_rseq_cs(t->rseq.usrptr); - if (ret) - return ret; - rseq_stat_inc(rseq_stats.fixup); - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, - rseq_cs.abort_ip); - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); - return 0; + scoped_user_read_access(urseq, efault) + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); + if (likely(!csaddr)) + return true; + return rseq_update_user_cs(t, regs, csaddr); +efault: + return false; } /* @@ -567,8 +410,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; - int ret, sig; bool event; + int sig; /* * If invoked from hypervisors before entering the guest via @@ -618,8 +461,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) return; - ret = rseq_ip_fixup(regs, event); - if (unlikely(ret < 0)) + if (!rseq_handle_cs(t, regs)) goto error; if (unlikely(rseq_update_cpu_node_id(t))) @@ -632,6 +474,68 @@ error: } #ifdef CONFIG_DEBUG_RSEQ +/* + * Unsigned comparison will be true when ip >= start_ip, and when + * ip < start_ip + post_commit_offset. + */ +static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +{ + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; +} + +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ +static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +{ + struct rseq __user *urseq = t->rseq.usrptr; + struct rseq_cs __user *urseq_cs; + u32 __user *usig; + u64 ptr; + u32 sig; + int ret; + + if (get_user(ptr, &rseq->rseq_cs)) + return -EFAULT; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ + if (!ptr) { + memset(rseq_cs, 0, sizeof(*rseq_cs)); + return 0; + } + /* Check that the pointer value fits in the user-space process space. */ + if (ptr >= TASK_SIZE) + return -EINVAL; + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) + return -EFAULT; + + if (rseq_cs->start_ip >= TASK_SIZE || + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || + rseq_cs->abort_ip >= TASK_SIZE || + rseq_cs->version > 0) + return -EINVAL; + /* Check for overflow. */ + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) + return -EINVAL; + /* Ensure that abort_ip is not in the critical section. */ + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) + return -EINVAL; + + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); + ret = get_user(sig, usig); + if (ret) + return ret; + + if (current->rseq.sig != sig) { + printk_ratelimited(KERN_WARNING + "Possible attack attempt. 
Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", + sig, current->rseq.sig, current->pid, usig); + return -EINVAL; + } + return 0; +} /* * Terminate the process if a syscall is issued within a restartable From f7ee1964ac397bee5c6d1c017557c0eec8856145 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:00 +0100 Subject: [PATCH 20/54] rseq: Replace the original debug implementation Just utilize the new infrastructure and put the original one to rest. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.212510692@linutronix.de --- kernel/rseq.c | 81 ++++++++------------------------------------------- 1 file changed, 12 insertions(+), 69 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 12a9b6ab2cef..abd6bfadbcc6 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -474,85 +474,28 @@ error: } #ifdef CONFIG_DEBUG_RSEQ -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. - */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) -{ - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; -} - -/* - * If the rseq_cs field of 'struct rseq' contains a valid pointer to - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. - */ -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) -{ - struct rseq __user *urseq = t->rseq.usrptr; - struct rseq_cs __user *urseq_cs; - u32 __user *usig; - u64 ptr; - u32 sig; - int ret; - - if (get_user(ptr, &rseq->rseq_cs)) - return -EFAULT; - - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; - } - /* Check that the pointer value fits in the user-space process space. */ - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) - return -EFAULT; - - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; - - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; - - if (current->rseq.sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq.sig, current->pid, usig); - return -EINVAL; - } - return 0; -} - /* * Terminate the process if a syscall is issued within a restartable * sequence. 
*/ void rseq_syscall(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; - struct rseq_cs rseq_cs; + u64 csaddr; - if (!t->rseq.usrptr) + if (!t->rseq.event.has_rseq) return; - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV); + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) + goto fail; + if (likely(!csaddr)) + return; + if (unlikely(csaddr >= TASK_SIZE)) + goto fail; + if (rseq_debug_update_user_cs(t, regs, csaddr)) + return; +fail: + force_sig(SIGSEGV); } - #endif /* From c1cbad8f99b5c73c6af6e96acbfa64eaaaeb085f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:02 +0100 Subject: [PATCH 21/54] rseq: Make exit debugging static branch based Disconnect it from the config switch and use the static debug branch. This is a temporary measure for validating the rework. At the end this check needs to be hidden behind lockdep as it has nothing to do with the other debug infrastructure, which mainly aids user space debugging by enabling a zoo of checks which terminate misbehaving tasks instead of letting them keep the hard to diagnose pieces. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.272660745@linutronix.de --- include/linux/rseq_entry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f9510ce72211..5bdcf5b5f595 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -285,7 +285,7 @@ static __always_inline void rseq_exit_to_user_mode(void) rseq_stat_inc(rseq_stats.exit); - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + if (static_branch_unlikely(&rseq_debug_enabled)) WARN_ON_ONCE(ev->sched_switch); /* From eaa9088d568c84afd72fa32dbe01833aef861d0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:05 +0100 Subject: [PATCH 22/54] rseq: Use static branch for syscall exit debug when GENERIC_IRQ_ENTRY=y Make the syscall exit debug mechanism available via the static branch on architectures which utilize the generic entry code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.333440475@linutronix.de --- include/linux/entry-common.h | 2 +- include/linux/rseq_entry.h | 9 +++++++++ kernel/rseq.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 75b194c34e18..d967184ae08f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -146,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) local_irq_enable(); } - rseq_syscall(regs); + rseq_debug_syscall_return(regs); /* * Do one-time syscall specific work. 
If these work items are diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 5bdcf5b5f595..fb53a6ff05d7 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -296,9 +296,18 @@ static __always_inline void rseq_exit_to_user_mode(void) ev->events = 0; } +void __rseq_debug_syscall_return(struct pt_regs *regs); + +static inline void rseq_debug_syscall_return(struct pt_regs *regs) +{ + if (static_branch_unlikely(&rseq_debug_enabled)) + __rseq_debug_syscall_return(regs); +} + #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } static inline void rseq_exit_to_user_mode(void) { } +static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/kernel/rseq.c b/kernel/rseq.c index abd6bfadbcc6..97631554ae96 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -473,12 +473,11 @@ error: force_sigsegv(sig); } -#ifdef CONFIG_DEBUG_RSEQ /* * Terminate the process if a syscall is issued within a restartable * sequence. */ -void rseq_syscall(struct pt_regs *regs) +void __rseq_debug_syscall_return(struct pt_regs *regs) { struct task_struct *t = current; u64 csaddr; @@ -496,6 +495,13 @@ void rseq_syscall(struct pt_regs *regs) fail: force_sig(SIGSEGV); } + +#ifdef CONFIG_DEBUG_RSEQ +/* Kept around to keep GENERIC_ENTRY=n architectures supported. */ +void rseq_syscall(struct pt_regs *regs) +{ + __rseq_debug_syscall_return(regs); +} #endif /* From 0f085b41880e3140efa6941ff2b8fd43bac4d659 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:08 +0100 Subject: [PATCH 23/54] rseq: Provide and use rseq_set_ids() Provide a new and straight forward implementation to set the IDs (CPU ID, Node ID and MM CID), which can be later inlined into the fast path. It does all operations in one scoped_user_rw_access() section and retrieves also the critical section member (rseq::cs_rseq) from user space to avoid another user..begin/end() pair. This is in preparation for optimizing the fast path to avoid extra work when not required. On rseq registration set the CPU ID fields to RSEQ_CPU_ID_UNINITIALIZED and node and MM CID to zero. That's the same as the kernel internal reset values. That makes the debug validation in the exit code work correctly on the first exit to user space. 
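For illustration, a caller on the exit path is expected to use the new
helper roughly as in the sketch below (simplified; the ids/node_id setup
and the error handling of the real caller are omitted, and this control
flow is an assumption for the example, not the final code of this series):

	u64 csaddr;

	/* Update CPU ID, node ID and MM CID and fetch rseq_cs in one go */
	if (!rseq_set_ids_get_csaddr(t, &ids, cpu_to_node(ids.cpu_id), &csaddr))
		goto fail;
	/* Only touch the critical section machinery if one is armed */
	if (unlikely(csaddr) && !rseq_update_user_cs(t, regs, csaddr))
		goto fail;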
Use it to replace the whole related zoo in rseq.c Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.393972266@linutronix.de --- fs/binfmt_elf.c | 2 +- include/linux/rseq.h | 16 ++- include/linux/rseq_entry.h | 89 ++++++++++++++ include/linux/sched.h | 10 -- kernel/rseq.c | 236 ++++++++----------------------------- 5 files changed, 151 insertions(+), 202 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e4653bb99946..3eb734c192e9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7f347c3a4af8..92f9cd49489b 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -5,6 +5,8 @@ #ifdef CONFIG_RSEQ #include +#include + void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) @@ -48,7 +50,7 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { memset(&t->rseq, 0, sizeof(t->rseq)); - t->rseq.ids.cpu_cid = ~0ULL; + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } static inline void rseq_execve(struct task_struct *t) @@ -59,15 +61,19 @@ static inline void rseq_execve(struct task_struct *t) /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. + * + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault + * on the COW page on exit to user space, when the child stays on the same + * CPU as the parent. That's obviously not guaranteed, but in overcommit + * scenarios it is more likely and optimizes for the fork/exec case without + * taking the fault. */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) { + if (clone_flags & CLONE_VM) rseq_reset(t); - } else { + else t->rseq = current->rseq; - t->rseq.ids.cpu_cid = ~0ULL; - } } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index fb53a6ff05d7..37444e80fd45 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -75,6 +75,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); #endif bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); +bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -194,6 +195,43 @@ efault: return false; } +/* + * On debug kernels validate that user space did not mess with it if the + * debug branch is enabled. + */ +bool rseq_debug_validate_ids(struct task_struct *t) +{ + struct rseq __user *rseq = t->rseq.usrptr; + u32 cpu_id, uval, node_id; + + /* + * On the first exit after registering the rseq region CPU ID is + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! + */ + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? 
+ cpu_to_node(t->rseq.ids.cpu_id) : 0; + + scoped_user_read_access(rseq, efault) { + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -279,6 +317,57 @@ efault: return false; } +/* + * Updates CPU ID, Node ID and MM CID and reads the critical section + * address, when @csaddr != NULL. This allows to put the ID update and the + * read under the same uaccess region to spare a separate begin/end. + * + * As this is either invoked from a C wrapper with @csaddr = NULL or from + * the fast path code with a valid pointer, a clever compiler should be + * able to optimize the read out. Spares a duplicate implementation. + * + * Returns true, if the operation was successful, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when invalid data + * was found on debug kernels. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ +static rseq_inline +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, + u32 node_id, u64 *csaddr) +{ + struct rseq __user *rseq = t->rseq.usrptr; + + if (static_branch_unlikely(&rseq_debug_enabled)) { + if (!rseq_debug_validate_ids(t)) + return false; + } + + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); + unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); + if (csaddr) + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); + } + + /* Cache the new values */ + t->rseq.ids.cpu_cid = ids->cpu_cid; + rseq_stat_inc(rseq_stats.ids); + rseq_trace_update(t, ids); + return true; +efault: + return false; +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; diff --git a/include/linux/sched.h b/include/linux/sched.h index 24a9da7ca3e7..e47abc8685d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -1408,15 +1407,6 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; -#ifdef CONFIG_DEBUG_RSEQ - /* - * This is a place holder to save a copy of the rseq fields for - * validation of read-only fields. The struct rseq has a - * variable-length array at the end, so it cannot be used - * directly. Reserve a size large enough for the known fields. - */ - char rseq_fields[sizeof(struct rseq)]; -#endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 97631554ae96..1e4f1d2cdfe5 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -88,13 +88,6 @@ # define RSEQ_EVENT_GUARD preempt #endif -/* The original rseq structure size (including padding) is 32 bytes. 
*/ -#define ORIG_RSEQ_SIZE 32 - -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) - DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); static inline void rseq_control_debug(bool on) @@ -227,159 +220,9 @@ static int __init rseq_debugfs_init(void) __initcall(rseq_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_DEBUG_RSEQ -static struct rseq *rseq_kernel_fields(struct task_struct *t) +static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { - return (struct rseq *) t->rseq_fields; -} - -static int rseq_validate_ro_fields(struct task_struct *t) -{ - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq.usrptr; - - /* - * Validate fields which are required to be read-only by - * user-space. - */ - if (!user_read_access_begin(rseq, t->rseq.len)) - goto efault; - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_get_user(node_id, &rseq->node_id, efault_end); - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); - user_read_access_end(); - - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || - cpu_id != rseq_kernel_fields(t)->cpu_id || - node_id != rseq_kernel_fields(t)->node_id || - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { - - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" - "\tcpu_id_start: %u ?= %u\n" - "\tcpu_id: %u ?= %u\n" - "\tnode_id: %u ?= %u\n" - "\tmm_cid: %u ?= %u\n", - t->pid, t->comm, - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, - cpu_id, rseq_kernel_fields(t)->cpu_id, - node_id, rseq_kernel_fields(t)->node_id, - mm_cid, rseq_kernel_fields(t)->mm_cid); - } - - /* For now, only print a console warning on mismatch. */ - return 0; - -efault_end: - user_read_access_end(); -efault: - return -EFAULT; -} - -/* - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent - * state. - */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ - } while (0) - -#else -static int rseq_validate_ro_fields(struct task_struct *t) -{ - return 0; -} - -#define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq.usrptr->field, error_label) -#endif - -static int rseq_update_cpu_node_id(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id = raw_smp_processor_id(); - u32 node_id = cpu_to_node(cpu_id); - u32 mm_cid = task_mm_cid(t); - - rseq_stat_inc(rseq_stats.ids); - - /* Validate read-only rseq fields on debug kernels */ - if (rseq_validate_ro_fields(t)) - goto efault; - WARN_ON_ONCE((int) mm_cid < 0); - - if (!user_write_access_begin(rseq, t->rseq.len)) - goto efault; - - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* Cache the user space values */ - t->rseq.ids.cpu_id = cpu_id; - t->rseq.ids.mm_cid = mm_cid; - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally updated only if - * t->rseq_len != ORIG_RSEQ_SIZE. 
- */ - user_write_access_end(); - trace_rseq_update(t); - return 0; - -efault_end: - user_write_access_end(); -efault: - return -EFAULT; -} - -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, - mm_cid = 0; - - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; - - if (!user_write_access_begin(rseq, t->rseq.len)) - goto efault; - - /* - * Reset all fields to their initial state. - * - * All fields have an initial state of 0 except cpu_id which is set to - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after - * unregistration can figure out that rseq needs to be registered - * again. - */ - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally reset only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - return 0; - -efault_end: - user_write_access_end(); -efault: - return -EFAULT; + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); } static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) @@ -410,6 +253,8 @@ efault: void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; + struct rseq_ids ids; + u32 node_id; bool event; int sig; @@ -456,6 +301,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) scoped_guard(RSEQ_EVENT_GUARD) { event = t->rseq.event.sched_switch; t->rseq.event.sched_switch = false; + ids.cpu_id = task_cpu(t); + ids.mm_cid = task_mm_cid(t); } if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) @@ -464,7 +311,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (!rseq_handle_cs(t, regs)) goto error; - if (unlikely(rseq_update_cpu_node_id(t))) + node_id = cpu_to_node(ids.cpu_id); + if (!rseq_set_ids(t, &ids, node_id)) goto error; return; @@ -504,13 +352,33 @@ void rseq_syscall(struct pt_regs *regs) } #endif +static bool rseq_reset_ids(void) +{ + struct rseq_ids ids = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, + .mm_cid = 0, + }; + + /* + * If this fails, terminate it because this leaves the kernel in + * stupid state as exit to user space will try to fixup the ids + * again. + */ + if (rseq_set_ids(current, &ids, 0)) + return true; + + force_sig(SIGSEGV); + return false; +} + +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + /* * sys_rseq - setup restartable sequences for caller thread. 
*/ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { - int ret; - if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; @@ -521,9 +389,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return -EINVAL; if (current->rseq.sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_node_id(current); - if (ret) - return ret; + if (!rseq_reset_ids()) + return -EFAULT; rseq_reset(current); return 0; } @@ -563,27 +430,22 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; - /* - * If the rseq_cs pointer is non-NULL on registration, clear it to - * avoid a potential segfault on return to user-space. The proper thing - * to do would have been to fail the registration but this would break - * older libcs that reuse the rseq area for new threads without - * clearing the fields. Don't bother reading it, just reset it. - */ - if (put_user(0UL, &rseq->rseq_cs)) - return -EFAULT; + scoped_user_write_access(rseq, efault) { + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. Don't bother reading it, just reset it. + */ + unsafe_put_user(0UL, &rseq->rseq_cs, efault); + /* Initialize IDs in user space */ + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0U, &rseq->node_id, efault); + unsafe_put_user(0U, &rseq->mm_cid, efault); + } -#ifdef CONFIG_DEBUG_RSEQ - /* - * Initialize the in-kernel rseq fields copy for validation of - * read-only fields. - */ - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) - return -EFAULT; -#endif /* * Activate the registration by setting the rseq area address, length * and signature in the task struct. @@ -599,6 +461,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 */ current->rseq.event.has_rseq = true; rseq_sched_switch_event(current); - return 0; + +efault: + return -EFAULT; } From 9f6ffd4cebda86841700775de3213f22bb0ea22d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:10 +0100 Subject: [PATCH 24/54] rseq: Separate the signal delivery path Completely separate the signal delivery path from the notify handler as they have different semantics versus the event handling. The signal delivery only needs to ensure that the interrupted user context was not in a critical section or the section is aborted before it switches to the signal frame context. The signal frame context does not have the original instruction pointer anymore, so that can't be handled on exit to user space. No point in updating the CPU/CID ids as they might change again before the task returns to user space for real. The fast path optimization, which checks for the 'entry from user via interrupt' condition is only available for architectures which use the generic entry code. 
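As a hedged illustration of the semantics described above (not the actual
kernel code; the struct and function names are made up for the example),
the signal delivery fixup boils down to picking the right instruction
pointer for the interrupted context before the signal frame is built:

	struct example_cs {
		unsigned long	start_ip;
		unsigned long	post_commit_offset;
		unsigned long	abort_ip;
	};

	/* Returns the IP to record for the interrupted user context */
	static unsigned long example_signal_fixup_ip(unsigned long ip,
						     const struct example_cs *cs)
	{
		/* Inside [start_ip, start_ip + post_commit_offset) ? */
		if (cs && ip - cs->start_ip < cs->post_commit_offset)
			return cs->abort_ip;	/* abort the critical section */
		return ip;			/* nothing to do */
	}

Everything else, i.e. the CPU/Node/MM CID update, can safely wait until the
task actually returns to user space.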
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.455429038@linutronix.de --- include/linux/rseq.h | 21 ++++++++++++++++----- kernel/rseq.c | 30 ++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 92f9cd49489b..f5a43188023f 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,22 +7,33 @@ #include -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); +void __rseq_handle_notify_resume(struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { if (current->rseq.event.has_rseq) - __rseq_handle_notify_resume(NULL, regs); + __rseq_handle_notify_resume(regs); } +void __rseq_signal_deliver(int sig, struct pt_regs *regs); + +/* + * Invoked from signal delivery to fixup based on the register context before + * switching to the signal delivery context. + */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (current->rseq.event.has_rseq) { - current->rseq.event.sched_switch = true; - __rseq_handle_notify_resume(ksig, regs); + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + __rseq_signal_deliver(ksig->sig, regs); + } else { + if (current->rseq.event.has_rseq) + __rseq_signal_deliver(ksig->sig, regs); } } +/* Raised from context switch and exevce to force evaluation on exit to user */ static inline void rseq_sched_switch_event(struct task_struct *t) { if (t->rseq.event.has_rseq) { diff --git a/kernel/rseq.c b/kernel/rseq.c index 1e4f1d2cdfe5..13faadc737ad 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -250,13 +250,12 @@ efault: * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ -void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) +void __rseq_handle_notify_resume(struct pt_regs *regs) { struct task_struct *t = current; struct rseq_ids ids; u32 node_id; bool event; - int sig; /* * If invoked from hypervisors before entering the guest via @@ -275,10 +274,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - if (ksig) - rseq_stat_inc(rseq_stats.signal); - else - rseq_stat_inc(rseq_stats.slowpath); + rseq_stat_inc(rseq_stats.slowpath); /* * Read and clear the event pending bit first. If the task @@ -317,8 +313,26 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) return; error: - sig = ksig ? ksig->sig : 0; - force_sigsegv(sig); + force_sig(SIGSEGV); +} + +void __rseq_signal_deliver(int sig, struct pt_regs *regs) +{ + rseq_stat_inc(rseq_stats.signal); + /* + * Don't update IDs, they are handled on exit to user if + * necessary. The important thing is to abort a critical section of + * the interrupted context as after this point the instruction + * pointer in @regs points to the signal handler. + */ + if (unlikely(!rseq_handle_cs(current, regs))) { + /* + * Clear the errors just in case this might survive + * magically, but leave the rest intact. 
+ */ + current->rseq.event.error = 0; + force_sigsegv(sig); + } } /* From e2d4f42271155045a49b89530f2c06ad8e9f1a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:12 +0100 Subject: [PATCH 25/54] rseq: Rework the TIF_NOTIFY handler Replace the whole logic with a new implementation, which is shared with signal delivery and the upcoming exit fast path. Contrary to the original implementation, this ignores invocations from KVM/IO-uring, which invoke resume_user_mode_work() with the @regs argument set to NULL. The original implementation updated the CPU/Node/MM CID fields, but that was just a side effect, which was addressing the problem that this invocation cleared TIF_NOTIFY_RESUME, which in turn could cause an update on return to user space to be lost. This problem has been addressed differently, so that it's not longer required to do that update before entering the guest. That might be considered a user visible change, when the hosts thread TLS memory is mapped into the guest, but as this was never intentionally supported, this abuse of kernel internal implementation details is not considered an ABI break. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.517640811@linutronix.de --- include/linux/rseq_entry.h | 29 +++++++++++++++ kernel/rseq.c | 76 +++++++++++++++++--------------------- 2 files changed, 62 insertions(+), 43 deletions(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 37444e80fd45..aa1c0464a16c 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -368,6 +368,35 @@ efault: return false; } +/* + * Update user space with new IDs and conditionally check whether the task + * is in a critical section. + */ +static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, + struct rseq_ids *ids, u32 node_id) +{ + u64 csaddr; + + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + return false; + + /* + * On architectures which utilize the generic entry code this + * allows to skip the critical section when the entry was not from + * a user space interrupt, unless debug mode is enabled. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + if (!static_branch_unlikely(&rseq_debug_enabled)) { + if (likely(!t->rseq.event.user_irq)) + return true; + } + } + if (likely(!csaddr)) + return true; + /* Sigh, this really needs to do work */ + return rseq_update_user_cs(t, regs, csaddr); +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; diff --git a/kernel/rseq.c b/kernel/rseq.c index 13faadc737ad..148fb2103023 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -82,12 +82,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef CONFIG_MEMBARRIER -# define RSEQ_EVENT_GUARD irq -#else -# define RSEQ_EVENT_GUARD preempt -#endif - DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); static inline void rseq_control_debug(bool on) @@ -239,38 +233,15 @@ efault: return false; } -/* - * This resume handler must always be executed between any of: - * - preemption, - * - signal delivery, - * and return to user-space. - * - * This is how we can ensure that the entire rseq critical section - * will issue the commit instruction only if executed atomically with - * respect to other threads scheduled on the same CPU, and with respect - * to signal handlers. 
- */ -void __rseq_handle_notify_resume(struct pt_regs *regs) +static void rseq_slowpath_update_usr(struct pt_regs *regs) { + /* Preserve rseq state and user_irq state for exit to user */ + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; struct rseq_ids ids; u32 node_id; bool event; - /* - * If invoked from hypervisors before entering the guest via - * resume_user_mode_work(), then @regs is a NULL pointer. - * - * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises - * it before returning from the ioctl() to user space when - * rseq_event.sched_switch is set. - * - * So it's safe to ignore here instead of pointlessly updating it - * in the vcpu_run() loop. - */ - if (!regs) - return; - if (unlikely(t->flags & PF_EXITING)) return; @@ -294,26 +265,45 @@ void __rseq_handle_notify_resume(struct pt_regs *regs) * with the result handed in to allow the detection of * inconsistencies. */ - scoped_guard(RSEQ_EVENT_GUARD) { + scoped_guard(irq) { event = t->rseq.event.sched_switch; - t->rseq.event.sched_switch = false; + t->rseq.event.all &= evt_mask.all; ids.cpu_id = task_cpu(t); ids.mm_cid = task_mm_cid(t); } - if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) + if (!event) return; - if (!rseq_handle_cs(t, regs)) - goto error; - node_id = cpu_to_node(ids.cpu_id); - if (!rseq_set_ids(t, &ids, node_id)) - goto error; - return; -error: - force_sig(SIGSEGV); + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + /* + * Clear the errors just in case this might survive magically, but + * leave the rest intact. + */ + t->rseq.event.error = 0; + force_sig(SIGSEGV); + } +} + +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) + return; + + rseq_slowpath_update_usr(regs); } void __rseq_signal_deliver(int sig, struct pt_regs *regs) From 39a167560a61f913560ba803a96dbe6c15239f5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:14 +0100 Subject: [PATCH 26/54] rseq: Optimize event setting After removing the various condition bits earlier it turns out that one extra information is needed to avoid setting event::sched_switch and TIF_NOTIFY_RESUME unconditionally on every context switch. The update of the RSEQ user space memory is only required, when either the task was interrupted in user space and schedules or the CPU or MM CID changes in schedule() independent of the entry mode Right now only the interrupt from user information is available. Add an event flag, which is set when the CPU or MM CID or both change. Evaluate this event in the scheduler to decide whether the sched_switch event and the TIF bit need to be set. It's an extra conditional in context_switch(), but the downside of unconditionally handling RSEQ after a context switch to user is way more significant. The utilized boolean logic minimizes this to a single conditional branch. 
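As a hedged sketch of that claim (the field names follow the rseq_event
layout used in this series, the helper name is made up for illustration):

	static inline bool example_needs_notify(const struct rseq_event *ev)
	{
		/*
		 * Bitwise '|' and '&' on the u8 flags are evaluated without
		 * intermediate branches; only the caller tests the result.
		 */
		return (ev->user_irq | ev->ids_changed) & ev->has_rseq;
	}

Using '||' and '&&' instead would permit the compiler to emit a separate
conditional branch per operand due to short circuit evaluation.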
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.578058898@linutronix.de --- fs/exec.c | 2 +- include/linux/rseq.h | 81 ++++++++++++++++++++++++++++++++++---- include/linux/rseq_types.h | 11 +++++- kernel/rseq.c | 2 +- kernel/sched/core.c | 7 +++- kernel/sched/sched.h | 5 ++- 6 files changed, 95 insertions(+), 13 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index e45b29890269..90e47eb156ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1775,7 +1775,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - rseq_sched_switch_event(current); + rseq_force_update(); current->in_execve = 0; return retval; diff --git a/include/linux/rseq.h b/include/linux/rseq.h index f5a43188023f..abfbeb42d1a2 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -11,7 +11,8 @@ void __rseq_handle_notify_resume(struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { - if (current->rseq.event.has_rseq) + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) __rseq_handle_notify_resume(regs); } @@ -33,12 +34,75 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg } } -/* Raised from context switch and exevce to force evaluation on exit to user */ -static inline void rseq_sched_switch_event(struct task_struct *t) +static inline void rseq_raise_notify_resume(struct task_struct *t) { - if (t->rseq.event.has_rseq) { - t->rseq.event.sched_switch = true; - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +/* Invoked from context switch to force evaluation on exit to user */ +static __always_inline void rseq_sched_switch_event(struct task_struct *t) +{ + struct rseq_event *ev = &t->rseq.event; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Avoid a boat load of conditionals by using simple logic + * to determine whether NOTIFY_RESUME needs to be raised. + * + * It's required when the CPU or MM CID has changed or + * the entry was from user space. + */ + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + + if (raise) { + ev->sched_switch = true; + rseq_raise_notify_resume(t); + } + } else { + if (ev->has_rseq) { + t->rseq.event.sched_switch = true; + rseq_raise_notify_resume(t); + } + } +} + +/* + * Invoked from __set_task_cpu() when a task migrates to enforce an IDs + * update. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) +{ + t->rseq.event.ids_changed = true; +} + +/* + * Invoked from switch_mm_cid() in context switch when the task gets a MM + * CID assigned. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) +{ + /* + * Requires a comparison as the switch_mm_cid() code does not + * provide a conditional for it readily. So avoid excessive updates + * when nothing changes. 
+ */ + if (t->rseq.ids.mm_cid != cid) + t->rseq.event.ids_changed = true; +} + +/* Enforce a full update after RSEQ registration and when execve() failed */ +static inline void rseq_force_update(void) +{ + if (current->rseq.event.has_rseq) { + current->rseq.event.ids_changed = true; + current->rseq.event.sched_switch = true; + rseq_raise_notify_resume(current); } } @@ -55,7 +119,7 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static inline void rseq_virt_userspace_exit(void) { if (current->rseq.event.sched_switch) - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + rseq_raise_notify_resume(current); } static inline void rseq_reset(struct task_struct *t) @@ -91,6 +155,9 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } +static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } +static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 7c123947bb98..a1389fff4fca 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -11,20 +11,27 @@ struct rseq; * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently * @events: Compound to access events with a single load/store - * @sched_switch: True if the task was scheduled out + * @sched_switch: True if the task was scheduled and needs update on + * exit to user + * @ids_changed: Indicator that IDs need to be updated * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid + * + * @sched_switch and @ids_changed must be adjacent and the combo must be + * 16bit aligned to allow a single store, when both are set at the same + * time in the scheduler. */ struct rseq_event { union { u64 all; struct { union { - u16 events; + u32 events; struct { u8 sched_switch; + u8 ids_changed; u8 user_irq; }; }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 148fb2103023..183dde756808 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -464,7 +464,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * are updated before returning to user-space. */ current->rseq.event.has_rseq = true; - rseq_sched_switch_event(current); + rseq_force_update(); return 0; efault: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b75e8e1eca4a..579a8e93578f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5118,7 +5118,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_sched_switch_event(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5316,6 +5315,12 @@ context_switch(struct rq *rq, struct task_struct *prev, /* switch_mm_cid() requires the memory barriers above. 
*/ switch_mm_cid(rq, prev, next); + /* + * Tell rseq that the task was scheduled in. Must be after + * switch_mm_cid() to get the TIF flag set. + */ + rseq_sched_switch_event(next); + prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d7..4838dda75b10 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2209,6 +2209,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; + rseq_sched_set_task_cpu(p, cpu); #endif /* CONFIG_SMP */ } @@ -3807,8 +3808,10 @@ static inline void switch_mm_cid(struct rq *rq, mm_cid_put_lazy(prev); prev->mm_cid = -1; } - if (next->mm_cid_active) + if (next->mm_cid_active) { next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); + rseq_sched_set_task_mm_cid(next, next->mm_cid); + } } #else /* !CONFIG_SCHED_MM_CID: */ From 05b44aef709cae5e4274590f050cf35049dcc24e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:17 +0100 Subject: [PATCH 27/54] rseq: Implement fast path for exit to user Implement the actual logic for handling RSEQ updates in a fast path after handling the TIF work and at the point where the task is actually returning to user space. This is the right point to do that because at this point the CPU and the MM CID are stable and cannot longer change due to yet another reschedule. That happens when the task is handling it via TIF_NOTIFY_RESUME in resume_user_mode_work(), which is invoked from the exit to user mode work loop. The function is invoked after the TIF work is handled and runs with interrupts disabled, which means it cannot resolve page faults. It therefore disables page faults and in case the access to the user space memory faults, it: - notes the fail in the event struct - raises TIF_NOTIFY_RESUME - returns false to the caller The caller has to go back to the TIF work, which runs with interrupts enabled and therefore can resolve the page faults. This happens mostly on fork() when the memory is marked COW. If the user memory inspection finds invalid data, the function returns false as well and sets the fatal flag in the event struct along with TIF_NOTIFY_RESUME. The slow path notify handler has to evaluate that flag and terminate the task with SIGSEGV as documented. The initial decision to invoke any of this is based on one flags in the event struct: @sched_switch. The decision is in pseudo ASM: load tsk::event::sched_switch jnz inspect_user_space mov $0, tsk::event::events ... leave So for the common case where the task was not scheduled out, this really boils down to three instructions before going out if the compiler is not completely stupid (and yes, some of them are). If the condition is true, then it checks, whether CPU ID or MM CID have changed. If so, then the CPU/MM IDs have to be updated and are thereby cached for the next round. The update unconditionally retrieves the user space critical section address to spare another user*begin/end() pair. If that's not zero and tsk::event::user_irq is set, then the critical section is analyzed and acted upon. If either zero or the entry came via syscall the critical section analysis is skipped. If the comparison is false then the critical section has to be analyzed because the event flag is then only true when entry from user was by interrupt. 
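Rendered as hedged C (illustrative only; it uses the task::rseq event
fields from this series, but the helpers are made-up names, not the actual
implementation), the decision flow described above looks roughly like this:

	/* Task was not scheduled out: clear events and leave */
	if (!t->rseq.event.sched_switch) {
		t->rseq.event.events = 0;
		return;
	}

	if (t->rseq.event.ids_changed) {
		/* Update CPU/Node/MM CID and fetch rseq_cs in the same
		 * user access section (made-up helper name) */
		update_ids_get_csaddr(t, &csaddr);
		/* Skip the analysis when there is no critical section or
		 * the entry came via syscall */
		if (csaddr && t->rseq.event.user_irq)
			analyze_critical_section(t, regs, csaddr);
	} else {
		/* IDs unchanged: sched_switch can only be set because the
		 * entry came from user space via interrupt, so the critical
		 * section has to be looked at */
		get_csaddr(t, &csaddr);
		if (csaddr)
			analyze_critical_section(t, regs, csaddr);
	}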
This is provided without the actual hookup to let reviewers focus on the implementation details. The hookup happens in the next step. Note: As with quite some other optimizations this depends on the generic entry infrastructure and is not enabled to be sucked into random architecture implementations. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.638929615@linutronix.de --- include/linux/rseq_entry.h | 133 ++++++++++++++++++++++++++++++++++++- include/linux/rseq_types.h | 3 + kernel/rseq.c | 2 + 3 files changed, 135 insertions(+), 3 deletions(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index aa1c0464a16c..3f13be7301fa 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -10,6 +10,7 @@ struct rseq_stats { unsigned long exit; unsigned long signal; unsigned long slowpath; + unsigned long fastpath; unsigned long ids; unsigned long cs; unsigned long clear; @@ -245,12 +246,13 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c { struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; unsigned long ip = instruction_pointer(regs); + unsigned long tasksize = TASK_SIZE; u64 start_ip, abort_ip, offset; u32 usig, __user *uc_sig; rseq_stat_inc(rseq_stats.cs); - if (unlikely(csaddr >= TASK_SIZE)) { + if (unlikely(csaddr >= tasksize)) { t->rseq.event.fatal = true; return false; } @@ -287,7 +289,7 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP * protection. */ - if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) goto die; /* The address is guaranteed to be >= 0 and < TASK_SIZE */ @@ -397,6 +399,128 @@ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *r return rseq_update_user_cs(t, regs, csaddr); } +/* + * If you want to use this then convert your architecture to the generic + * entry code. I'm tired of building workarounds for people who can't be + * bothered to make the maintenance of generic infrastructure less + * burdensome. Just sucking everything into the architecture code and + * thereby making others chase the horrible hacks and keep them working is + * neither acceptable nor sustainable. + */ +#ifdef CONFIG_GENERIC_ENTRY + +/* + * This is inlined into the exit path because: + * + * 1) It's a one time comparison in the fast path when there is no event to + * handle + * + * 2) The access to the user space rseq memory (TLS) is unlikely to fault + * so the straight inline operation is: + * + * - Four 32-bit stores only if CPU ID/ MM CID need to be updated + * - One 64-bit load to retrieve the critical section address + * + * 3) In the unlikely case that the critical section address is != NULL: + * + * - One 64-bit load to retrieve the start IP + * - One 64-bit load to retrieve the offset for calculating the end + * - One 64-bit load to retrieve the abort IP + * - One 64-bit load to retrieve the signature + * - One store to clear the critical section address + * + * The non-debug case implements only the minimal required checking. It + * provides protection against a rogue abort IP in kernel space, which + * would be exploitable at least on x86, and also against a rogue CS + * descriptor by checking the signature at the abort IP. 
Any fallout from + * invalid critical section descriptors is a user space problem. The debug + * case provides the full set of checks and terminates the task if a + * condition is not met. + * + * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and + * tells the caller to loop back into exit_to_user_mode_loop(). The rseq + * slow path there will handle the failure. + */ +static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t) +{ + /* + * Page faults need to be disabled as this is called with + * interrupts disabled + */ + guard(pagefault)(); + if (likely(!t->rseq.event.ids_changed)) { + struct rseq __user *rseq = t->rseq.usrptr; + /* + * If IDs have not changed rseq_event::user_irq must be true + * See rseq_sched_switch_event(). + */ + u64 csaddr; + + if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs))) + return false; + + if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) { + if (unlikely(!rseq_update_user_cs(t, regs, csaddr))) + return false; + } + return true; + } + + struct rseq_ids ids = { + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + }; + u32 node_id = cpu_to_node(ids.cpu_id); + + return rseq_update_usr(t, regs, &ids, node_id); +} + +static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + struct task_struct *t = current; + + /* + * If the task did not go through schedule or got the flag enforced + * by the rseq syscall or execve, then nothing to do here. + * + * CPU ID and MM CID can only change when going through a context + * switch. + * + * rseq_sched_switch_event() sets the rseq_event::sched_switch bit + * only when rseq_event::has_rseq is true. That conditional is + * required to avoid setting the TIF bit if RSEQ is not registered + * for a task. rseq_event::sched_switch is cleared when RSEQ is + * unregistered by a task so it's sufficient to check for the + * sched_switch bit alone. + * + * A sane compiler requires three instructions for the nothing to do + * case including clearing the events, but your mileage might vary. 
+ */ + if (unlikely((t->rseq.event.sched_switch))) { + rseq_stat_inc(rseq_stats.fastpath); + + if (unlikely(!rseq_exit_user_update(regs, t))) + return true; + } + /* Clear state so next entry starts from a clean slate */ + t->rseq.event.events = 0; + return false; +} + +static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { + current->rseq.event.slowpath = true; + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + return true; + } + return false; +} + +#else /* CONFIG_GENERIC_ENTRY */ +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } +#endif /* !CONFIG_GENERIC_ENTRY */ + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; @@ -421,9 +545,12 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs) if (static_branch_unlikely(&rseq_debug_enabled)) __rseq_debug_syscall_return(regs); } - #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + return false; +} static inline void rseq_exit_to_user_mode(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a1389fff4fca..9c7a34154de8 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -18,6 +18,8 @@ struct rseq; * @has_rseq: True if the task has a rseq pointer installed * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid + * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME + * is required * * @sched_switch and @ids_changed must be adjacent and the combo must be * 16bit aligned to allow a single store, when both are set at the same @@ -42,6 +44,7 @@ struct rseq_event { u16 error; struct { u8 fatal; + u8 slowpath; }; }; }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 183dde756808..c5d6336c6956 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -133,6 +133,7 @@ static int rseq_stats_show(struct seq_file *m, void *p) stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); @@ -142,6 +143,7 @@ static int rseq_stats_show(struct seq_file *m, void *p) seq_printf(m, "exit: %16lu\n", stats.exit); seq_printf(m, "signal: %16lu\n", stats.signal); seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "fastp: %16lu\n", stats.fastpath); seq_printf(m, "ids: %16lu\n", stats.ids); seq_printf(m, "cs: %16lu\n", stats.cs); seq_printf(m, "clear: %16lu\n", stats.clear); From 3db6b38dfe640207da706b286d4181237391f5bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:19 +0100 Subject: [PATCH 28/54] rseq: Switch to fast path processing on exit to user Now that all bits and pieces are in place, hook the RSEQ handling fast path function into exit_to_user_mode_prepare() after the TIF work bits have been handled. If case of fast path failure, TIF_NOTIFY_RESUME has been raised and the caller needs to take another turn through the TIF handling slow path. 
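The hookup (see the kernel/entry/common.c hunk below) therefore takes the
shape of a retry loop; roughly sketched:

	for (;;) {
		ti_work = __exit_to_user_mode_loop(regs, ti_work);
		/* rseq fast path; raises TIF_NOTIFY_RESUME on failure */
		if (likely(!rseq_exit_to_user_mode_restart(regs)))
			break;
		/* Go around again so the slow path can resolve the fault */
		ti_work = read_thread_flags();
	}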
This only works for architectures which use the generic entry code.
Architectures which still have their own incomplete hacks are not supported
and won't be.

This results in the following improvements:

    Kernel build       Before            After          Reduction
    exit to user     80692981         80514451
    signal checks:      32581              121                 99%
    slowpath runs:    1201408  1.49%        198  0.00%        100%
    fastpath runs:                       675941  0.84%         N/A
    id updates:       1233989  1.53%      50541  0.06%         96%
    cs checks:        1125366  1.39%          0  0.00%        100%
    cs cleared:       1125366   100%          0               100%
    cs fixup:               0     0%          0

    RSEQ selftests     Before            After          Reduction
    exit to user:   386281778        387373750
    signal checks:   35661203                0               100%
    slowpath runs:  140542396 36.38%        100  0.00%        100%
    fastpath runs:                      9509789  2.51%         N/A
    id updates:     176203599 45.62%    9087994  2.35%         95%
    cs checks:      175587856 45.46%    4728394  1.22%         98%
    cs cleared:     172359544 98.16%    1319307 27.90%         99%
    cs fixup:         3228312  1.84%    3409087 72.10%

The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to
user invocations, they are relative to the actual 'cs check' invocations
(e.g. for the selftests after the change: 1319307 + 3409087 = 4728394 cs
checks).

While some of this could have been avoided in the original code, like the
obvious clearing of CS when it's already clear, the main problem of going
through TIF_NOTIFY_RESUME cannot be solved. In some workloads the RSEQ
notify handler is invoked more than once before going out to user space.
Doing this once when everything has stabilized is the only solution to
avoid this.

The initial attempt to completely decouple it from the TIF work turned out
to be suboptimal for workloads which do a lot of quick and short system
calls. Even if the fast path decision is only 4 instructions (including a
conditional branch), this adds up quickly and becomes measurable when the
rate for actually having to handle rseq is in the low single digit
percentage range of user/kernel transitions.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de --- include/linux/irq-entry-common.h | 7 ++----- include/linux/resume_user_mode.h | 2 +- include/linux/rseq.h | 18 ++++++++++++------ init/Kconfig | 2 +- kernel/entry/common.c | 26 +++++++++++++++++++------- kernel/rseq.c | 8 ++++++-- 6 files changed, 41 insertions(+), 22 deletions(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index cb31fb84d7b4..8f5ceeaaaea5 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -197,11 +197,8 @@ static __always_inline void arch_exit_to_user_mode(void) { } */ void arch_do_signal_or_restart(struct pt_regs *regs); -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - */ -unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work); +/* Handle pending TIF work */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index dd3bf7da90a8..bf92227c78d0 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(regs); + rseq_handle_slowpath(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index abfbeb42d1a2..ded4baa34586 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,13 +7,19 @@ #include -void __rseq_handle_notify_resume(struct pt_regs *regs); +void __rseq_handle_slowpath(struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct pt_regs *regs) +/* Invoked from resume_user_mode_work() */ +static inline void rseq_handle_slowpath(struct pt_regs *regs) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) - __rseq_handle_notify_resume(regs); + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (current->rseq.event.slowpath) + __rseq_handle_slowpath(regs); + } else { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + __rseq_handle_slowpath(regs); + } } void __rseq_signal_deliver(int sig, struct pt_regs *regs); @@ -152,7 +158,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) } #else /* CONFIG_RSEQ */ -static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } +static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } diff --git a/init/Kconfig b/init/Kconfig index bde40ab664e2..d1c606ec632e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1941,7 +1941,7 @@ config RSEQ_DEBUG_DEFAULT_ENABLE config DEBUG_RSEQ default n bool "Enable debugging of rseq() system call" if EXPERT - depends on RSEQ && DEBUG_KERNEL + depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY select RSEQ_DEBUG_DEFAULT_ENABLE help Enable extra debugging checks for the rseq system 
call. diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 70a16db4cc0a..523a3e758af4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,13 +11,8 @@ /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - * @regs: Pointer to pt_regs on entry stack - * @ti_work: TIF work flags as read by the caller - */ -__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work @@ -62,6 +57,23 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) +{ + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); + + if (likely(!rseq_exit_to_user_mode_restart(regs))) + return ti_work; + ti_work = read_thread_flags(); + } +} + noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { irqentry_state_t ret = { diff --git a/kernel/rseq.c b/kernel/rseq.c index c5d6336c6956..395d8b002350 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -237,7 +237,11 @@ efault: static void rseq_slowpath_update_usr(struct pt_regs *regs) { - /* Preserve rseq state and user_irq state for exit to user */ + /* + * Preserve rseq state and user_irq state. The generic entry code + * clears user_irq on the way out, the non-generic entry + * architectures are not having user_irq. + */ const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; struct rseq_ids ids; @@ -289,7 +293,7 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) } } -void __rseq_handle_notify_resume(struct pt_regs *regs) +void __rseq_handle_slowpath(struct pt_regs *regs) { /* * If invoked from hypervisors before entering the guest via From 70fe25a3bc53a891f0e6184c12bd55cc524cb13b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:21 +0100 Subject: [PATCH 29/54] entry: Split up exit_to_user_mode_prepare() exit_to_user_mode_prepare() is used for both interrupts and syscalls, but there is extra rseq work, which is only required for in the interrupt exit case. Split up the function and provide wrappers for syscalls and interrupts, which allows to separate the rseq exit work in the next step. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.782234789@linutronix.de --- arch/arm64/kernel/entry-common.c | 2 +- include/linux/entry-common.h | 2 +- include/linux/irq-entry-common.h | 49 ++++++++++++++++++++++++++++---- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index a9c81715ce59..0a97e2621f60 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -100,7 +100,7 @@ static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { local_irq_disable(); - exit_to_user_mode_prepare(regs); + exit_to_user_mode_prepare_legacy(regs); local_daif_mask(); mte_check_tfsr_exit(); exit_to_user_mode(); diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d967184ae08f..87efb38b7081 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -156,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) if (unlikely(work & SYSCALL_WORK_EXIT)) syscall_exit_work(regs, work); local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); + syscall_exit_to_user_mode_prepare(regs); } /** diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 8f5ceeaaaea5..5ea61722bb70 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -201,7 +201,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs); unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack * * 1) check that interrupts are disabled @@ -209,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work * 3) call exit_to_user_mode_loop() if any flags from * EXIT_TO_USER_MODE_WORK are set * 4) check that interrupts are still disabled + * + * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long ti_work; @@ -224,15 +226,52 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); +} - rseq_exit_to_user_mode(); - +static __always_inline void __exit_to_user_mode_validate(void) +{ /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } +/* Temporary workaround to keep ARM64 alive */ +static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. 
+ */ +static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. + */ +static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -297,7 +336,7 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) { instrumentation_begin(); - exit_to_user_mode_prepare(regs); + irqentry_exit_to_user_mode_prepare(regs); instrumentation_end(); exit_to_user_mode(); } From 7a5201ea1907534efe3a6e9c001ef4c0257cb3f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:24 +0100 Subject: [PATCH 30/54] rseq: Split up rseq_exit_to_user_mode() Separate the interrupt and syscall exit handling. Syscall exit does not require to clear the user_irq bit as it can't be set. On interrupt exit it can be set when the interrupt did not result in a scheduling event and therefore the return path did not invoke the TIF work handling, which would have cleared it. The debug check for the event state is also not really required even when debug mode is enabled via the static key. Debug mode is largely aiding user space by enabling a larger amount of validation checks, which cause a segfault when a malformed critical section is detected. In production mode the critical section handling takes the content mostly as is and lets user space keep the pieces when it screwed up. On kernel changes in that area the state check is useful, but that can be done when lockdep is enabled, which is anyway a required test scenario for fundamental changes. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.842785700@linutronix.de --- include/linux/irq-entry-common.h | 6 +++--- include/linux/rseq_entry.h | 36 ++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 5ea61722bb70..bc5d178e0b91 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -240,7 +240,7 @@ static __always_inline void __exit_to_user_mode_validate(void) static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_exit_to_user_mode_legacy(); __exit_to_user_mode_validate(); } @@ -254,7 +254,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_syscall_exit_to_user_mode(); __exit_to_user_mode_validate(); } @@ -268,7 +268,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_irqentry_exit_to_user_mode(); __exit_to_user_mode_validate(); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 3f13be7301fa..958a63eeb2d3 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -521,7 +521,37 @@ static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } #endif /* !CONFIG_GENERIC_ENTRY */ -static __always_inline void rseq_exit_to_user_mode(void) +static __always_inline void rseq_syscall_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + /* Needed to remove the store for the !lockdep case */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + WARN_ON_ONCE(ev->sched_switch); + ev->events = 0; + } +} + +static __always_inline void rseq_irqentry_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + lockdep_assert_once(!ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing could not clear it. 
+	 */
+	ev->events = 0;
+}
+
+/* Required to keep ARM64 working */
+static __always_inline void rseq_exit_to_user_mode_legacy(void)
 {
 	struct rseq_event *ev = &current->rseq.event;
@@ -551,7 +581,9 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs)
 {
 	return false;
 }
-static inline void rseq_exit_to_user_mode(void) { }
+static inline void rseq_syscall_exit_to_user_mode(void) { }
+static inline void rseq_irqentry_exit_to_user_mode(void) { }
+static inline void rseq_exit_to_user_mode_legacy(void) { }
 static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
 #endif /* !CONFIG_RSEQ */

From 32034df66b5f49626aa450ceaf1849a08d87906e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 27 Oct 2025 09:45:26 +0100
Subject: [PATCH 31/54] rseq: Switch to TIF_RSEQ if supported

TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
with the RSEQ fast path depending on it, but not really handling it.

Define a separate TIF_RSEQ in the generic TIF space and enable the full
separation of fast and slow path for architectures which utilize that.

That avoids the hassle with invocations of resume_user_mode_work() from
hypervisors, which clear TIF_NOTIFY_RESUME. It makes the re-evaluation that
is therefore required at the end of vcpu_run() a NOOP on architectures which
utilize the generic TIF space and have a separate TIF_RSEQ.

The hypervisor TIF handling does not include the separate TIF_RSEQ as there
is no point in doing so. The guest neither knows nor cares about the VMM
host application's RSEQ state. That state is only relevant when the ioctl()
returns to user space.

The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
handling, but this only happens within exit_to_user_mode_loop(), so arguably
the hypervisor ioctl() code is long done when this happens.
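
As a rough illustration of how a dedicated bit changes the shape of the exit
loop, consider the stand-alone model below. All names, bit values and helpers
are invented here; the real constants and functions are the ones touched in
the diff that follows:

  #include <stdbool.h>

  #define WORK_NOTIFY_RESUME    (1UL << 0)
  #define WORK_SIGPENDING       (1UL << 1)
  #define WORK_RSEQ             (1UL << 2)
  #define WORK_ALL              (WORK_NOTIFY_RESUME | WORK_SIGPENDING | WORK_RSEQ)
  /* The dedicated RSEQ bit is kept out of the generic work loop */
  #define WORK_LOOP             (WORK_ALL & ~WORK_RSEQ)

  /* Placeholder: process one pending work bit and return the remaining work */
  static unsigned long handle_work_model(unsigned long work)
  {
          return work & (work - 1);       /* clears the lowest pending bit */
  }

  /* Placeholder: RSEQ fast path; true means user space state is consistent */
  static bool rseq_fastpath_model(void)
  {
          return true;
  }

  static unsigned long exit_to_user_model(unsigned long work)
  {
          for (;;) {
                  /* Generic TIF work first, without the RSEQ bit */
                  while (work & WORK_LOOP)
                          work = handle_work_model(work);

                  /* The RSEQ fast path runs once per exit attempt */
                  if (!(work & WORK_RSEQ) || rseq_fastpath_model())
                          return work & ~WORK_RSEQ;

                  /* Fast path failed: let the slow path run via NOTIFY_RESUME */
                  work |= WORK_NOTIFY_RESUME;
          }
  }

Because a hypervisor which clears WORK_NOTIFY_RESUME before entering guest
mode never touches WORK_RSEQ in this model, the deferred RSEQ update simply
happens on the next real exit to user space, which is the property the change
below relies on.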
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.903622031@linutronix.de --- include/asm-generic/thread_info_tif.h | 3 +++ include/linux/irq-entry-common.h | 2 +- include/linux/rseq.h | 22 ++++++++++++------ include/linux/rseq_entry.h | 32 ++++++++++++++++++++++++--- include/linux/thread_info.h | 5 +++++ kernel/entry/common.c | 10 +++++++-- 6 files changed, 61 insertions(+), 13 deletions(-) diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h index ee3793e9b1a4..da1610a78f92 100644 --- a/include/asm-generic/thread_info_tif.h +++ b/include/asm-generic/thread_info_tif.h @@ -45,4 +45,7 @@ # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #endif +#define TIF_RSEQ 11 // Run RSEQ fast path +#define _TIF_RSEQ BIT(TIF_RSEQ) + #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index bc5d178e0b91..72e3f7a59469 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -30,7 +30,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h index ded4baa34586..b5e4803c4ebe 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -42,7 +42,7 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg static inline void rseq_raise_notify_resume(struct task_struct *t) { - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + set_tsk_thread_flag(t, TIF_RSEQ); } /* Invoked from context switch to force evaluation on exit to user */ @@ -114,17 +114,25 @@ static inline void rseq_force_update(void) /* * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, - * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in - * that case just to do it eventually again before returning to user space, - * the entry resume_user_mode_work() invocation is ignored as the register - * argument is NULL. + * which clears TIF_NOTIFY_RESUME on architectures that don't use the + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. * - * After returning from guest mode, they have to invoke this function to - * re-raise TIF_NOTIFY_RESUME if necessary. + * To avoid updating user space RSEQ in that case just to do it eventually + * again before returning to user space, because __rseq_handle_slowpath() + * does nothing when invoked with NULL register state. + * + * After returning from guest mode, before exiting to userspace, hypervisors + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. */ static inline void rseq_virt_userspace_exit(void) { if (current->rseq.event.sched_switch) + /* + * The generic optimization for deferring RSEQ updates until the next + * exit relies on having a dedicated TIF_RSEQ. 
+ */ + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && + current->rseq.event.sched_switch) rseq_raise_notify_resume(current); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 958a63eeb2d3..c92167ff8a7f 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -507,18 +507,44 @@ static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *reg return false; } -static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +static __always_inline bool test_tif_rseq(unsigned long ti_work) { + return ti_work & _TIF_RSEQ; +} + +static __always_inline void clear_tif_rseq(void) +{ + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); + clear_thread_flag(TIF_RSEQ); +} +#else +static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } +static __always_inline void clear_tif_rseq(void) { } +#endif + +static __always_inline bool +rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + if (likely(!test_tif_rseq(ti_work))) + return false; + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { current->rseq.event.slowpath = true; set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); return true; } + + clear_tif_rseq(); return false; } #else /* CONFIG_GENERIC_ENTRY */ -static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + return false; +} #endif /* !CONFIG_GENERIC_ENTRY */ static __always_inline void rseq_syscall_exit_to_user_mode(void) @@ -577,7 +603,7 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs) } #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } -static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) { return false; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index dd925d84fa46..b40de9bab4b7 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -67,6 +67,11 @@ enum syscall_work_bit { #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif +#ifndef TIF_RSEQ +# define TIF_RSEQ TIF_NOTIFY_RESUME +# define _TIF_RSEQ _TIF_NOTIFY_RESUME +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 523a3e758af4..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,6 +11,12 @@ /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) { @@ -18,7 +24,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re * Before returning to user space ensure that all pending work * items have been completed. 
*/ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); @@ -68,7 +74,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, for (;;) { ti_work = __exit_to_user_mode_loop(regs, ti_work); - if (likely(!rseq_exit_to_user_mode_restart(regs))) + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) return ti_work; ti_work = read_thread_flags(); } From 323d93f0432edb5415c79bd35e15e5754a76e486 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Oct 2025 12:02:12 +0100 Subject: [PATCH 32/54] cleanup: Always inline everything KASAN bloat caused cleanup helper functions to not get inlined: vmlinux.o: error: objtool: irqentry_exit+0x323: call to class_user_rw_access_destructor() with UACCESS enabled Force inline all the cleanup helpers like they already are on normal builds. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251031105435.GU4068168@noisy.programming.kicks-ass.net --- include/linux/cleanup.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..d1806ac5342c 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -208,7 +208,7 @@ */ #define DEFINE_FREE(_name, _type, _free) \ - static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } + static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } #define __free(_name) __cleanup(__free_##_name) @@ -220,7 +220,7 @@ __val; \ }) -static inline __must_check +static __always_inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } @@ -274,16 +274,16 @@ const volatile void * __must_check_fn(const volatile void *val) #define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \ typedef _type class_##_name##_t; \ -static inline void class_##_name##_destructor(_type *p) \ +static __always_inline void class_##_name##_destructor(_type *p) \ { _type _T = *p; _exit; } \ -static inline _type class_##_name##_constructor(_init_args) \ +static __always_inline _type class_##_name##_constructor(_init_args) \ { _type t = _init; return t; } #define EXTEND_CLASS(_name, ext, _init, _init_args...) 
\ typedef class_##_name##_t class_##_name##ext##_t; \ -static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ { class_##_name##_destructor(p); } \ -static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ +static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ { class_##_name##_t t = _init; return t; } #define CLASS(_name, var) \ @@ -347,7 +347,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond }) #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ { \ void *_ptr = (void *)(__force unsigned long)*(_exp); \ if (IS_ERR(_ptr)) { \ @@ -355,7 +355,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond } \ return _ptr; \ } \ - static inline int class_##_name##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \ { \ long _rc = (__force unsigned long)*(_exp); \ if (!_rc) { \ @@ -384,9 +384,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond EXTEND_CLASS(_name, _ext, \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } /* @@ -466,7 +466,7 @@ typedef struct { \ __VA_ARGS__; \ } class_##_name##_t; \ \ -static inline void class_##_name##_destructor(class_##_name##_t *_T) \ +static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ } \ @@ -474,7 +474,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(_type *l) \ +static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ { \ class_##_name##_t _t = { .lock = l }, *_T = &_t; \ _lock; \ @@ -482,7 +482,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ } #define __DEFINE_LOCK_GUARD_0(_name, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(void) \ +static __always_inline class_##_name##_t class_##_name##_constructor(void) \ { \ class_##_name##_t _t = { .lock = (void*)1 }, \ *_T __maybe_unused = &_t; \ @@ -508,9 +508,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ _t; }), \ typeof_member(class_##_name##_t, lock) l) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \ From 
1fe4002cf7f23d70c79bda429ca2a9423ebcfdfa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Oct 2025 12:04:24 +0100 Subject: [PATCH 33/54] x86/ptrace: Always inline trivial accessors A KASAN build bloats these single load/store helpers such that it fails to inline them: vmlinux.o: error: objtool: irqentry_exit+0x5e8: call to instruction_pointer_set() with UACCESS enabled Make sure the compiler isn't allowed to do stupid. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251031105435.GU4068168@noisy.programming.kicks-ass.net --- arch/x86/include/asm/ptrace.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 50f75467f73d..b5dec859bc75 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -187,12 +187,12 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); -static inline unsigned long regs_return_value(struct pt_regs *regs) +static __always_inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; } -static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->ax = rc; } @@ -277,34 +277,34 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) } #endif -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } -static inline unsigned long instruction_pointer(struct pt_regs *regs) +static __always_inline unsigned long instruction_pointer(struct pt_regs *regs) { return regs->ip; } -static inline void instruction_pointer_set(struct pt_regs *regs, - unsigned long val) +static __always_inline +void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { regs->ip = val; } -static inline unsigned long frame_pointer(struct pt_regs *regs) +static __always_inline unsigned long frame_pointer(struct pt_regs *regs) { return regs->bp; } -static inline unsigned long user_stack_pointer(struct pt_regs *regs) +static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs) { return regs->sp; } -static inline void user_stack_pointer_set(struct pt_regs *regs, - unsigned long val) +static __always_inline +void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) { regs->sp = val; } From 80adaccf0e1c8c8fff44be2d959f6dba80af0491 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Nov 2025 13:52:13 +0300 Subject: [PATCH 34/54] rseq: Delete duplicate if statement in rseq_virt_userspace_exit() This if statement is indented weirdly. It's a duplicate and doesn't affect runtime (beyond wasting a little time). Delete it. Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/aRxP3YcwscrP1BU_@stanley.mountain --- include/linux/rseq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b5e4803c4ebe..bf8a6bf315f3 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -126,7 +126,6 @@ static inline void rseq_force_update(void) */ static inline void rseq_virt_userspace_exit(void) { - if (current->rseq.event.sched_switch) /* * The generic optimization for deferring RSEQ updates until the next * exit relies on having a dedicated TIF_RSEQ. 
From 77d7dc8bef482e987036bc204136bbda552d95cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:45 +0100 Subject: [PATCH 35/54] sched/mmcid: Revert the complex CID management The CID management is a complex beast, which affects both scheduling and task migration. The compaction mechanism forces random tasks of a process into task work on exit to user space causing latency spikes. Revert back to the initial simple bitmap allocating mechanics, which are known to have scalability issues as that allows to gradually build up a replacement functionality in a reviewable way. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.068197830@linutronix.de --- include/linux/mm_types.h | 53 +--- kernel/fork.c | 5 +- kernel/sched/core.c | 517 ++------------------------------------- kernel/sched/sched.h | 293 ++++------------------ 4 files changed, 66 insertions(+), 802 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..63b8c1209e7b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -922,13 +922,9 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif -#ifdef CONFIG_SCHED_MM_CID struct mm_cid { - u64 time; - int cid; - int recent_cid; + unsigned int cid; }; -#endif /* * Opaque type representing current mm_struct flag state. Must be accessed via @@ -1000,12 +996,6 @@ struct mm_struct { * runqueue locks. */ struct mm_cid __percpu *pcpu_cid; - /* - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). - * - * When the next mm_cid scan is due (in jiffies). - */ - unsigned long mm_cid_next_scan; /** * @nr_cpus_allowed: Number of CPUs allowed for mm. * @@ -1013,14 +1003,6 @@ struct mm_struct { * threads allowed CPUs. */ unsigned int nr_cpus_allowed; - /** - * @max_nr_cid: Maximum number of allowed concurrency - * IDs allocated. - * - * Track the highest number of allowed concurrency IDs - * allocated for the mm. - */ - atomic_t max_nr_cid; /** * @cpus_allowed_lock: Lock protecting mm cpus_allowed. * @@ -1371,35 +1353,7 @@ static inline void vma_iter_init(struct vma_iterator *vmi, #ifdef CONFIG_SCHED_MM_CID -enum mm_cid_state { - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ - MM_CID_LAZY_PUT = (1U << 31), -}; - -static inline bool mm_cid_is_unset(int cid) -{ - return cid == MM_CID_UNSET; -} - -static inline bool mm_cid_is_lazy_put(int cid) -{ - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); -} - -static inline bool mm_cid_is_valid(int cid) -{ - return !(cid & MM_CID_LAZY_PUT); -} - -static inline int mm_cid_set_lazy_put(int cid) -{ - return cid | MM_CID_LAZY_PUT; -} - -static inline int mm_cid_clear_lazy_put(int cid) -{ - return cid & ~MM_CID_LAZY_PUT; -} +#define MM_CID_UNSET (~0U) /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. 
@@ -1432,11 +1386,8 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); pcpu_cid->cid = MM_CID_UNSET; - pcpu_cid->recent_cid = MM_CID_UNSET; - pcpu_cid->time = 0; } mm->nr_cpus_allowed = p->nr_cpus_allowed; - atomic_set(&mm->max_nr_cid, 0); raw_spin_lock_init(&mm->cpus_allowed_lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); cpumask_clear(mm_cidmask(mm)); diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..9d9afe453ef1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,10 +955,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid = -1; - tsk->last_mm_cid = -1; + tsk->mm_cid = MM_CID_UNSET; + tsk->last_mm_cid = MM_CID_UNSET; tsk->mm_cid_active = 0; - tsk->migrate_from_cpu = -1; #endif return tsk; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 579a8e93578f..11a173596e0d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2128,8 +2128,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATED; - if (flags & ENQUEUE_MIGRATED) - sched_mm_cid_migrate_to(rq, p); enqueue_task(rq, p, flags); @@ -3329,7 +3327,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -5280,9 +5277,6 @@ context_switch(struct rq *rq, struct task_struct *prev, * * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch - * - * switch_mm_cid() needs to be updated if the barriers provided - * by context_switch() are modified. */ if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); @@ -5312,8 +5306,7 @@ context_switch(struct rq *rq, struct task_struct *prev, } } - /* switch_mm_cid() requires the memory barriers above. */ - switch_mm_cid(rq, prev, next); + switch_mm_cid(prev, next); /* * Tell rseq that the task was scheduled in. Must be after @@ -5604,7 +5597,6 @@ void sched_tick(void) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -10376,522 +10368,47 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID - /* - * @cid_lock: Guarantee forward-progress of cid allocation. - * - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock - * is only used when contention is detected by the lock-free allocation so - * forward progress can be guaranteed. + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. */ -DEFINE_RAW_SPINLOCK(cid_lock); - -/* - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. - * - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is - * detected, it is set to 1 to ensure that all newly coming allocations are - * serialized by @cid_lock until the allocation which detected contention - * completes and sets @use_cid_lock back to 0. This guarantees forward progress - * of a cid allocation. - */ -int use_cid_lock; - -/* - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid - * concurrently with respect to the execution of the source runqueue context - * switch. 
- * - * There is one basic properties we want to guarantee here: - * - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively - * used by a task. That would lead to concurrent allocation of the cid and - * userspace corruption. - * - * Provide this guarantee by introducing a Dekker memory ordering to guarantee - * that a pair of loads observe at least one of a pair of stores, which can be - * shown as: - * - * X = Y = 0 - * - * w[X]=1 w[Y]=1 - * MB MB - * r[Y]=y r[X]=x - * - * Which guarantees that x==0 && y==0 is impossible. But rather than using - * values 0 and 1, this algorithm cares about specific state transitions of the - * runqueue current task (as updated by the scheduler context switch), and the - * per-mm/cpu cid value. - * - * Let's introduce task (Y) which has task->mm == mm and task (N) which has - * task->mm != mm for the rest of the discussion. There are two scheduler state - * transitions on context switch we care about: - * - * (TSA) Store to rq->curr with transition from (N) to (Y) - * - * (TSB) Store to rq->curr with transition from (Y) to (N) - * - * On the remote-clear side, there is one transition we care about: - * - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag - * - * There is also a transition to UNSET state which can be performed from all - * sides (scheduler, remote-clear). It is always performed with a cmpxchg which - * guarantees that only a single thread will succeed: - * - * (TMB) cmpxchg to *pcpu_cid to mark UNSET - * - * Just to be clear, what we do _not_ want to happen is a transition to UNSET - * when a thread is actively using the cid (property (1)). - * - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. - * - * Scenario A) (TSA)+(TMA) (from next task perspective) - * - * CPU0 CPU1 - * - * Context switch CS-1 Remote-clear - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) - * (implied barrier after cmpxchg) - * - switch_mm_cid() - * - memory barrier (see switch_mm_cid() - * comment explaining how this barrier - * is combined with other scheduler - * barriers) - * - mm_cid_get (next) - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) - * - * This Dekker ensures that either task (Y) is observed by the - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are - * observed. - * - * If task (Y) store is observed by rcu_dereference(), it means that there is - * still an active task on the cpu. Remote-clear will therefore not transition - * to UNSET, which fulfills property (1). - * - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), - * it will move its state to UNSET, which clears the percpu cid perhaps - * uselessly (which is not an issue for correctness). Because task (Y) is not - * observed, CPU1 can move ahead to set the state to UNSET. Because moving - * state to UNSET is done with a cmpxchg expecting that the old state has the - * LAZY flag set, only one thread will successfully UNSET. - * - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and - * CPU1 will observe task (Y) and do nothing more, which is fine. - * - * What we are effectively preventing with this Dekker is a scenario where - * neither LAZY flag nor store (Y) are observed, which would fail property (1) - * because this would UNSET a cid which is actively used. 
- */ - -void sched_mm_cid_migrate_from(struct task_struct *t) -{ - t->migrate_from_cpu = task_cpu(t); -} - -static -int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid) -{ - struct mm_struct *mm = t->mm; - struct task_struct *src_task; - int src_cid, last_mm_cid; - - if (!mm) - return -1; - - last_mm_cid = t->last_mm_cid; - /* - * If the migrated task has no last cid, or if the current - * task on src rq uses the cid, it means the source cid does not need - * to be moved to the destination cpu. - */ - if (last_mm_cid == -1) - return -1; - src_cid = READ_ONCE(src_pcpu_cid->cid); - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) - return -1; - - /* - * If we observe an active task using the mm on this rq, it means we - * are not the last task to be migrated from this cpu for this mm, so - * there is no need to move src_cid to the destination cpu. - */ - guard(rcu)(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - t->last_mm_cid = -1; - return -1; - } - - return src_cid; -} - -static -int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid, - int src_cid) -{ - struct task_struct *src_task; - struct mm_struct *mm = t->mm; - int lazy_cid; - - if (src_cid == -1) - return -1; - - /* - * Attempt to clear the source cpu cid to move it to the destination - * cpu. - */ - lazy_cid = mm_cid_set_lazy_put(src_cid); - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) - return -1; - - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ - - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, this task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; - } - } - - /* - * The src_cid is unused, so it can be unset. - */ - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - return -1; - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); - return src_cid; -} - -/* - * Migration to dst cpu. Called with dst_rq lock held. - * Interrupts are disabled, which keeps the window of cid ownership without the - * source rq lock held small. - */ -void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) -{ - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; - struct mm_struct *mm = t->mm; - int src_cid, src_cpu; - bool dst_cid_is_set; - struct rq *src_rq; - - lockdep_assert_rq_held(dst_rq); - - if (!mm) - return; - src_cpu = t->migrate_from_cpu; - if (src_cpu == -1) { - t->last_mm_cid = -1; - return; - } - /* - * Move the src cid if the dst cid is unset. This keeps id - * allocation closest to 0 in cases where few threads migrate around - * many CPUs. 
- * - * If destination cid or recent cid is already set, we may have - * to just clear the src cid to ensure compactness in frequent - * migrations scenarios. - * - * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed CPUs, because user-space - * can expect that the number of allowed cids can reach the number of - * allowed CPUs. - */ - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) - return; - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); - src_rq = cpu_rq(src_cpu); - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); - if (src_cid == -1) - return; - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, - src_cid); - if (src_cid == -1) - return; - if (dst_cid_is_set) { - __mm_cid_put(mm, src_cid); - return; - } - /* Move src_cid to dst cpu. */ - mm_cid_snapshot_time(dst_rq, mm); - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); -} - -static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, - int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct task_struct *t; - int cid, lazy_cid; - - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid)) - return; - - /* - * Clear the cpu cid if it is set to keep cid allocation compact. If - * there happens to be other tasks left on the source cpu using this - * mm, the next task using this mm will reallocate its cid on context - * switch. - */ - lazy_cid = mm_cid_set_lazy_put(cid); - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) - return; - - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ - - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, that task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) - return; - } - - /* - * The cid is unused, so it can be unset. - * Disable interrupts to keep the window of cid ownership without rq - * lock small. - */ - scoped_guard (irqsave) { - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); - } -} - -static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct mm_cid *pcpu_cid; - struct task_struct *curr; - u64 rq_clock; - - /* - * rq->clock load is racy on 32-bit but one spurious clear once in a - * while is irrelevant. - */ - rq_clock = READ_ONCE(rq->clock); - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); - - /* - * In order to take care of infrequently scheduled tasks, bump the time - * snapshot associated with this cid if an active task using the mm is - * observed on this rq. 
- */ - scoped_guard (rcu) { - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - return; - } - } - - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); -} - -static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, - int weight) -{ - struct mm_cid *pcpu_cid; - int cid; - - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid) || cid < weight) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); -} - -static void task_mm_cid_work(struct callback_head *work) -{ - unsigned long now = jiffies, old_scan, next_scan; - struct task_struct *t = current; - struct cpumask *cidmask; - struct mm_struct *mm; - int weight, cpu; - - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); - - work->next = work; /* Prevent double-add */ - if (t->flags & PF_EXITING) - return; - mm = t->mm; - if (!mm) - return; - old_scan = READ_ONCE(mm->mm_cid_next_scan); - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); - if (!old_scan) { - unsigned long res; - - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); - if (res != old_scan) - old_scan = res; - else - old_scan = next_scan; - } - if (time_before(now, old_scan)) - return; - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) - return; - cidmask = mm_cidmask(mm); - /* Clear cids that were not recently used. */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_old(mm, cpu); - weight = cpumask_weight(cidmask); - /* - * Clear cids that are greater or equal to the cidmask weight to - * recompact it. - */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_weight(mm, cpu, weight); -} - -void init_sched_mm_cid(struct task_struct *t) -{ - struct mm_struct *mm = t->mm; - int mm_users = 0; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); - } - t->cid_work.next = &t->cid_work; /* Protect against double add */ - init_task_work(&t->cid_work, task_mm_cid_work); -} - -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) -{ - struct callback_head *work = &curr->cid_work; - unsigned long now = jiffies; - - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || - work->next != work) - return; - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) - return; - - /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME); -} - void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) + if (!mm || !t->mm_cid_active) return; - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). 
- */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + guard(preempt)(); + t->mm_cid_active = 0; + if (t->mm_cid != MM_CID_UNSET) { + cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm)); + t->mm_cid = MM_CID_UNSET; + } } +/* Deactivate MM CID allocation across execve() */ void sched_mm_cid_before_execve(struct task_struct *t) { - struct mm_struct *mm = t->mm; - struct rq *rq; - - if (!mm) - return; - - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + sched_mm_cid_exit_signals(t); } +/* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; if (!mm) return; - preempt_disable(); - rq = this_rq(); - scoped_guard (rq_lock_irqsave, rq) { - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); - } + guard(preempt)(); + t->mm_cid_active = 1; + mm_cid_select(t); } void sched_mm_cid_fork(struct task_struct *t) { - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); + WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET); t->mm_cid_active = 1; } #endif /* CONFIG_SCHED_MM_CID */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4838dda75b10..bf227c27b889 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3540,286 +3540,83 @@ extern void sched_dynamic_update(int mode); extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID - -#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ -#define MM_CID_SCAN_DELAY 100 /* 100ms */ - -extern raw_spinlock_t cid_lock; -extern int use_cid_lock; - -extern void sched_mm_cid_migrate_from(struct task_struct *t); -extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); -extern void init_sched_mm_cid(struct task_struct *t); - -static inline void __mm_cid_put(struct mm_struct *mm, int cid) -{ - if (cid < 0) - return; - cpumask_clear_cpu(cid, mm_cidmask(mm)); -} - -/* - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to - * be held to transition to other states. - * - * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. 
- */ -static inline void mm_cid_put_lazy(struct task_struct *t) +static inline void init_sched_mm_cid(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; + unsigned int max_cid; - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - if (!mm_cid_is_lazy_put(cid) || - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + if (!mm) return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + + /* Preset last_mm_cid */ + max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); + t->last_mm_cid = max_cid - 1; } -static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, res; + struct mm_struct *mm = t->mm; - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - for (;;) { - if (mm_cid_is_unset(cid)) - return MM_CID_UNSET; - /* - * Attempt transition from valid or lazy-put to unset. - */ - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); - if (res == cid) - break; - cid = res; - } - return cid; + if (cid >= max_cids) + return false; + if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) + return false; + t->mm_cid = t->last_mm_cid = cid; + __this_cpu_write(mm->pcpu_cid->cid, cid); + return true; } -static inline void mm_cid_put(struct mm_struct *mm) +static inline bool mm_cid_get(struct task_struct *t) { - int cid; + struct mm_struct *mm = t->mm; + unsigned int max_cids; - lockdep_assert_irqs_disabled(); - cid = mm_cid_pcpu_unset(mm); - if (cid == MM_CID_UNSET) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); + + /* Try to reuse the last CID of this task */ + if (__mm_cid_get(t, t->last_mm_cid, max_cids)) + return true; + + /* Try to reuse the last CID of this mm on this CPU */ + if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids)) + return true; + + /* Try the first zero bit in the cidmask. */ + return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids); } -static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) +static inline void mm_cid_select(struct task_struct *t) { - struct cpumask *cidmask = mm_cidmask(mm); - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, max_nr_cid, allowed_max_nr_cid; - /* - * After shrinking the number of threads or reducing the number - * of allowed cpus, reduce the value of max_nr_cid so expansion - * of cid allocation will preserve cache locality if the number - * of threads or allowed cpus increase again. - */ - max_nr_cid = atomic_read(&mm->max_nr_cid); - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), - atomic_read(&mm->mm_users))), - max_nr_cid > allowed_max_nr_cid) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { - max_nr_cid = allowed_max_nr_cid; - break; - } - } - /* Try to re-use recent cid. This improves cache locality. */ - cid = __this_cpu_read(pcpu_cid->recent_cid); - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && - !cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - /* - * Expand cid allocation if the maximum number of concurrency - * IDs allocated (max_nr_cid) is below the number cpus allowed - * and number of threads. 
Expanding cid allocation as much as - * possible improves cache locality. - */ - cid = max_nr_cid; - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) - continue; - if (!cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - } - /* - * Find the first available concurrency id. - * Retry finding first zero bit if the mask is temporarily - * filled. This only happens during concurrent remote-clear - * which owns a cid without holding a rq lock. + * mm_cid_get() can fail when the maximum CID, which is determined + * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently. + * That's a transient failure as there cannot be more tasks + * concurrently on a CPU (or about to be scheduled in) than that. */ for (;;) { - cid = cpumask_first_zero(cidmask); - if (cid < READ_ONCE(mm->nr_cpus_allowed)) + if (mm_cid_get(t)) break; - cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cidmask)) - return -1; - - return cid; } -/* - * Save a snapshot of the current runqueue time of this cpu - * with the per-cpu cid value, allowing to estimate how recently it was used. - */ -static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); - - lockdep_assert_rq_held(rq); - WRITE_ONCE(pcpu_cid->time, rq->clock); -} - -static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) -{ - int cid; - - /* - * All allocations (even those using the cid_lock) are lock-free. If - * use_cid_lock is set, hold the cid_lock to perform cid allocation to - * guarantee forward progress. - */ - if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto end; - raw_spin_lock(&cid_lock); - } else { - raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto unlock; - } - - /* - * cid concurrently allocated. Retry while forcing following - * allocations to use the cid_lock to ensure forward progress. - */ - WRITE_ONCE(use_cid_lock, 1); - /* - * Set use_cid_lock before allocation. Only care about program order - * because this is only required for forward progress. - */ - barrier(); - /* - * Retry until it succeeds. It is guaranteed to eventually succeed once - * all newcoming allocations observe the use_cid_lock flag set. - */ - do { - cid = __mm_cid_try_get(t, mm); - cpu_relax(); - } while (cid < 0); - /* - * Allocate before clearing use_cid_lock. Only care about - * program order because this is for forward progress. 
- */ - barrier(); - WRITE_ONCE(use_cid_lock, 0); -unlock: - raw_spin_unlock(&cid_lock); -end: - mm_cid_snapshot_time(rq, mm); - - return cid; -} - -static inline int mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) -{ - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; - - lockdep_assert_rq_held(rq); - cid = __this_cpu_read(pcpu_cid->cid); - if (mm_cid_is_valid(cid)) { - mm_cid_snapshot_time(rq, mm); - return cid; - } - if (mm_cid_is_lazy_put(cid)) { - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); - } - cid = __mm_cid_get(rq, t, mm); - __this_cpu_write(pcpu_cid->cid, cid); - __this_cpu_write(pcpu_cid->recent_cid, cid); - - return cid; -} - -static inline void switch_mm_cid(struct rq *rq, - struct task_struct *prev, - struct task_struct *next) -{ - /* - * Provide a memory barrier between rq->curr store and load of - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. - * - * Should be adapted if context_switch() is modified. - */ - if (!next->mm) { // to kernel - /* - * user -> kernel transition does not guarantee a barrier, but - * we can use the fact that it performs an atomic operation in - * mmgrab(). - */ - if (prev->mm) // from user - smp_mb__after_mmgrab(); - /* - * kernel -> kernel transition does not change rq->curr->mm - * state. It stays NULL. - */ - } else { // to user - /* - * kernel -> user transition does not provide a barrier - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. - * Provide it here. - */ - if (!prev->mm) { // from kernel - smp_mb(); - } else { // from user - /* - * user->user transition relies on an implicit - * memory barrier in switch_mm() when - * current->mm changes. If the architecture - * switch_mm() does not have an implicit memory - * barrier, it is emitted here. If current->mm - * is unchanged, no barrier is needed. - */ - smp_mb__after_switch_mm(); - } - } if (prev->mm_cid_active) { - mm_cid_snapshot_time(rq, prev->mm); - mm_cid_put_lazy(prev); - prev->mm_cid = -1; + if (prev->mm_cid != MM_CID_UNSET) + cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm)); + prev->mm_cid = MM_CID_UNSET; } + if (next->mm_cid_active) { - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); + mm_cid_select(next); rseq_sched_set_task_mm_cid(next, next->mm_cid); } } #else /* !CONFIG_SCHED_MM_CID: */ -static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } -static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } -static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } +static inline void mm_cid_select(struct task_struct *t) { } +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); From 8cea569ca785060b8c5cc7800713ddc3b1548a94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:47 +0100 Subject: [PATCH 36/54] sched/mmcid: Use proper data structures Having a lot of CID functionality specific members in struct task_struct and struct mm_struct is not really making the code easier to read. Encapsulate the CID specific parts in data structures and keep them separate from the stuff they are embedded in. No functional change. 
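
For orientation, the reuse order of the simplified allocator which these
structures now carry (see the mm_cid_get()/mm_cid_select() hunks in the
revert above) can be modelled roughly as follows. This is a single-threaded,
stand-alone sketch with invented names; it ignores the per CPU storage
details, the cpumask helpers and all concurrency:

  #include <stdbool.h>
  #include <limits.h>

  #define CID_UNSET     UINT_MAX
  #define CID_MAX       64              /* arbitrary bound for this sketch */

  struct mm_model {
          unsigned long cid_bitmap;     /* one bit per CID in use */
          unsigned int  pcpu_last_cid;  /* last CID used on "this" CPU */
          unsigned int  nr_cpus_allowed;
          unsigned int  mm_users;
  };

  struct task_model {
          unsigned int cid;
          unsigned int last_cid;        /* may start out as CID_UNSET */
  };

  static bool cid_try_get(struct mm_model *mm, struct task_model *t,
                          unsigned int cid, unsigned int max_cids)
  {
          /* CID_UNSET fails the range check and falls through to the next try */
          if (cid >= max_cids || cid >= CID_MAX)
                  return false;
          if (mm->cid_bitmap & (1UL << cid))
                  return false;
          mm->cid_bitmap |= 1UL << cid;
          t->cid = t->last_cid = cid;
          mm->pcpu_last_cid = cid;
          return true;
  }

  /*
   * Reuse order: the task's last CID, then the last CID used on this CPU,
   * then the first free bit, all bounded by min(nr_cpus_allowed, mm_users).
   */
  static bool cid_get(struct mm_model *mm, struct task_model *t)
  {
          unsigned int max_cids = mm->nr_cpus_allowed < mm->mm_users ?
                                  mm->nr_cpus_allowed : mm->mm_users;
          unsigned int cid;

          if (cid_try_get(mm, t, t->last_cid, max_cids))
                  return true;
          if (cid_try_get(mm, t, mm->pcpu_last_cid, max_cids))
                  return true;
          for (cid = 0; cid < max_cids && cid < CID_MAX; cid++) {
                  if (cid_try_get(mm, t, cid, max_cids))
                          return true;
          }
          return false;
  }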
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.131573768@linutronix.de --- include/linux/mm_types.h | 56 ++++++++++---------------------------- include/linux/rseq_types.h | 42 ++++++++++++++++++++++++++++ include/linux/sched.h | 11 ++------ init/init_task.c | 3 ++ kernel/fork.c | 6 ++-- kernel/sched/core.c | 16 +++++------ kernel/sched/sched.h | 26 +++++++++--------- 7 files changed, 85 insertions(+), 75 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 63b8c1209e7b..e4818e932a1d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -922,10 +923,6 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif -struct mm_cid { - unsigned int cid; -}; - /* * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. @@ -987,30 +984,9 @@ struct mm_struct { */ atomic_t mm_users; -#ifdef CONFIG_SCHED_MM_CID - /** - * @pcpu_cid: Per-cpu current cid. - * - * Keep track of the currently allocated mm_cid for each cpu. - * The per-cpu mm_cid values are serialized by their respective - * runqueue locks. - */ - struct mm_cid __percpu *pcpu_cid; - /** - * @nr_cpus_allowed: Number of CPUs allowed for mm. - * - * Number of CPUs allowed in the union of all mm's - * threads allowed CPUs. - */ - unsigned int nr_cpus_allowed; - /** - * @cpus_allowed_lock: Lock protecting mm cpus_allowed. - * - * Provide mutual exclusion for mm cpus_allowed and - * mm nr_cpus_allowed updates. - */ - raw_spinlock_t cpus_allowed_lock; -#endif + /* MM CID related storage */ + struct mm_mm_cid mm_cid; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1352,9 +1328,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi, } #ifdef CONFIG_SCHED_MM_CID - -#define MM_CID_UNSET (~0U) - /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. */ @@ -1383,20 +1356,20 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) int i; for_each_possible_cpu(i) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); + struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i); - pcpu_cid->cid = MM_CID_UNSET; + pcpu->cid = MM_CID_UNSET; } - mm->nr_cpus_allowed = p->nr_cpus_allowed; - raw_spin_lock_init(&mm->cpus_allowed_lock); + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); cpumask_clear(mm_cidmask(mm)); } static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { - mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); - if (!mm->pcpu_cid) + mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu); + if (!mm->mm_cid.pcpu) return -ENOMEM; mm_init_cid(mm, p); return 0; @@ -1405,8 +1378,8 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct * static inline void mm_destroy_cid(struct mm_struct *mm) { - free_percpu(mm->pcpu_cid); - mm->pcpu_cid = NULL; + free_percpu(mm->mm_cid.pcpu); + mm->mm_cid.pcpu = NULL; } static inline unsigned int mm_cid_size(void) @@ -1421,10 +1394,9 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas if (!mm) return; /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. 
*/ - raw_spin_lock(&mm->cpus_allowed_lock); + guard(raw_spinlock)(&mm->mm_cid.lock); cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); - raw_spin_unlock(&mm->cpus_allowed_lock); + WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 9c7a34154de8..e444dd267c7a 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -90,4 +90,46 @@ struct rseq_data { struct rseq_data { }; #endif /* !CONFIG_RSEQ */ +#ifdef CONFIG_SCHED_MM_CID + +#define MM_CID_UNSET (~0U) + +/** + * struct sched_mm_cid - Storage for per task MM CID data + * @active: MM CID is active for the task + * @cid: The CID associated to the task + * @last_cid: The last CID associated to the task + */ +struct sched_mm_cid { + unsigned int active; + unsigned int cid; + unsigned int last_cid; +}; + +/** + * struct mm_cid_pcpu - Storage for per CPU MM_CID data + * @cid: The CID associated to the CPU + */ +struct mm_cid_pcpu { + unsigned int cid; +}; + +/** + * struct mm_mm_cid - Storage for per MM CID data + * @pcpu: Per CPU storage for CIDs associated to a CPU + * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map + * is growth only. + * @lock: Spinlock to protect all fields except @pcpu. It also protects + * the MM cid cpumask and the MM cidmask bitmap. + */ +struct mm_mm_cid { + struct mm_cid_pcpu __percpu *pcpu; + unsigned int nr_cpus_allowed; + raw_spinlock_t lock; +}; +#else /* CONFIG_SCHED_MM_CID */ +struct mm_mm_cid { }; +struct sched_mm_cid { }; +#endif /* !CONFIG_SCHED_MM_CID */ + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index e47abc8685d7..64f080d6ed6e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,14 +1407,7 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; - -#ifdef CONFIG_SCHED_MM_CID - int mm_cid; /* Current cid in mm */ - int last_mm_cid; /* Most recent cid in mm */ - int migrate_from_cpu; - int mm_cid_active; /* Whether cid bitmap is active */ - struct callback_head cid_work; -#endif + struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; @@ -2308,7 +2301,7 @@ void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { - return t->mm_cid; + return t->mm_cid.cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } diff --git a/init/init_task.c b/init/init_task.c index a55e2189206f..5d122699b664 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -223,6 +223,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_SCHED_MM_CID + .mm_cid = { .cid = MM_CID_UNSET, }, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/kernel/fork.c b/kernel/fork.c index 9d9afe453ef1..74bc7c9f1bb3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,9 +955,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid = MM_CID_UNSET; - tsk->last_mm_cid = MM_CID_UNSET; - tsk->mm_cid_active = 0; + tsk->mm_cid.cid = MM_CID_UNSET; + tsk->mm_cid.last_cid = MM_CID_UNSET; + tsk->mm_cid.active = 0; #endif return tsk; diff --git a/kernel/sched/core.c 
b/kernel/sched/core.c index 11a173596e0d..b1aa7d1055ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10376,14 +10376,14 @@ void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; - if (!mm || !t->mm_cid_active) + if (!mm || !t->mm_cid.active) return; guard(preempt)(); - t->mm_cid_active = 0; - if (t->mm_cid != MM_CID_UNSET) { - cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm)); - t->mm_cid = MM_CID_UNSET; + t->mm_cid.active = 0; + if (t->mm_cid.cid != MM_CID_UNSET) { + cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm)); + t->mm_cid.cid = MM_CID_UNSET; } } @@ -10402,14 +10402,14 @@ void sched_mm_cid_after_execve(struct task_struct *t) return; guard(preempt)(); - t->mm_cid_active = 1; + t->mm_cid.active = 1; mm_cid_select(t); } void sched_mm_cid_fork(struct task_struct *t) { - WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET); - t->mm_cid_active = 1; + WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET); + t->mm_cid.active = 1; } #endif /* CONFIG_SCHED_MM_CID */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bf227c27b889..a17f04f075e1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3549,8 +3549,8 @@ static inline void init_sched_mm_cid(struct task_struct *t) return; /* Preset last_mm_cid */ - max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); - t->last_mm_cid = max_cid - 1; + max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users)); + t->mm_cid.last_cid = max_cid - 1; } static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids) @@ -3561,8 +3561,8 @@ static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigne return false; if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) return false; - t->mm_cid = t->last_mm_cid = cid; - __this_cpu_write(mm->pcpu_cid->cid, cid); + t->mm_cid.cid = t->mm_cid.last_cid = cid; + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); return true; } @@ -3571,14 +3571,14 @@ static inline bool mm_cid_get(struct task_struct *t) struct mm_struct *mm = t->mm; unsigned int max_cids; - max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); + max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users)); /* Try to reuse the last CID of this task */ - if (__mm_cid_get(t, t->last_mm_cid, max_cids)) + if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids)) return true; /* Try to reuse the last CID of this mm on this CPU */ - if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids)) + if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids)) return true; /* Try the first zero bit in the cidmask. 
*/ @@ -3601,15 +3601,15 @@ static inline void mm_cid_select(struct task_struct *t) static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { - if (prev->mm_cid_active) { - if (prev->mm_cid != MM_CID_UNSET) - cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm)); - prev->mm_cid = MM_CID_UNSET; + if (prev->mm_cid.active) { + if (prev->mm_cid.cid != MM_CID_UNSET) + cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm)); + prev->mm_cid.cid = MM_CID_UNSET; } - if (next->mm_cid_active) { + if (next->mm_cid.active) { mm_cid_select(next); - rseq_sched_set_task_mm_cid(next, next->mm_cid); + rseq_sched_set_task_mm_cid(next, next->mm_cid.cid); } } From be4463fa2c7185823d2989562162d578b45a89ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:49 +0100 Subject: [PATCH 37/54] sched/mmcid: Cacheline align MM CID storage Both the per CPU storage and the data in mm_struct are heavily used in context switch. As they can end up next to other frequently modified data, they are subject to false sharing. Make them cache line aligned. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.194111661@linutronix.de --- include/linux/rseq_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index e444dd267c7a..d7e8071b626a 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -112,7 +112,7 @@ struct sched_mm_cid { */ struct mm_cid_pcpu { unsigned int cid; -}; +}____cacheline_aligned_in_smp; /** * struct mm_mm_cid - Storage for per MM CID data @@ -126,7 +126,7 @@ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; unsigned int nr_cpus_allowed; raw_spinlock_t lock; -}; +}____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; struct sched_mm_cid { }; From 925b7847bb7d4eb523a7698b309e8441647796f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:51 +0100 Subject: [PATCH 38/54] sched: Fixup whitespace damage With whitespace checks enabled in the editor this makes eyes bleed. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.258651925@linutronix.de --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b1aa7d1055ac..b667171b4422 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5278,15 +5278,15 @@ context_switch(struct rq *rq, struct task_struct *prev, * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch */ - if (!next->mm) { // to kernel + if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; - if (prev->mm) // from user + if (prev->mm) // from user mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; - } else { // to user + } else { // to user membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting @@ -5299,7 +5299,7 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); lru_gen_use_mm(next->mm); - if (!prev->mm) { // from kernel + if (!prev->mm) { // from kernel /* will mmdrop_lazy_tlb() in finish_task_switch(). 
*/ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; From b08ef5fc8fa01ae5285bef5ff783bbb425d1fb08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:53 +0100 Subject: [PATCH 39/54] sched/mmcid: Move scheduler code out of global header This is only used in the scheduler core code, so there is no point to have it in a global header. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Acked-by: Yury Norov (NVIDIA) Link: https://patch.msgid.link/20251119172549.321259077@linutronix.de --- include/linux/mm_types.h | 13 ------------- kernel/sched/core.c | 20 ++++++++++++++++++-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e4818e932a1d..67a7bdf772f7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1387,27 +1387,14 @@ static inline unsigned int mm_cid_size(void) return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) -{ - struct cpumask *mm_allowed = mm_cpus_allowed(mm); - - if (!mm) - return; - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ - guard(raw_spinlock)(&mm->mm_cid.lock); - cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); -} #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } - static inline unsigned int mm_cid_size(void) { return 0; } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b667171b4422..f5e37c233b01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2669,6 +2669,8 @@ out_unlock: return 0; } +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2728,7 +2730,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) put_prev_task(rq, p); p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -10372,6 +10374,18 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) * When a task exits, the MM CID held by the task is not longer required as * the task cannot return to user space. */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) +{ + struct cpumask *mm_allowed = mm_cpus_allowed(mm); + + if (!mm) + return; + /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. 
*/ + guard(raw_spinlock)(&mm->mm_cid.lock); + cpumask_or(mm_allowed, mm_allowed, affmsk); + WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); +} + void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; @@ -10411,7 +10425,9 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET); t->mm_cid.active = 1; } -#endif /* CONFIG_SCHED_MM_CID */ +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } +#endif /* !CONFIG_SCHED_MM_CID */ #ifdef CONFIG_SCHED_CLASS_EXT void sched_deq_and_put_task(struct task_struct *p, int queue_flags, From 0d032a43ebeb9bf255cd7e3dad5f7a6371571648 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:55 +0100 Subject: [PATCH 40/54] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed() mm_update_cpus_allowed() is not required to be invoked for affinity changes due to migrate_disable() and migrate_enable(). migrate_disable() restricts the task temporarily to a CPU on which the task was already allowed to run, so nothing changes. migrate_enable() restores the actual task affinity mask. If that mask changed between migrate_disable() and migrate_enable() then that change was already accounted for. Move the invocation to the proper place to avoid that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.385208276@linutronix.de --- kernel/sched/core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5e37c233b01..2ea77e72f7c6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2684,6 +2684,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx cpumask_copy(&p->cpus_mask, ctx->new_mask); p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); /* * Swap in a new user_cpus_ptr if SCA_USER flag set @@ -2730,7 +2731,6 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) put_prev_task(rq, p); p->sched_class->set_cpus_allowed(p, ctx); - mm_update_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -10376,12 +10376,17 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) */ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { - struct cpumask *mm_allowed = mm_cpus_allowed(mm); + struct cpumask *mm_allowed; if (!mm) return; - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ + + /* + * mm::mm_cid::mm_cpus_allowed is the superset of each threads + * allowed CPUs mask which means it can only grow. + */ guard(raw_spinlock)(&mm->mm_cid.lock); + mm_allowed = mm_cpus_allowed(mm); cpumask_or(mm_allowed, mm_allowed, affmsk); WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); } From 437cb3ded25038d5280d21de489ce78c745118d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:57 +0100 Subject: [PATCH 41/54] cpumask: Introduce cpumask_weighted_or() CID management OR's two cpumasks and then calculates the weight on the result. That's inefficient as that has to walk the same stuff twice. As this is done with runqueue lock held, there is a real benefit of speeding this up. 
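A minimal before/after sketch of the pattern this change targets (illustrative only, not part of the patch; it uses the cpumask_weighted_or() helper introduced below):

    /* two passes: OR the masks, then walk the result again to count bits */
    cpumask_or(dst, src1, src2);
    weight = cpumask_weight(dst);

    /* one pass: OR word by word and accumulate the Hamming weight */
    weight = cpumask_weighted_or(dst, src1, src2);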
Depending on the system this results in 10-20% less cycles spent with runqueue lock held for a 4K cpumask. Provide cpumask_weighted_or() and the corresponding bitmap functions which return the weight of the OR result right away. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yury Norov (NVIDIA) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.448263340@linutronix.de --- include/linux/bitmap.h | 15 +++++++++++++++ include/linux/cpumask.h | 16 ++++++++++++++++ lib/bitmap.c | 6 ++++++ 3 files changed, 37 insertions(+) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 595217b7a6e7..b0395e4ccf90 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -45,6 +45,7 @@ struct device; * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 + * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, @@ -337,6 +340,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } +static __always_inline +unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) { + *dst = *src1 | *src2; + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); + } else { + return __bitmap_weighted_or(dst, src1, src2, nbits); + } +} + static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff8f41ab7ce6..feba06eb0a42 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -728,6 +728,22 @@ void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, cpumask_bits(src2p), small_cpumask_bits); } +/** + * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + * + * Return: The number of bits set in the resulting cpumask @dstp + */ +static __always_inline +unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), small_cpumask_bits); +} + /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result diff --git a/lib/bitmap.c b/lib/bitmap.c index b97692854966..9dc526507875 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -355,6 +355,12 @@ unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_weight_andnot); +unsigned int 
__bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits); +} + void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); From 79c11fb3da8581a2f222b290ce62a153ab1108fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:59 +0100 Subject: [PATCH 42/54] sched/mmcid: Use cpumask_weighted_or() Use cpumask_weighted_or() instead of cpumask_or() and cpumask_weight() on the result, which walks the same bitmap twice. Results in 10-20% less cycles, which reduces the runqueue lock hold time. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Acked-by: Yury Norov (NVIDIA) Link: https://patch.msgid.link/20251119172549.511736272@linutronix.de --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2ea77e72f7c6..f6bbfa1f5c15 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10377,6 +10377,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { struct cpumask *mm_allowed; + unsigned int weight; if (!mm) return; @@ -10387,8 +10388,8 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu */ guard(raw_spinlock)(&mm->mm_cid.lock); mm_allowed = mm_cpus_allowed(mm); - cpumask_or(mm_allowed, mm_allowed, affmsk); - WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); + weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); + WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); } void sched_mm_cid_exit_signals(struct task_struct *t) From 35a5c37cb9f1f947dff18e7cfc75a8cfcfd557ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:01 +0100 Subject: [PATCH 43/54] cpumask: Cache num_possible_cpus() Reevaluating num_possible_cpus() over and over does not make sense. That becomes a constant after init as cpu_possible_mask is marked ro_after_init. Cache the value during initialization and provide that for consumption. 
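For illustration only (assuming a caller which just needs the count), the difference in cost boils down to:

    /* before: recomputes the weight of cpu_possible_mask on every call */
    nr = cpumask_weight(cpu_possible_mask);

    /* after: a single read of __num_possible_cpus, fixed after early boot */
    nr = num_possible_cpus();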
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Yury Norov Reviewed-by: Mathieu Desnoyers Reviewed-by: Shrikanth Hegde Link: https://patch.msgid.link/20251119172549.578653738@linutronix.de --- include/linux/cpumask.h | 10 ++++++++-- kernel/cpu.c | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index feba06eb0a42..66694ee8d86e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -126,6 +126,7 @@ extern struct cpumask __cpu_dying_mask; #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; +extern unsigned int __num_possible_cpus; extern cpumask_t cpus_booted_once_mask; @@ -1152,13 +1153,13 @@ void init_cpu_possible(const struct cpumask *src); #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); +void set_cpu_possible(unsigned int cpu, bool possible); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * @@ -1211,7 +1212,12 @@ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } -#define num_possible_cpus() cpumask_weight(cpu_possible_mask) + +static __always_inline unsigned int num_possible_cpus(void) +{ + return __num_possible_cpus; +} + #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) diff --git a/kernel/cpu.c b/kernel/cpu.c index db9f6c539b28..b674fdf96208 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3085,10 +3085,13 @@ EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; +unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; #else struct cpumask __cpu_possible_mask __ro_after_init; +unsigned int __num_possible_cpus __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); +EXPORT_SYMBOL(__num_possible_cpus); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); @@ -3116,6 +3119,7 @@ void init_cpu_present(const struct cpumask *src) void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); + __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); } void set_cpu_online(unsigned int cpu, bool online) @@ -3139,6 +3143,21 @@ void set_cpu_online(unsigned int cpu, bool online) } } +/* + * This should be marked __init, but there is a boatload of call sites + * which need to be fixed up to do so. Sigh... + */ +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus++; + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus--; + } +} + /* * Activate the first processor. 
*/ From 539115f08cf850b9fdc6526b31da0839ff6c1631 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:03 +0100 Subject: [PATCH 44/54] sched/mmcid: Convert mm CID mask to a bitmap This is truly a bitmap and just conveniently uses a cpumask because the maximum size of the bitmap is nr_cpu_ids. But that prevents to do searches for a zero bit in a limited range, which is helpful to provide an efficient mechanism to consolidate the CID space when the number of users decreases. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Acked-by: Yury Norov (NVIDIA) Link: https://patch.msgid.link/20251119172549.642866767@linutronix.de --- include/linux/mm_types.h | 9 +++++---- kernel/sched/core.c | 2 +- kernel/sched/sched.h | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 67a7bdf772f7..bafb81b33922 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1342,13 +1342,13 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) } /* Accessor for struct mm_struct's cidmask. */ -static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +static inline unsigned long *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); /* Skip mm_cpus_allowed */ cid_bitmap += cpumask_size(); - return (struct cpumask *)cid_bitmap; + return (unsigned long *)cid_bitmap; } static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) @@ -1363,7 +1363,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - cpumask_clear(mm_cidmask(mm)); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); } static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) @@ -1384,7 +1384,8 @@ static inline void mm_destroy_cid(struct mm_struct *mm) static inline unsigned int mm_cid_size(void) { - return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ + /* mm_cpus_allowed(), mm_cidmask(). */ + return cpumask_size() + bitmap_size(num_possible_cpus()); } #else /* CONFIG_SCHED_MM_CID */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f6bbfa1f5c15..9a114b6f6a6f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10402,7 +10402,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t) guard(preempt)(); t->mm_cid.active = 0; if (t->mm_cid.cid != MM_CID_UNSET) { - cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm)); + clear_bit(t->mm_cid.cid, mm_cidmask(mm)); t->mm_cid.cid = MM_CID_UNSET; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a17f04f075e1..31f2e431db5e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3559,7 +3559,7 @@ static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigne if (cid >= max_cids) return false; - if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) + if (test_and_set_bit(cid, mm_cidmask(mm))) return false; t->mm_cid.cid = t->mm_cid.last_cid = cid; __this_cpu_write(mm->mm_cid.pcpu->cid, cid); @@ -3582,7 +3582,7 @@ static inline bool mm_cid_get(struct task_struct *t) return true; /* Try the first zero bit in the cidmask. 
*/ - return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids); + return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids); } static inline void mm_cid_select(struct task_struct *t) @@ -3603,7 +3603,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n { if (prev->mm_cid.active) { if (prev->mm_cid.cid != MM_CID_UNSET) - cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm)); + clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm)); prev->mm_cid.cid = MM_CID_UNSET; } From 2b1642b881088bbf73fcb1147c474a198ec46729 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:05 +0100 Subject: [PATCH 45/54] signal: Move MMCID exit out of sighand lock There is no need anymore to keep this under sighand lock as the current code and the upcoming replacement are not depending on the exit state of a task anymore. That allows to use a mutex in the exit path. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.706439391@linutronix.de --- include/linux/sched.h | 4 ++-- kernel/exit.c | 1 + kernel/sched/core.c | 4 ++-- kernel/signal.c | 2 -- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 64f080d6ed6e..c411ae021bc5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2298,7 +2298,7 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid.cid; @@ -2307,7 +2307,7 @@ static inline int task_mm_cid(struct task_struct *t) static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..324616f690b7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -910,6 +910,7 @@ void __noreturn do_exit(long code) user_events_exit(tsk); io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a114b6f6a6f..3fdf90a7074d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10392,7 +10392,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); } -void sched_mm_cid_exit_signals(struct task_struct *t) +void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; @@ -10410,7 +10410,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t) /* Deactivate MM CID allocation across execve() */ void sched_mm_cid_before_execve(struct task_struct *t) { - sched_mm_cid_exit_signals(t); + sched_mm_cid_exit(t); } /* Reactivate MM CID after successful execve() */ diff --git a/kernel/signal.c b/kernel/signal.c index fe9190d84f28..e42b8bd6922f 100644 --- a/kernel/signal.c +++ 
b/kernel/signal.c @@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk) * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); From bf070520e398679cd582b3c3e44107bf22c143ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:07 +0100 Subject: [PATCH 46/54] sched/mmcid: Move initialization out of line It's getting bigger soon, so just move it out of line to the rest of the code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.769636491@linutronix.de --- include/linux/mm_types.h | 15 +-------------- kernel/sched/core.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bafb81b33922..3b7d05e7169c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1351,20 +1351,7 @@ static inline unsigned long *mm_cidmask(struct mm_struct *mm) return (unsigned long *)cid_bitmap; } -static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) -{ - int i; - - for_each_possible_cpu(i) { - struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i); - - pcpu->cid = MM_CID_UNSET; - } - mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; - raw_spin_lock_init(&mm->mm_cid.lock); - cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - bitmap_zero(mm_cidmask(mm), num_possible_cpus()); -} +void mm_init_cid(struct mm_struct *mm, struct task_struct *p); static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3fdf90a7074d..34b6c31eca3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10431,6 +10431,20 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET); t->mm_cid.active = 1; } + +void mm_init_cid(struct mm_struct *mm, struct task_struct *p) +{ + struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu; + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET; + + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + raw_spin_lock_init(&mm->mm_cid.lock); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); +} #else /* CONFIG_SCHED_MM_CID */ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } #endif /* !CONFIG_SCHED_MM_CID */ From b0c3d51b54f8a4f4c809432d210c0c983d5cd97e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:09 +0100 Subject: [PATCH 47/54] sched/mmcid: Provide precomputed maximal value Reading mm::mm_users and mm:::mm_cid::nr_cpus_allowed every time to compute the maximal CID value is just wasteful as that value is only changing on fork(), exit() and eventually when the affinity changes. So it can be easily precomputed at those points and provided in mm::mm_cid for consumption in the hot path. 
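A minimal sketch of that precomputation and its hot path consumer (sketch only; the @users counter it relies on is introduced further down):

    /* slow path: fork(), exit(), affinity change, under mm_cid.lock */
    max_cids = min(mm->mm_cid.nr_cpus_allowed, mm->mm_cid.users);
    WRITE_ONCE(mm->mm_cid.max_cids, max_cids);

    /* hot path: context switch just reads the precomputed value */
    max_cids = READ_ONCE(mm->mm_cid.max_cids);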
But there is an issue with using mm::mm_users for accounting because that does not necessarily reflect the number of user space tasks as other kernel code can take temporary references on the MM which skew the picture. Solve that by adding a users counter to struct mm_mm_cid, which is modified by fork() and exit() and used for precomputing under mm_mm_cid::lock. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.832764634@linutronix.de --- include/linux/rseq_types.h | 6 ++++ kernel/fork.c | 1 + kernel/sched/core.c | 59 +++++++++++++++++++++++++++----------- kernel/sched/sched.h | 3 +- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index d7e8071b626a..0fab369999b6 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -117,14 +117,20 @@ struct mm_cid_pcpu { /** * struct mm_mm_cid - Storage for per MM CID data * @pcpu: Per CPU storage for CIDs associated to a CPU + * @max_cids: The exclusive maximum CID value for allocation and convergence * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. + * @users: The number of tasks sharing this MM. Separate from mm::mm_users + * as that is modified by mmget()/mm_put() by other entities which + * do not actually share the MM. * @lock: Spinlock to protect all fields except @pcpu. It also protects * the MM cid cpumask and the MM cidmask bitmap. */ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; + unsigned int max_cids; unsigned int nr_cpus_allowed; + unsigned int users; raw_spinlock_t lock; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ diff --git a/kernel/fork.c b/kernel/fork.c index 74bc7c9f1bb3..6c23219e1169 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2455,6 +2455,7 @@ bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { + sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 34b6c31eca3a..f9295c42da22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4485,7 +4485,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; - init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -10371,15 +10370,27 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) #ifdef CONFIG_SCHED_MM_CID /* - * When a task exits, the MM CID held by the task is not longer required as - * the task cannot return to user space. + * Update the CID range properties when the constraints change. 
Invoked via + * fork(), exit() and affinity changes */ +static void mm_update_max_cids(struct mm_struct *mm) +{ + struct mm_mm_cid *mc = &mm->mm_cid; + unsigned int max_cids; + + lockdep_assert_held(&mm->mm_cid.lock); + + /* Calculate the new maximum constraint */ + max_cids = min(mc->nr_cpus_allowed, mc->users); + WRITE_ONCE(mc->max_cids, max_cids); +} + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { struct cpumask *mm_allowed; unsigned int weight; - if (!mm) + if (!mm || !READ_ONCE(mm->mm_cid.users)) return; /* @@ -10389,9 +10400,30 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu guard(raw_spinlock)(&mm->mm_cid.lock); mm_allowed = mm_cpus_allowed(mm); weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); + if (weight == mm->mm_cid.nr_cpus_allowed) + return; WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); + mm_update_max_cids(mm); } +void sched_mm_cid_fork(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); + + guard(raw_spinlock)(&mm->mm_cid.lock); + t->mm_cid.active = 1; + mm->mm_cid.users++; + /* Preset last_cid for mm_cid_select() */ + t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1; + mm_update_max_cids(mm); +} + +/* + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. + */ void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; @@ -10399,12 +10431,14 @@ void sched_mm_cid_exit(struct task_struct *t) if (!mm || !t->mm_cid.active) return; - guard(preempt)(); + guard(raw_spinlock)(&mm->mm_cid.lock); t->mm_cid.active = 0; + mm->mm_cid.users--; if (t->mm_cid.cid != MM_CID_UNSET) { clear_bit(t->mm_cid.cid, mm_cidmask(mm)); t->mm_cid.cid = MM_CID_UNSET; } + mm_update_max_cids(mm); } /* Deactivate MM CID allocation across execve() */ @@ -10416,22 +10450,11 @@ void sched_mm_cid_before_execve(struct task_struct *t) /* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { - struct mm_struct *mm = t->mm; - - if (!mm) - return; - + sched_mm_cid_fork(t); guard(preempt)(); - t->mm_cid.active = 1; mm_cid_select(t); } -void sched_mm_cid_fork(struct task_struct *t) -{ - WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET); - t->mm_cid.active = 1; -} - void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu; @@ -10440,7 +10463,9 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p) for_each_possible_cpu(cpu) per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET; + mm->mm_cid.max_cids = 0; mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + mm->mm_cid.users = 0; raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), num_possible_cpus()); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31f2e431db5e..d539fb269957 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3571,7 +3571,7 @@ static inline bool mm_cid_get(struct task_struct *t) struct mm_struct *mm = t->mm; unsigned int max_cids; - max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users)); + max_cids = READ_ONCE(mm->mm_cid.max_cids); /* Try to reuse the last CID of this task */ if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids)) @@ -3614,7 +3614,6 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n } #else /* !CONFIG_SCHED_MM_CID: */ -static inline 
void init_sched_mm_cid(struct task_struct *t) { } static inline void mm_cid_select(struct task_struct *t) { } static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ From 51dd92c71a38647803478fb81e1812286a8998b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:11 +0100 Subject: [PATCH 48/54] sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex Prepare for the new CID management scheme which puts the CID ownership transition into the fork() and exit() slow path by serializing sched_mm_cid_fork()/exit() with it, so task list and cpu mask walks can be done in interruptible and preemptible code. The contention on it is not worse than on other concurrency controls in the fork()/exit() machinery. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.895826703@linutronix.de --- include/linux/rseq_types.h | 2 ++ kernel/sched/core.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 0fab369999b6..574aba6fe97c 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -125,6 +125,7 @@ struct mm_cid_pcpu { * do not actually share the MM. * @lock: Spinlock to protect all fields except @pcpu. It also protects * the MM cid cpumask and the MM cidmask bitmap. + * @mutex: Mutex to serialize forks and exits related to this mm */ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; @@ -132,6 +133,7 @@ struct mm_mm_cid { unsigned int nr_cpus_allowed; unsigned int users; raw_spinlock_t lock; + struct mutex mutex; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f9295c42da22..01903cf03ab2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10369,6 +10369,25 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID +/* + * Concurrency IDentifier management + * + * Serialization rules: + * + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore + * protects mm::mm_cid::users. + * + * mm::mm_cid::lock: Serializes mm_update_max_cids() and + * mm_update_cpus_allowed(). Nests in mm_cid::mutex + * and runqueue lock. + * + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks + * and can only be modified with atomic operations. + * + * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue + * lock. + */ + /* * Update the CID range properties when the constraints change. 
Invoked via * fork(), exit() and affinity changes @@ -10412,6 +10431,7 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); + guard(mutex)(&mm->mm_cid.mutex); guard(raw_spinlock)(&mm->mm_cid.lock); t->mm_cid.active = 1; mm->mm_cid.users++; @@ -10431,6 +10451,7 @@ void sched_mm_cid_exit(struct task_struct *t) if (!mm || !t->mm_cid.active) return; + guard(mutex)(&mm->mm_cid.mutex); guard(raw_spinlock)(&mm->mm_cid.lock); t->mm_cid.active = 0; mm->mm_cid.users--; @@ -10467,6 +10488,7 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p) mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; mm->mm_cid.users = 0; raw_spin_lock_init(&mm->mm_cid.lock); + mutex_init(&mm->mm_cid.mutex); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), num_possible_cpus()); } From 23343b6b09acb4bf97f34ed60e135000ca57ede1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:12 +0100 Subject: [PATCH 49/54] sched/mmcid: Introduce per task/CPU ownership infrastructure The MM CID management has two fundamental requirements: 1) It has to guarantee that at no given point in time the same CID is used by concurrent tasks in userspace. 2) The CID space must not exceed the number of possible CPUs in a system. While most allocators (glibc, tcmalloc, jemalloc) do not care about that, there seems to be at least librseq depending on it. The CID space compaction itself is not a functional correctness requirement, it is only a useful optimization mechanism to reduce the memory foot print in unused user space pools. The optimal CID space is: min(nr_tasks, nr_cpus_allowed); Where @nr_tasks is the number of actual user space threads associated to the mm and @nr_cpus_allowed is the superset of all task affinities. It is growth only as it would be insane to take a racy snapshot of all task affinities when the affinity of one task changes just do redo it 2 milliseconds later when the next task changes its affinity. That means that as long as the number of tasks is lower or equal than the number of CPUs allowed, each task owns a CID. If the number of tasks exceeds the number of CPUs allowed it switches to per CPU mode, where the CPUs own the CIDs and the tasks borrow them as long as they are scheduled in. For transition periods CIDs can go beyond the optimal space as long as they don't go beyond the number of possible CPUs. The current upstream implementation adds overhead into task migration to keep the CID with the task. It also has to do the CID space consolidation work from a task work in the exit to user space path. As that work is assigned to a random task related to a MM this can inflict unwanted exit latencies. This can be done differently by implementing a strict CID ownership mechanism. Either the CIDs are owned by the tasks or by the CPUs. The latter provides less locality when tasks are heavily migrating, but there is no justification to optimize for overcommit scenarios and thereby penalizing everyone else. Provide the basic infrastructure to implement this: - Change the UNSET marker to BIT(31) from ~0U - Add the ONCPU marker as BIT(30) - Add the TRANSIT marker as BIT(29) That allows to check for ownership trivially and provides a simple check for UNSET as well. The TRANSIT marker is required to prevent CID space exhaustion when switching from per CPU to per task mode. 
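A condensed sketch of the resulting ownership checks (mirroring the helpers added below):

    #define MM_CID_UNSET	BIT(31)
    #define MM_CID_ONCPU	BIT(30)
    #define MM_CID_TRANSIT	BIT(29)

    bool on_cpu  = cid & MM_CID_ONCPU;		/* CID currently owned by a CPU */
    bool transit = cid & MM_CID_TRANSIT;	/* CID borrowed during a mode change */
    bool on_task = cid < MM_CID_TRANSIT;	/* none of the marker bits set */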
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251119172549.960252358@linutronix.de --- include/linux/rseq_types.h | 4 ++- include/linux/sched.h | 6 ++-- kernel/sched/core.c | 10 +++++++ kernel/sched/sched.h | 59 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 4 deletions(-) diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 574aba6fe97c..87854effe1ad 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -92,7 +92,9 @@ struct rseq_data { }; #ifdef CONFIG_SCHED_MM_CID -#define MM_CID_UNSET (~0U) +#define MM_CID_UNSET BIT(31) +#define MM_CID_ONCPU BIT(30) +#define MM_CID_TRANSIT BIT(29) /** * struct sched_mm_cid - Storage for per task MM CID data diff --git a/include/linux/sched.h b/include/linux/sched.h index c411ae021bc5..9eec409745f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2299,16 +2299,16 @@ void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) +static __always_inline int task_mm_cid(struct task_struct *t) { - return t->mm_cid.cid; + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) +static __always_inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 01903cf03ab2..55bb9c9ae32c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10386,6 +10386,16 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) * * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue * lock. + * + * CID ownership: + * + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the + * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, + * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the + * task needs to drop the CID into the pool when scheduling out. Both bits + * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is + * actually handed over to user space in the RSEQ memory. 
*/ /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d539fb269957..4b49284504fb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3540,6 +3540,65 @@ extern void sched_dynamic_update(int mode); extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID + +static __always_inline bool cid_on_cpu(unsigned int cid) +{ + return cid & MM_CID_ONCPU; +} + +static __always_inline bool cid_in_transit(unsigned int cid) +{ + return cid & MM_CID_TRANSIT; +} + +static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) +{ + return cid & ~MM_CID_ONCPU; +} + +static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) +{ + return cid | MM_CID_ONCPU; +} + +static __always_inline unsigned int cid_to_transit_cid(unsigned int cid) +{ + return cid | MM_CID_TRANSIT; +} + +static __always_inline unsigned int cid_from_transit_cid(unsigned int cid) +{ + return cid & ~MM_CID_TRANSIT; +} + +static __always_inline bool cid_on_task(unsigned int cid) +{ + /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */ + return cid < MM_CID_TRANSIT; +} + +static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid) +{ + clear_bit(cid, mm_cidmask(mm)); +} + +static __always_inline void mm_unset_cid_on_task(struct task_struct *t) +{ + unsigned int cid = t->mm_cid.cid; + + t->mm_cid.cid = MM_CID_UNSET; + if (cid_on_task(cid)) + mm_drop_cid(t->mm, cid); +} + +static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) +{ + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ + pcp->cid = cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); +} + +/* Active implementation */ static inline void init_sched_mm_cid(struct task_struct *t) { struct mm_struct *mm = t->mm; From 9a723ed7facff6955da8d64cc9de7066038036c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:14 +0100 Subject: [PATCH 50/54] sched/mmcid: Provide new scheduler CID mechanism The MM CID management has two fundamental requirements: 1) It has to guarantee that at no given point in time the same CID is used by concurrent tasks in userspace. 2) The CID space must not exceed the number of possible CPUs in a system. While most allocators (glibc, tcmalloc, jemalloc) do not care about that, there seems to be at least some LTTng library depending on it. The CID space compaction itself is not a functional correctness requirement, it is only a useful optimization mechanism to reduce the memory foot print in unused user space pools. The optimal CID space is: min(nr_tasks, nr_cpus_allowed); Where @nr_tasks is the number of actual user space threads associated to the mm and @nr_cpus_allowed is the superset of all task affinities. It is growth only as it would be insane to take a racy snapshot of all task affinities when the affinity of one task changes just do redo it 2 milliseconds later when the next task changes it's affinity. That means that as long as the number of tasks is lower or equal than the number of CPUs allowed, each task owns a CID. If the number of tasks exceeds the number of CPUs allowed it switches to per CPU mode, where the CPUs own the CIDs and the tasks borrow them as long as they are scheduled in. For transition periods CIDs can go beyond the optimal space as long as they don't go beyond the number of possible CPUs. The current upstream implementation adds overhead into task migration to keep the CID with the task. 
It also has to do the CID space consolidation work from a task work in the exit to user space path. As that work is assigned to a random task related to a MM this can inflict unwanted exit latencies. Implement the context switch parts of a strict ownership mechanism to address this. This removes most of the work from the task which schedules out. Only during transitioning from per CPU to per task ownership it is required to drop the CID when leaving the CPU to prevent CID space exhaustion. Other than that scheduling out is just a single check and branch. The task which schedules in has to check whether: 1) The ownership mode changed 2) The CID is within the optimal CID space In stable situations this results in zero work. The only short disruption is when ownership mode changes or when the associated CID is not in the optimal CID space. The latter only happens when tasks exit and therefore the optimal CID space shrinks. That mechanism is strictly optimized for the common case where no change happens. The only case where it actually causes a temporary one time spike is on mode changes when and only when a lot of tasks related to a MM schedule exactly at the same time and have eventually to compete on allocating a CID from the bitmap. In the sysbench test case which triggered the spinlock contention in the initial CID code, __schedule() drops significantly in perf top on a 128 Core (256 threads) machine when running sysbench with 255 threads, which fits into the task mode limit of 256 together with the parent thread: Upstream rseq/perf branch +CID rework 0.42% 0.37% 0.32% [k] __schedule Increasing the number of threads to 256, which puts the test process into per CPU mode looks about the same. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.023984859@linutronix.de --- include/linux/rseq.h | 8 +- include/linux/rseq_types.h | 18 +++-- kernel/sched/core.c | 2 + kernel/sched/sched.h | 150 ++++++++++++++++++++++++++++++++++++- 4 files changed, 168 insertions(+), 10 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bf8a6bf315f3..4c0e8bdd2dd9 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -73,13 +73,13 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) } /* - * Invoked from __set_task_cpu() when a task migrates to enforce an IDs - * update. + * Invoked from __set_task_cpu() when a task migrates or from + * mm_cid_schedin() when the CID changes to enforce an IDs update. * * This does not raise TIF_NOTIFY_RESUME as that happens in * rseq_sched_switch_event(). 
*/ -static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) +static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) { t->rseq.event.ids_changed = true; } @@ -168,7 +168,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } -static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } +static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 87854effe1ad..66b1482e1146 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -119,23 +119,31 @@ struct mm_cid_pcpu { /** * struct mm_mm_cid - Storage for per MM CID data * @pcpu: Per CPU storage for CIDs associated to a CPU + * @percpu: Set, when CIDs are in per CPU mode + * @transit: Set to MM_CID_TRANSIT during a mode change transition phase * @max_cids: The exclusive maximum CID value for allocation and convergence + * @lock: Spinlock to protect all fields except @pcpu. It also protects + * the MM cid cpumask and the MM cidmask bitmap. + * @mutex: Mutex to serialize forks and exits related to this mm * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. * @users: The number of tasks sharing this MM. Separate from mm::mm_users * as that is modified by mmget()/mm_put() by other entities which * do not actually share the MM. - * @lock: Spinlock to protect all fields except @pcpu. It also protects - * the MM cid cpumask and the MM cidmask bitmap. 
- * @mutex: Mutex to serialize forks and exits related to this mm */ struct mm_mm_cid { + /* Hotpath read mostly members */ struct mm_cid_pcpu __percpu *pcpu; + unsigned int percpu; + unsigned int transit; unsigned int max_cids; - unsigned int nr_cpus_allowed; - unsigned int users; + raw_spinlock_t lock; struct mutex mutex; + + /* Low frequency modified */ + unsigned int nr_cpus_allowed; + unsigned int users; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55bb9c9ae32c..659ae56b459f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10495,6 +10495,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p) per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET; mm->mm_cid.max_cids = 0; + mm->mm_cid.percpu = 0; + mm->mm_cid.transit = 0; mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; mm->mm_cid.users = 0; raw_spin_lock_init(&mm->mm_cid.lock); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4b49284504fb..82c7978d548e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2209,7 +2209,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; - rseq_sched_set_task_cpu(p, cpu); + rseq_sched_set_ids_changed(p); #endif /* CONFIG_SMP */ } @@ -3598,6 +3598,153 @@ static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_c mm_drop_cid(mm, pcp->cid); } +static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) +{ + unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids); + + if (cid >= max_cids) + return MM_CID_UNSET; + if (test_and_set_bit(cid, mm_cidmask(mm))) + return MM_CID_UNSET; + return cid; +} + +static inline unsigned int mm_get_cid(struct mm_struct *mm) +{ + unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); + + while (cid == MM_CID_UNSET) { + cpu_relax(); + cid = __mm_get_cid(mm, num_possible_cpus()); + } + return cid; +} + +static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid, + unsigned int max_cids) +{ + unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid); + + /* Is it in the optimal CID space? */ + if (likely(cid < max_cids)) + return orig_cid; + + /* Try to find one in the optimal space. Otherwise keep the provided. 
*/ + new_cid = __mm_get_cid(mm, max_cids); + if (new_cid != MM_CID_UNSET) { + mm_drop_cid(mm, cid); + /* Preserve the ONCPU mode of the original CID */ + return new_cid | (orig_cid & MM_CID_ONCPU); + } + return orig_cid; +} + +static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid) +{ + if (t->mm_cid.cid != cid) { + t->mm_cid.cid = cid; + rseq_sched_set_ids_changed(t); + } +} + +static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid) +{ + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); +} + +static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) +{ + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; + + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case where both have the ONCPU bit set */ + if (likely(cid_on_cpu(cpu_cid & tcid))) { + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { + mm_cid_update_task_cid(t, cpu_cid); + return; + } + /* Try to converge into the optimal CID space */ + cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids); + } else { + /* Hand over or drop the task owned CID */ + if (cid_on_task(tcid)) { + if (cid_on_cpu(cpu_cid)) + mm_unset_cid_on_task(t); + else + cpu_cid = cid_to_cpu_cid(tcid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_cpu(cpu_cid)) + cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); + } + mm_cid_update_pcpu_cid(mm, cpu_cid); + mm_cid_update_task_cid(t, cpu_cid); +} + +static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) +{ + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; + + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case, where both have the ONCPU bit clear */ + if (likely(cid_on_task(tcid | cpu_cid))) { + if (likely(tcid < max_cids)) { + mm_cid_update_pcpu_cid(mm, tcid); + return; + } + /* Try to converge into the optimal CID space */ + tcid = mm_cid_converge(mm, tcid, max_cids); + } else { + /* Hand over or drop the CPU owned CID */ + if (cid_on_cpu(cpu_cid)) { + if (cid_on_task(tcid)) + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); + else + tcid = cpu_cid_to_cid(cpu_cid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_task(tcid)) + tcid = mm_get_cid(mm); + /* Set the transition mode flag if required */ + tcid |= READ_ONCE(mm->mm_cid.transit); + } + mm_cid_update_pcpu_cid(mm, tcid); + mm_cid_update_task_cid(t, tcid); +} + +static __always_inline void mm_cid_schedin(struct task_struct *next) +{ + struct mm_struct *mm = next->mm; + unsigned int cpu_cid; + + if (!next->mm_cid.active) + return; + + cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); + if (likely(!READ_ONCE(mm->mm_cid.percpu))) + mm_cid_from_task(next, cpu_cid); + else + mm_cid_from_cpu(next, cpu_cid); +} + +static __always_inline void mm_cid_schedout(struct task_struct *prev) +{ + /* During mode transitions CIDs are temporary and need to be dropped */ + if (likely(!cid_in_transit(prev->mm_cid.cid))) + return; + + mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); + prev->mm_cid.cid = MM_CID_UNSET; +} + +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) +{ + mm_cid_schedout(prev); + mm_cid_schedin(next); +} + /* Active implementation */ static inline void init_sched_mm_cid(struct task_struct *t) { @@ -3675,6 +3822,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n #else /* !CONFIG_SCHED_MM_CID: */ static inline void 
mm_cid_select(struct task_struct *t) { }
 static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */

 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);

From fbd0e71dc370af73f6b316e4de9eed273dd90340 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 19 Nov 2025 18:27:16 +0100
Subject: [PATCH 51/54] sched/mmcid: Provide CID ownership mode fixup functions

CIDs are either owned by tasks or by CPUs. The ownership mode depends on
the number of tasks related to an MM and the number of CPUs on which these
tasks are theoretically allowed to run. Theoretically because that number
is the superset of CPU affinities of all tasks, which only grows and never
shrinks.

Switching to per CPU mode happens when the user count becomes greater than
the maximum number of CIDs, which is calculated by:

  opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
  max_cids = min(1.25 * opt_cids, nr_cpu_ids);

The +25% allowance is useful for tight CPU masks in scenarios where only a
few threads are created and destroyed, to avoid frequent mode switches.
This allowance shrinks the closer opt_cids comes to nr_cpu_ids, which is
the (unfortunate) hard ABI limit.

At the point of switching to per CPU mode the new user is not yet visible
in the system, so the task which initiated the fork() runs the fixup
function: mm_cid_fixup_tasks_to_cpus() walks the thread list and either
transfers each task's owned CID to the CPU the task runs on or drops it
into the CID pool if a task is not on a CPU at that point in time. Tasks
which schedule in before the task walk reaches them do the handover in
mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes, it is
guaranteed that no task related to that MM owns a CID anymore.

Switching back to task mode happens when the user count goes below the
threshold which was recorded on the per CPU mode switch:

  pcpu_thrs = min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2);

This threshold is updated when an affinity change increases the number of
allowed CPUs for the MM, which might cause a switch back to per task mode.

If the switch back was initiated by an exiting task, then that task runs
the fixup function. If it was initiated by an affinity change, then it is
run either in the deferred update function in the context of a workqueue,
by a task which forks a new one, or by a task which exits, whichever
happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible
CPUs and either transfers each CPU owned CID to a related task which runs
on that CPU or drops it into the pool. Tasks which schedule in on a CPU
which the walk did not cover yet do the handover themselves.

This transition from CPU to per task ownership happens in two phases:

 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task
    CID and denotes that the CID is only temporarily owned by the task.
    When it schedules out the task drops the CID back into the pool if
    this bit is set.

 2) The initiating context walks the per CPU space and after completion
    clears mm:mm_cid.transit. After that point the CIDs are strictly task
    owned again.

This two phase transition is required to prevent CID space exhaustion
during the transition as a direct transfer of ownership would fail if two
tasks are scheduled in on the same CPU before the fixup freed per CPU
CIDs.

When mm_cid_fixup_cpus_to_tasks() completes, it is guaranteed that no CID
related to that MM is owned by a CPU anymore.
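[ For illustration only, not part of the patch: using the kernel's min()/max()
  helpers, the two calculations above come down to the integer arithmetic
  below, mirroring __mm_update_max_cids() and mm_cid_calc_pcpu_thrs() in the
  diff. The hardcoded 64 stands in for nr_cpu_ids. ]

	static unsigned int calc_max_cids(unsigned int nr_cpus_allowed, unsigned int users)
	{
		unsigned int opt_cids = min(nr_cpus_allowed, users);

		/* +25% allowance, capped at the number of possible CPUs */
		return min(opt_cids + opt_cids / 4, 64U);
	}

	static unsigned int calc_pcpu_thrs(unsigned int nr_cpus_allowed, unsigned int users)
	{
		unsigned int opt_cids = min(nr_cpus_allowed, users);

		/* At least 1, because 0 means per CPU mode is off */
		return max(min(opt_cids - opt_cids / 4, 64U / 2), 1U);
	}

	/*
	 * Example: nr_cpus_allowed = 8, users = 13, nr_cpu_ids = 64
	 *   opt_cids = 8, max_cids = 10 -> 13 users > 10: switch to per CPU mode
	 *   pcpu_thrs = 6               -> switch back once users drops below 6
	 */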
Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Reviewed-by: Mathieu Desnoyers
Link: https://patch.msgid.link/20251119172550.088189028@linutronix.de
---
 include/linux/rseq_types.h |   7 +-
 kernel/sched/core.c        | 278 +++++++++++++++++++++++++++++++++----
 2 files changed, 259 insertions(+), 26 deletions(-)

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 66b1482e1146..a3a4f3f10862 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -122,14 +122,15 @@ struct mm_cid_pcpu {
  * @percpu: Set, when CIDs are in per CPU mode
  * @transit: Set to MM_CID_TRANSIT during a mode change transition phase
  * @max_cids: The exclusive maximum CID value for allocation and convergence
- * @lock: Spinlock to protect all fields except @pcpu. It also protects
- *        the MM cid cpumask and the MM cidmask bitmap.
+ * @lock: Spinlock to protect against affinity setting which can't take @mutex
  * @mutex: Mutex to serialize forks and exits related to this mm
  * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
  *                   is growth only.
  * @users: The number of tasks sharing this MM. Separate from mm::mm_users
  *         as that is modified by mmget()/mm_put() by other entities which
  *         do not actually share the MM.
+ * @pcpu_thrs: Threshold for switching back from per CPU mode
+ * @update_deferred: A deferred switch back to per task mode is pending.
  */
 struct mm_mm_cid {
 	/* Hotpath read mostly members */
@@ -144,6 +145,8 @@ struct mm_mm_cid {
 	/* Low frequency modified */
 	unsigned int nr_cpus_allowed;
 	unsigned int users;
+	unsigned int pcpu_thrs;
+	unsigned int update_deferred;
 }____cacheline_aligned_in_smp;
 #else /* CONFIG_SCHED_MM_CID */
 struct mm_mm_cid { };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 659ae56b459f..eb0d59df8acc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10396,43 +10396,270 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  * task needs to drop the CID into the pool when scheduling out. Both bits
  * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
  * actually handed over to user space in the RSEQ memory.
+ *
+ * Mode switching:
+ *
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
+ *
+ *   opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ *   max_cids = min(1.25 * opt_cids, num_possible_cpus());
+ *
+ * The +25% allowance is useful for tight CPU masks in scenarios where only
+ * a few threads are created and destroyed, to avoid frequent mode
+ * switches. This allowance shrinks the closer opt_cids comes to
+ * num_possible_cpus(), which is the (unfortunate) hard ABI limit.
+ *
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either transfers each task's owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes,
+ * it is guaranteed that no task related to that MM owns a CID anymore.
+ *
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
+ *
+ *   pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
+ *
+ * This threshold is updated when an affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
+ *
+ * If the switch back was initiated by an exiting task, then that task runs
+ * the fixup function. If it was initiated by an affinity change, then it is
+ * run either in the deferred update function in the context of a workqueue or
+ * by a task which forks a new one or by a task which exits, whichever
+ * happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible
+ * CPUs and either transfers the CPU owned CID to a related task which
+ * runs on the CPU or drops it into the pool. Tasks which schedule in on a
+ * CPU which the walk did not cover yet do the handover themselves.
+ *
+ * This transition from CPU to per task ownership happens in two phases:
+ *
+ * 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task
+ *    CID and denotes that the CID is only temporarily owned by the
+ *    task. When it schedules out the task drops the CID back into the
+ *    pool if this bit is set.
+ *
+ * 2) The initiating context walks the per CPU space and after completion
+ *    clears mm:mm_cid.transit. So after that point the CIDs are strictly
+ *    task owned again.
+ *
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail if
+ * two tasks are scheduled in on the same CPU before the fixup freed per
+ * CPU CIDs.
+ *
+ * When mm_cid_fixup_cpus_to_tasks() completes, it is guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
  */

 /*
  * Update the CID range properties when the constraints change. Invoked via
  * fork(), exit() and affinity changes
  */
-static void mm_update_max_cids(struct mm_struct *mm)
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
+{
+	unsigned int opt_cids, max_cids;
+
+	/* Calculate the new optimal constraint */
+	opt_cids = min(mc->nr_cpus_allowed, mc->users);
+
+	/* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
+	max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
+	WRITE_ONCE(mc->max_cids, max_cids);
+}
+
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+	unsigned int opt_cids;
+
+	opt_cids = min(mc->nr_cpus_allowed, mc->users);
+	/* Has to be at least 1 because 0 indicates PCPU mode off */
+	return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
+}
+
+static bool mm_update_max_cids(struct mm_struct *mm)
 {
 	struct mm_mm_cid *mc = &mm->mm_cid;
-	unsigned int max_cids;

 	lockdep_assert_held(&mm->mm_cid.lock);

-	/* Calculate the new maximum constraint */
-	max_cids = min(mc->nr_cpus_allowed, mc->users);
-	WRITE_ONCE(mc->max_cids, max_cids);
+	/* Clear deferred mode switch flag. A change is handled by the caller */
+	mc->update_deferred = false;
+	__mm_update_max_cids(mc);
+
+	/* Check whether owner mode must be changed */
+	if (!mc->percpu) {
+		/* Enable per CPU mode when the number of users is above max_cids */
+		if (mc->users > mc->max_cids)
+			mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+	} else {
+		/* Switch back to per task if user count under threshold */
+		if (mc->users < mc->pcpu_thrs)
+			mc->pcpu_thrs = 0;
+	}
+
+	/* Mode change required?
*/ + if (!!mc->percpu == !!mc->pcpu_thrs) + return false; + /* When switching back to per TASK mode, set the transition flag */ + if (!mc->pcpu_thrs) + WRITE_ONCE(mc->transit, MM_CID_TRANSIT); + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); + return true; } static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { struct cpumask *mm_allowed; + struct mm_mm_cid *mc; unsigned int weight; if (!mm || !READ_ONCE(mm->mm_cid.users)) return; - /* * mm::mm_cid::mm_cpus_allowed is the superset of each threads * allowed CPUs mask which means it can only grow. */ - guard(raw_spinlock)(&mm->mm_cid.lock); + mc = &mm->mm_cid; + guard(raw_spinlock)(&mc->lock); mm_allowed = mm_cpus_allowed(mm); weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); - if (weight == mm->mm_cid.nr_cpus_allowed) + if (weight == mc->nr_cpus_allowed) return; - WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); - mm_update_max_cids(mm); + + WRITE_ONCE(mc->nr_cpus_allowed, weight); + __mm_update_max_cids(mc); + if (!mc->percpu) + return; + + /* Adjust the threshold to the wider set */ + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + + /* Scheduling of deferred mode switch goes here */ +} + +static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) +{ + if (cid_on_cpu(t->mm_cid.cid)) { + unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); + + t->mm_cid.cid = cid_to_transit_cid(cid); + pcp->cid = t->mm_cid.cid; + } +} + +static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +{ + unsigned int cpu; + + /* Walk the CPUs and fixup all stale CIDs */ + for_each_possible_cpu(cpu) { + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); + struct rq *rq = cpu_rq(cpu); + + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(rq_lock_irq)(rq); + /* Is the CID still owned by the CPU? */ + if (cid_on_cpu(pcp->cid)) { + /* + * If rq->curr has @mm, transfer it with the + * transition bit set. Otherwise drop it. 
+			 */
+			if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+				mm_cid_transit_to_task(rq->curr, pcp);
+			else
+				mm_drop_cid_on_cpu(mm, pcp);
+
+		} else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
+			unsigned int cid = rq->curr->mm_cid.cid;
+
+			/* Ensure it has the transition bit set */
+			if (!cid_in_transit(cid)) {
+				cid = cid_to_transit_cid(cid);
+				rq->curr->mm_cid.cid = cid;
+				pcp->cid = cid;
+			}
+		}
+	}
+	/* Clear the transition bit */
+	WRITE_ONCE(mm->mm_cid.transit, 0);
+}
+
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+	if (cid_on_task(t->mm_cid.cid)) {
+		t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+		pcp->cid = t->mm_cid.cid;
+	}
+}
+
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+{
+	/* Remote access to mm::mm_cid::pcpu requires rq_lock */
+	guard(task_rq_lock)(t);
+	/* If the task is not active it is not in the users count */
+	if (!t->mm_cid.active)
+		return false;
+	if (cid_on_task(t->mm_cid.cid)) {
+		/* If running on the CPU, transfer the CID, otherwise drop it */
+		if (task_rq(t)->curr == t)
+			mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+		else
+			mm_unset_cid_on_task(t);
+	}
+	return true;
+}
+
+static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+{
+	struct mm_struct *mm = current->mm;
+	struct task_struct *p, *t;
+	unsigned int users;
+
+	/*
+	 * This can obviously race with a concurrent affinity change, which
+	 * increases the number of allowed CPUs for this mm, but that does
+	 * not affect the mode and only changes the CID constraints. A
+	 * possible switch back to per task mode happens either in the
+	 * deferred handler function or in the next fork()/exit().
+	 *
+	 * The caller has already transferred its own CID. The newly incoming
+	 * task is already accounted for, but not yet visible.
+	 */
+	users = mm->mm_cid.users - 2;
+	if (!users)
+		return;
+
+	guard(rcu)();
+	for_other_threads(current, t) {
+		if (mm_cid_fixup_task_to_cpu(t, mm))
+			users--;
+	}
+
+	if (!users)
+		return;
+
+	/* Happens only for CLONE_VM processes.
*/ + for_each_process_thread(p, t) { + if (t == current || t->mm != mm) + continue; + if (mm_cid_fixup_task_to_cpu(t, mm)) { + if (--users == 0) + return; + } + } +} + +static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) +{ + t->mm_cid.active = 1; + mm->mm_cid.users++; + return mm_update_max_cids(mm); } void sched_mm_cid_fork(struct task_struct *t) @@ -10442,12 +10669,19 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); guard(mutex)(&mm->mm_cid.mutex); - guard(raw_spinlock)(&mm->mm_cid.lock); - t->mm_cid.active = 1; - mm->mm_cid.users++; - /* Preset last_cid for mm_cid_select() */ - t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1; - mm_update_max_cids(mm); + scoped_guard(raw_spinlock, &mm->mm_cid.lock) { + sched_mm_cid_add_user(t, mm); + /* Preset last_cid for mm_cid_select() */ + t->mm_cid.last_cid = mm->mm_cid.max_cids - 1; + } +} + +static bool sched_mm_cid_remove_user(struct task_struct *t) +{ + t->mm_cid.active = 0; + mm_unset_cid_on_task(t); + t->mm->mm_cid.users--; + return mm_update_max_cids(t->mm); } /* @@ -10462,14 +10696,8 @@ void sched_mm_cid_exit(struct task_struct *t) return; guard(mutex)(&mm->mm_cid.mutex); - guard(raw_spinlock)(&mm->mm_cid.lock); - t->mm_cid.active = 0; - mm->mm_cid.users--; - if (t->mm_cid.cid != MM_CID_UNSET) { - clear_bit(t->mm_cid.cid, mm_cidmask(mm)); - t->mm_cid.cid = MM_CID_UNSET; - } - mm_update_max_cids(mm); + scoped_guard(raw_spinlock, &mm->mm_cid.lock) + sched_mm_cid_remove_user(t); } /* Deactivate MM CID allocation across execve() */ @@ -10499,6 +10727,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p) mm->mm_cid.transit = 0; mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; mm->mm_cid.users = 0; + mm->mm_cid.pcpu_thrs = 0; + mm->mm_cid.update_deferred = 0; raw_spin_lock_init(&mm->mm_cid.lock); mutex_init(&mm->mm_cid.mutex); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); From c809f081fe400cb1b9898f4791c0d33146315161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:18 +0100 Subject: [PATCH 52/54] irqwork: Move data struct to a types header ... to avoid header recursion hell. 
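[ For illustration only, not part of the patch: the generic pattern, with
  made-up foo/bar names. A *_types.h header carries only the structure
  layout, so other headers can embed the struct without pulling in the full
  API header, which may itself depend on them: ]

	/* foo_types.h - layout only, no API, no heavy includes */
	#ifndef _FOO_TYPES_H
	#define _FOO_TYPES_H
	struct foo {
		int x;
	};
	#endif

	/* bar.h - embeds struct foo; including foo.h here could recurse,
	 * because foo.h in turn needs bar.h for its prototypes. */
	#ifndef _BAR_H
	#define _BAR_H
	#include "foo_types.h"
	struct bar {
		struct foo f;
	};
	#endif

	/* foo.h - the full API, for code that actually calls foo functions */
	#ifndef _FOO_H
	#define _FOO_H
	#include "foo_types.h"
	#include "bar.h"
	void foo_init(struct foo *f, struct bar *b);
	#endif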
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.152813625@linutronix.de --- include/linux/irq_work.h | 9 ++------- include/linux/irq_work_types.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 7 deletions(-) create mode 100644 include/linux/irq_work_types.h diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 136f2980cba3..c5afd053ae32 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -2,8 +2,9 @@ #ifndef _LINUX_IRQ_WORK_H #define _LINUX_IRQ_WORK_H -#include +#include #include +#include /* * An entry can be in one of four states: @@ -14,12 +15,6 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed */ -struct irq_work { - struct __call_single_node node; - void (*func)(struct irq_work *); - struct rcuwait irqwait; -}; - #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ diff --git a/include/linux/irq_work_types.h b/include/linux/irq_work_types.h new file mode 100644 index 000000000000..73abec5bb06e --- /dev/null +++ b/include/linux/irq_work_types.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQ_WORK_TYPES_H +#define _LINUX_IRQ_WORK_TYPES_H + +#include +#include + +struct irq_work { + struct __call_single_node node; + void (*func)(struct irq_work *); + struct rcuwait irqwait; +}; + +#endif From 9da6ccbcea3de1fa704202e3346fe6c0226bfc18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:20 +0100 Subject: [PATCH 53/54] sched/mmcid: Implement deferred mode change When affinity changes cause an increase of the number of CPUs allowed for tasks which are related to a MM, that might results in a situation where the ownership mode can go back from per CPU mode to per task mode. As affinity changes happen with runqueue lock held there is no way to do the actual mode change and required fixup right there. Add the infrastructure to defer it to a workqueue. The scheduled work can race with a fork() or exit(). Whatever happens first takes care of it. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.216484739@linutronix.de --- include/linux/rseq_types.h | 8 +++++ kernel/sched/core.c | 60 +++++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 8 deletions(-) diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a3a4f3f10862..81fbb8885e8d 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -2,7 +2,9 @@ #ifndef _LINUX_RSEQ_TYPES_H #define _LINUX_RSEQ_TYPES_H +#include #include +#include #ifdef CONFIG_RSEQ struct rseq; @@ -122,6 +124,8 @@ struct mm_cid_pcpu { * @percpu: Set, when CIDs are in per CPU mode * @transit: Set to MM_CID_TRANSIT during a mode change transition phase * @max_cids: The exclusive maximum CID value for allocation and convergence + * @irq_work: irq_work to handle the affinity mode change case + * @work: Regular work to handle the affinity mode change case * @lock: Spinlock to protect against affinity setting which can't take @mutex * @mutex: Mutex to serialize forks and exits related to this mm * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map @@ -139,6 +143,10 @@ struct mm_mm_cid { unsigned int transit; unsigned int max_cids; + /* Rarely used. 
Moves @lock and @mutex into the second cacheline */ + struct irq_work irq_work; + struct work_struct work; + raw_spinlock_t lock; struct mutex mutex; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eb0d59df8acc..cbb543a6efda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10539,8 +10539,17 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu /* Adjust the threshold to the wider set */ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + /* Switch back to per task mode? */ + if (mc->users >= mc->pcpu_thrs) + return; - /* Scheduling of deferred mode switch goes here */ + /* Don't queue twice */ + if (mc->update_deferred) + return; + + /* Queue the irq work, which schedules the real work */ + mc->update_deferred = true; + irq_work_queue(&mc->irq_work); } static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) @@ -10553,7 +10562,7 @@ static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_p } } -static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) { unsigned int cpu; @@ -10714,14 +10723,47 @@ void sched_mm_cid_after_execve(struct task_struct *t) mm_cid_select(t); } +static void mm_cid_work_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); + + /* Make it compile, but not functional yet */ + if (!IS_ENABLED(CONFIG_NEW_MM_CID)) + return; + + guard(mutex)(&mm->mm_cid.mutex); + /* Did the last user task exit already? */ + if (!mm->mm_cid.users) + return; + + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Have fork() or exit() handled it already? */ + if (!mm->mm_cid.update_deferred) + return; + /* This clears mm_cid::update_deferred */ + if (!mm_update_max_cids(mm)) + return; + /* Affinity changes can only switch back to task mode */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return; + } + mm_cid_fixup_cpus_to_tasks(mm); +} + +static void mm_cid_irq_work(struct irq_work *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); + + /* + * Needs to be unconditional because mm_cid::lock cannot be held + * when scheduling work as mm_update_cpus_allowed() nests inside + * rq::lock and schedule_work() might end up in wakeup... + */ + schedule_work(&mm->mm_cid.work); +} + void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { - struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu; - int cpu; - - for_each_possible_cpu(cpu) - per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET; - mm->mm_cid.max_cids = 0; mm->mm_cid.percpu = 0; mm->mm_cid.transit = 0; @@ -10731,6 +10773,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p) mm->mm_cid.update_deferred = 0; raw_spin_lock_init(&mm->mm_cid.lock); mutex_init(&mm->mm_cid.mutex); + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), num_possible_cpus()); } From 653fda7ae73d8033dedb65537acac0c2c287dc3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:22 +0100 Subject: [PATCH 54/54] sched/mmcid: Switch over to the new mechanism Now that all pieces are in place, change the implementations of sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict ownership scheme and switch context_switch() over to use the new mm_cid_schedin() functionality. 
The common case is that there is no mode change required, which makes fork() and exit() just update the user count and the constraints. In case that a new user would exceed the CID space limit the fork() context handles the transition to per CPU mode with mm::mm_cid::mutex held. exit() handles the transition back to per task mode when the user count drops below the switch back threshold. fork() might also be forced to handle a deferred switch back to per task mode, when a affinity change increased the number of allowed CPUs enough. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de --- include/linux/rseq.h | 19 ------ include/linux/rseq_types.h | 8 +-- kernel/fork.c | 1 - kernel/sched/core.c | 115 +++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 76 ------------------------ 5 files changed, 103 insertions(+), 116 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 4c0e8bdd2dd9..2266f4dc77b6 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -84,24 +84,6 @@ static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) t->rseq.event.ids_changed = true; } -/* - * Invoked from switch_mm_cid() in context switch when the task gets a MM - * CID assigned. - * - * This does not raise TIF_NOTIFY_RESUME as that happens in - * rseq_sched_switch_event(). - */ -static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) -{ - /* - * Requires a comparison as the switch_mm_cid() code does not - * provide a conditional for it readily. So avoid excessive updates - * when nothing changes. - */ - if (t->rseq.ids.mm_cid != cid) - t->rseq.event.ids_changed = true; -} - /* Enforce a full update after RSEQ registration and when execve() failed */ static inline void rseq_force_update(void) { @@ -169,7 +151,6 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } -static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 81fbb8885e8d..332dc14b81c9 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -101,18 +101,18 @@ struct rseq_data { }; /** * struct sched_mm_cid - Storage for per task MM CID data * @active: MM CID is active for the task - * @cid: The CID associated to the task - * @last_cid: The last CID associated to the task + * @cid: The CID associated to the task either permanently or + * borrowed from the CPU */ struct sched_mm_cid { unsigned int active; unsigned int cid; - unsigned int last_cid; }; /** * struct mm_cid_pcpu - Storage for per CPU MM_CID data - * @cid: The CID associated to the CPU + * @cid: The CID associated to the CPU either permanently or + * while a task with a CID is running */ struct mm_cid_pcpu { unsigned int cid; diff --git a/kernel/fork.c b/kernel/fork.c index 6c23219e1169..8475958e029b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -956,7 +956,6 @@ static struct task_struct 
*dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid.cid = MM_CID_UNSET; - tsk->mm_cid.last_cid = MM_CID_UNSET; tsk->mm_cid.active = 0; #endif return tsk; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cbb543a6efda..62235f1dc04e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5307,7 +5307,7 @@ context_switch(struct rq *rq, struct task_struct *prev, } } - switch_mm_cid(prev, next); + mm_cid_switch_to(prev, next); /* * Tell rseq that the task was scheduled in. Must be after @@ -10624,7 +10624,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm return true; } -static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void) +static void mm_cid_fixup_tasks_to_cpus(void) { struct mm_struct *mm = current->mm; struct task_struct *p, *t; @@ -10674,25 +10674,81 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) void sched_mm_cid_fork(struct task_struct *t) { struct mm_struct *mm = t->mm; + bool percpu; WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); guard(mutex)(&mm->mm_cid.mutex); - scoped_guard(raw_spinlock, &mm->mm_cid.lock) { - sched_mm_cid_add_user(t, mm); - /* Preset last_cid for mm_cid_select() */ - t->mm_cid.last_cid = mm->mm_cid.max_cids - 1; + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); + + /* First user ? */ + if (!mm->mm_cid.users) { + sched_mm_cid_add_user(t, mm); + t->mm_cid.cid = mm_get_cid(mm); + /* Required for execve() */ + pcp->cid = t->mm_cid.cid; + return; + } + + if (!sched_mm_cid_add_user(t, mm)) { + if (!mm->mm_cid.percpu) + t->mm_cid.cid = mm_get_cid(mm); + return; + } + + /* Handle the mode change and transfer current's CID */ + percpu = !!mm->mm_cid.percpu; + if (!percpu) + mm_cid_transit_to_task(current, pcp); + else + mm_cid_transfer_to_cpu(current, pcp); + } + + if (percpu) { + mm_cid_fixup_tasks_to_cpus(); + } else { + mm_cid_fixup_cpus_to_tasks(mm); + t->mm_cid.cid = mm_get_cid(mm); } } static bool sched_mm_cid_remove_user(struct task_struct *t) { t->mm_cid.active = 0; - mm_unset_cid_on_task(t); + scoped_guard(preempt) { + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + } t->mm->mm_cid.users--; return mm_update_max_cids(t->mm); } +static bool __sched_mm_cid_exit(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + + if (!sched_mm_cid_remove_user(t)) + return false; + /* + * Contrary to fork() this only deals with a switch back to per + * task mode either because the above decreased users or an + * affinity change increased the number of allowed CPUs and the + * deferred fixup did not run yet. + */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return false; + /* + * A failed fork(2) cleanup never gets here, so @current must have + * the same MM as @t. That's true for exit() and the failed + * pthread_create() cleanup case. + */ + if (WARN_ON_ONCE(current->mm != mm)) + return false; + return true; +} + /* * When a task exits, the MM CID held by the task is not longer required as * the task cannot return to user space. @@ -10703,10 +10759,43 @@ void sched_mm_cid_exit(struct task_struct *t) if (!mm || !t->mm_cid.active) return; + /* + * Ensure that only one instance is doing MM CID operations within + * a MM. The common case is uncontended. The rare fixup case adds + * some overhead. 
+ */ + scoped_guard(mutex, &mm->mm_cid.mutex) { + /* mm_cid::mutex is sufficient to protect mm_cid::users */ + if (likely(mm->mm_cid.users > 1)) { + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + if (!__sched_mm_cid_exit(t)) + return; + /* Mode change required. Transfer currents CID */ + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); + } + mm_cid_fixup_cpus_to_tasks(mm); + return; + } + /* Last user */ + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Required across execve() */ + if (t == current) + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); + /* Ignore mode change. There is nothing to do. */ + sched_mm_cid_remove_user(t); + } + } - guard(mutex)(&mm->mm_cid.mutex); - scoped_guard(raw_spinlock, &mm->mm_cid.lock) - sched_mm_cid_remove_user(t); + /* + * As this is the last user (execve(), process exit or failed + * fork(2)) there is no concurrency anymore. + * + * Synchronize eventually pending work to ensure that there are no + * dangling references left. @t->mm_cid.users is zero so nothing + * can queue this work anymore. + */ + irq_work_sync(&mm->mm_cid.irq_work); + cancel_work_sync(&mm->mm_cid.work); } /* Deactivate MM CID allocation across execve() */ @@ -10719,18 +10808,12 @@ void sched_mm_cid_before_execve(struct task_struct *t) void sched_mm_cid_after_execve(struct task_struct *t) { sched_mm_cid_fork(t); - guard(preempt)(); - mm_cid_select(t); } static void mm_cid_work_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); - /* Make it compile, but not functional yet */ - if (!IS_ENABLED(CONFIG_NEW_MM_CID)) - return; - guard(mutex)(&mm->mm_cid.mutex); /* Did the last user task exit already? */ if (!mm->mm_cid.users) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 82c7978d548e..f9d0515db130 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3745,83 +3745,7 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct mm_cid_schedin(next); } -/* Active implementation */ -static inline void init_sched_mm_cid(struct task_struct *t) -{ - struct mm_struct *mm = t->mm; - unsigned int max_cid; - - if (!mm) - return; - - /* Preset last_mm_cid */ - max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users)); - t->mm_cid.last_cid = max_cid - 1; -} - -static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids) -{ - struct mm_struct *mm = t->mm; - - if (cid >= max_cids) - return false; - if (test_and_set_bit(cid, mm_cidmask(mm))) - return false; - t->mm_cid.cid = t->mm_cid.last_cid = cid; - __this_cpu_write(mm->mm_cid.pcpu->cid, cid); - return true; -} - -static inline bool mm_cid_get(struct task_struct *t) -{ - struct mm_struct *mm = t->mm; - unsigned int max_cids; - - max_cids = READ_ONCE(mm->mm_cid.max_cids); - - /* Try to reuse the last CID of this task */ - if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids)) - return true; - - /* Try to reuse the last CID of this mm on this CPU */ - if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids)) - return true; - - /* Try the first zero bit in the cidmask. */ - return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids); -} - -static inline void mm_cid_select(struct task_struct *t) -{ - /* - * mm_cid_get() can fail when the maximum CID, which is determined - * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently. 
- * That's a transient failure as there cannot be more tasks - * concurrently on a CPU (or about to be scheduled in) than that. - */ - for (;;) { - if (mm_cid_get(t)) - break; - } -} - -static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) -{ - if (prev->mm_cid.active) { - if (prev->mm_cid.cid != MM_CID_UNSET) - clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm)); - prev->mm_cid.cid = MM_CID_UNSET; - } - - if (next->mm_cid.active) { - mm_cid_select(next); - rseq_sched_set_task_mm_cid(next, next->mm_cid.cid); - } -} - #else /* !CONFIG_SCHED_MM_CID: */ -static inline void mm_cid_select(struct task_struct *t) { } -static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */
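[ For context, an illustration only and not part of the series: the CID
  managed here is what user space eventually reads from the rseq area as
  rseq::mm_cid. A minimal consumer sketch, assuming a glibc (2.35+) that
  registers rseq and exposes __rseq_offset/__rseq_size, kernel headers that
  define the mm_cid field, and a compiler providing
  __builtin_thread_pointer(): ]

	#include <stdio.h>
	#include <sys/rseq.h>	/* __rseq_offset, __rseq_size, struct rseq */

	/* Read the current concurrency ID of this thread's MM */
	static inline unsigned int my_mm_cid(void)
	{
		struct rseq *rs = (struct rseq *)
			((char *)__builtin_thread_pointer() + __rseq_offset);

		/* Relaxed read; the kernel updates the field on schedule */
		return __atomic_load_n(&rs->mm_cid, __ATOMIC_RELAXED);
	}

	int main(void)
	{
		if (!__rseq_size) {
			fprintf(stderr, "rseq area not registered\n");
			return 1;
		}
		/*
		 * The CID is a small dense index, bounded roughly by
		 * min(number of threads, number of allowed CPUs), which
		 * makes it suitable for indexing per-slot data structures.
		 */
		printf("mm_cid: %u\n", my_mm_cid());
		return 0;
	}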