Merge tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

 - Fix an inverted true/false comment on task_no_new_privs, from the
   BPF filtering changes merged in this release

 - Use the migration-disabling way of running the BPF filters, as the
   io_uring side doesn't already disable migration itself

 - Fix an issue with ->rings stability under resize, both for local
   task_work additions and for eventfd signaling

 - Fix an issue with SQE mixed mode, where a bounds check wasn't correct
   for having a 128b SQE

 - Fix an issue where a legacy provided buffer group is changed to a
   ring mapped one while legacy buffers from that group are in flight

* tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/kbuf: check if target buffer list is still legacy on recycle
  io_uring: fix physical SQE bounds check for SQE_MIXED 128-byte ops
  io_uring/eventfd: use ctx->rings_rcu for flags checking
  io_uring: ensure ctx->rings is stable for task work flags manipulation
  io_uring/bpf_filter: use bpf_prog_run_pin_on_cpu() to prevent migration
  io_uring/register: fix comment about task_no_new_privs
This commit is contained in:
Linus Torvalds
2026-03-13 10:09:35 -07:00
7 changed files with 56 additions and 11 deletions

View File

@@ -388,6 +388,7 @@ struct io_ring_ctx {
* regularly bounce b/w CPUs. * regularly bounce b/w CPUs.
*/ */
struct { struct {
struct io_rings __rcu *rings_rcu;
struct llist_head work_llist; struct llist_head work_llist;
struct llist_head retry_llist; struct llist_head retry_llist;
unsigned long check_cq; unsigned long check_cq;

View File

@@ -85,7 +85,7 @@ int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
do { do {
if (filter == &dummy_filter) if (filter == &dummy_filter)
return -EACCES; return -EACCES;
ret = bpf_prog_run(filter->prog, &bpf_ctx); ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx);
if (!ret) if (!ret)
return -EACCES; return -EACCES;
filter = filter->next; filter = filter->next;

View File

@@ -76,11 +76,15 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{ {
bool skip = false; bool skip = false;
struct io_ev_fd *ev_fd; struct io_ev_fd *ev_fd;
struct io_rings *rings;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return;
guard(rcu)(); guard(rcu)();
rings = rcu_dereference(ctx->rings_rcu);
if (!rings)
return;
if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return;
ev_fd = rcu_dereference(ctx->io_ev_fd); ev_fd = rcu_dereference(ctx->io_ev_fd);
/* /*
* Check again if ev_fd exists in case an io_eventfd_unregister call * Check again if ev_fd exists in case an io_eventfd_unregister call

View File

@@ -1745,7 +1745,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
* well as 2 contiguous entries. * well as 2 contiguous entries.
*/ */
if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
!(ctx->cached_sq_head & (ctx->sq_entries - 1))) (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1)
return io_init_fail_req(req, -EINVAL); return io_init_fail_req(req, -EINVAL);
/* /*
* A 128b operation on a mixed SQ uses two entries, so we have * A 128b operation on a mixed SQ uses two entries, so we have
@@ -2066,6 +2066,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
io_free_region(ctx->user, &ctx->sq_region); io_free_region(ctx->user, &ctx->sq_region);
io_free_region(ctx->user, &ctx->ring_region); io_free_region(ctx->user, &ctx->ring_region);
ctx->rings = NULL; ctx->rings = NULL;
RCU_INIT_POINTER(ctx->rings_rcu, NULL);
ctx->sq_sqes = NULL; ctx->sq_sqes = NULL;
} }
@@ -2703,6 +2704,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (ret) if (ret)
return ret; return ret;
ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
rcu_assign_pointer(ctx->rings_rcu, rings);
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset);

View File

@@ -111,9 +111,18 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
buf = req->kbuf; buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid); bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list); /*
bl->nbufs++; * If the buffer list was upgraded to a ring-based one, or removed,
* while the request was in-flight in io-wq, drop it.
*/
if (bl && !(bl->flags & IOBL_BUF_RING)) {
list_add(&buf->list, &bl->buf_list);
bl->nbufs++;
} else {
kfree(buf);
}
req->flags &= ~REQ_F_BUFFER_SELECTED; req->flags &= ~REQ_F_BUFFER_SELECTED;
req->kbuf = NULL;
io_ring_submit_unlock(ctx, issue_flags); io_ring_submit_unlock(ctx, issue_flags);
return true; return true;

View File

@@ -202,7 +202,7 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
return -EPERM; return -EPERM;
/* /*
* Similar to seccomp, disallow setting a filter if task_no_new_privs * Similar to seccomp, disallow setting a filter if task_no_new_privs
* is true and we're not CAP_SYS_ADMIN. * is false and we're not CAP_SYS_ADMIN.
*/ */
if (!task_no_new_privs(current) && if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -238,7 +238,7 @@ static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
/* /*
* Similar to seccomp, disallow setting a filter if task_no_new_privs * Similar to seccomp, disallow setting a filter if task_no_new_privs
* is true and we're not CAP_SYS_ADMIN. * is false and we're not CAP_SYS_ADMIN.
*/ */
if (!task_no_new_privs(current) && if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -633,7 +633,15 @@ overflow:
ctx->sq_entries = p->sq_entries; ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries; ctx->cq_entries = p->cq_entries;
/*
* Just mark any flag we may have missed and that the application
* should act on unconditionally. Worst case it'll be an extra
* syscall.
*/
atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
ctx->rings = n.rings; ctx->rings = n.rings;
rcu_assign_pointer(ctx->rings_rcu, n.rings);
ctx->sq_sqes = n.sq_sqes; ctx->sq_sqes = n.sq_sqes;
swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region); swap_old(ctx, o, n, sq_region);
@@ -642,6 +650,9 @@ overflow:
out: out:
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
mutex_unlock(&ctx->mmap_lock); mutex_unlock(&ctx->mmap_lock);
/* Wait for concurrent io_ctx_mark_taskrun() */
if (to_free == &o)
synchronize_rcu_expedited();
io_register_free_rings(ctx, to_free); io_register_free_rings(ctx, to_free);
if (ctx->sq_data) if (ctx->sq_data)

View File

@@ -152,6 +152,21 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret); WARN_ON_ONCE(ret);
} }
/*
 * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the
 * RCU protected rings pointer to be safe against concurrent ring resizing.
 */
static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx)
{
	/*
	 * Caller must hold rcu_read_lock(): the resize path republishes
	 * ctx->rings_rcu and then waits (synchronize_rcu_expedited()) for
	 * readers like us before freeing the old rings.
	 */
	lockdep_assert_in_rcu_read_lock();
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) {
		/*
		 * NOTE(review): unlike io_eventfd_signal(), rings is not
		 * NULL-checked here — presumably task_work can only be added
		 * while the rings exist; confirm against the teardown path
		 * that does RCU_INIT_POINTER(ctx->rings_rcu, NULL).
		 */
		struct io_rings *rings = rcu_dereference(ctx->rings_rcu);
		atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags);
	}
}
void io_req_local_work_add(struct io_kiocb *req, unsigned flags) void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
@@ -206,8 +221,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
*/ */
if (!head) { if (!head) {
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) io_ctx_mark_taskrun(ctx);
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd) if (ctx->has_evfd)
io_eventfd_signal(ctx, false); io_eventfd_signal(ctx, false);
} }
@@ -231,6 +245,10 @@ void io_req_normal_work_add(struct io_kiocb *req)
if (!llist_add(&req->io_task_work.node, &tctx->task_list)) if (!llist_add(&req->io_task_work.node, &tctx->task_list))
return; return;
/*
* Doesn't need to use ->rings_rcu, as resizing isn't supported for
* !DEFER_TASKRUN.
*/
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);