Merge tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

 - Fix an inverted true/false comment on task_no_new_privs, from the
   BPF filtering changes merged in this release

 - Use the migration disabling way of running the BPF filters, as the
   io_uring side doesn't do that already

 - Fix an issue with ->rings stability under resize, both for local
   task_work additions and for eventfd signaling

 - Fix an issue with SQE mixed mode, where a bounds check wasn't correct
   for having a 128b SQE

 - Fix an issue where a legacy provided buffer group is changed to to
   ring mapped one while legacy buffers from that group are in flight

* tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/kbuf: check if target buffer list is still legacy on recycle
  io_uring: fix physical SQE bounds check for SQE_MIXED 128-byte ops
  io_uring/eventfd: use ctx->rings_rcu for flags checking
  io_uring: ensure ctx->rings is stable for task work flags manipulation
  io_uring/bpf_filter: use bpf_prog_run_pin_on_cpu() to prevent migration
  io_uring/register: fix comment about task_no_new_privs
This commit is contained in:
Linus Torvalds
2026-03-13 10:09:35 -07:00
7 changed files with 56 additions and 11 deletions

View File

@@ -388,6 +388,7 @@ struct io_ring_ctx {
* regularly bounce b/w CPUs.
*/
struct {
struct io_rings __rcu *rings_rcu;
struct llist_head work_llist;
struct llist_head retry_llist;
unsigned long check_cq;

View File

@@ -85,7 +85,7 @@ int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
do {
if (filter == &dummy_filter)
return -EACCES;
ret = bpf_prog_run(filter->prog, &bpf_ctx);
ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx);
if (!ret)
return -EACCES;
filter = filter->next;

View File

@@ -76,11 +76,15 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
bool skip = false;
struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return;
struct io_rings *rings;
guard(rcu)();
rings = rcu_dereference(ctx->rings_rcu);
if (!rings)
return;
if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return;
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call

View File

@@ -1745,7 +1745,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
* well as 2 contiguous entries.
*/
if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
!(ctx->cached_sq_head & (ctx->sq_entries - 1)))
(unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1)
return io_init_fail_req(req, -EINVAL);
/*
* A 128b operation on a mixed SQ uses two entries, so we have
@@ -2066,6 +2066,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
io_free_region(ctx->user, &ctx->sq_region);
io_free_region(ctx->user, &ctx->ring_region);
ctx->rings = NULL;
RCU_INIT_POINTER(ctx->rings_rcu, NULL);
ctx->sq_sqes = NULL;
}
@@ -2703,6 +2704,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (ret)
return ret;
ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
rcu_assign_pointer(ctx->rings_rcu, rings);
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset);

View File

@@ -111,9 +111,18 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
bl->nbufs++;
/*
* If the buffer list was upgraded to a ring-based one, or removed,
* while the request was in-flight in io-wq, drop it.
*/
if (bl && !(bl->flags & IOBL_BUF_RING)) {
list_add(&buf->list, &bl->buf_list);
bl->nbufs++;
} else {
kfree(buf);
}
req->flags &= ~REQ_F_BUFFER_SELECTED;
req->kbuf = NULL;
io_ring_submit_unlock(ctx, issue_flags);
return true;

View File

@@ -202,7 +202,7 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
return -EPERM;
/*
* Similar to seccomp, disallow setting a filter if task_no_new_privs
* is true and we're not CAP_SYS_ADMIN.
* is false and we're not CAP_SYS_ADMIN.
*/
if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -238,7 +238,7 @@ static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
/*
* Similar to seccomp, disallow setting a filter if task_no_new_privs
* is true and we're not CAP_SYS_ADMIN.
* is false and we're not CAP_SYS_ADMIN.
*/
if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -633,7 +633,15 @@ overflow:
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
/*
* Just mark any flag we may have missed and that the application
* should act on unconditionally. Worst case it'll be an extra
* syscall.
*/
atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
ctx->rings = n.rings;
rcu_assign_pointer(ctx->rings_rcu, n.rings);
ctx->sq_sqes = n.sq_sqes;
swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region);
@@ -642,6 +650,9 @@ overflow:
out:
spin_unlock(&ctx->completion_lock);
mutex_unlock(&ctx->mmap_lock);
/* Wait for concurrent io_ctx_mark_taskrun() */
if (to_free == &o)
synchronize_rcu_expedited();
io_register_free_rings(ctx, to_free);
if (ctx->sq_data)

View File

@@ -152,6 +152,21 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret);
}
/*
* Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the
* RCU protected rings pointer to be safe against concurrent ring resizing.
*/
static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx)
{
lockdep_assert_in_rcu_read_lock();
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) {
struct io_rings *rings = rcu_dereference(ctx->rings_rcu);
atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags);
}
}
void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -206,8 +221,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
*/
if (!head) {
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
io_ctx_mark_taskrun(ctx);
if (ctx->has_evfd)
io_eventfd_signal(ctx, false);
}
@@ -231,6 +245,10 @@ void io_req_normal_work_add(struct io_kiocb *req)
if (!llist_add(&req->io_task_work.node, &tctx->task_list))
return;
/*
* Doesn't need to use ->rings_rcu, as resizing isn't supported for
* !DEFER_TASKRUN.
*/
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);