Mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (synced 2026-03-21 23:16:50 +08:00)
Merge tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring fixes from Jens Axboe:

 - Fix an inverted true/false comment on task_no_new_privs, from the
   BPF filtering changes merged in this release

 - Use the migration-disabling way of running the BPF filters, as the
   io_uring side doesn't do that already

 - Fix an issue with ->rings stability under resize, both for local
   task_work additions and for eventfd signaling

 - Fix an issue with SQE mixed mode, where a bounds check wasn't
   correct when handling a 128b SQE

 - Fix an issue where a legacy provided buffer group is changed to a
   ring mapped one while legacy buffers from that group are in flight

* tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/kbuf: check if target buffer list is still legacy on recycle
  io_uring: fix physical SQE bounds check for SQE_MIXED 128-byte ops
  io_uring/eventfd: use ctx->rings_rcu for flags checking
  io_uring: ensure ctx->rings is stable for task work flags manipulation
  io_uring/bpf_filter: use bpf_prog_run_pin_on_cpu() to prevent migration
  io_uring/register: fix comment about task_no_new_privs
@@ -388,6 +388,7 @@ struct io_ring_ctx {
	 * regularly bounce b/w CPUs.
	 */
	struct {
+		struct io_rings __rcu	*rings_rcu;
		struct llist_head	work_llist;
		struct llist_head	retry_llist;
		unsigned long		check_cq;

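Note: the new __rcu-annotated field is read under RCU and published/cleared with the matching primitives in the hunks below. A minimal, self-contained sketch of that lifecycle (the demo_* names are illustrative, not from this patch):

#include <linux/rcupdate.h>

struct io_rings;

struct demo_ctx {
	struct io_rings __rcu *rings_rcu;
};

/* Publish: pairs with rcu_dereference() in readers (see the eventfd
 * and task_work hunks below).
 */
static void demo_publish_rings(struct demo_ctx *ctx, struct io_rings *rings)
{
	rcu_assign_pointer(ctx->rings_rcu, rings);
}

/* Read: caller must hold the RCU read lock, and must tolerate NULL
 * while a resize has the pointer swapped out.
 */
static struct io_rings *demo_read_rings(struct demo_ctx *ctx)
{
	return rcu_dereference(ctx->rings_rcu);
}

/* Clear at teardown: no concurrent readers remain, so the cheaper
 * RCU_INIT_POINTER() (no memory barrier) is sufficient.
 */
static void demo_clear_rings(struct demo_ctx *ctx)
{
	RCU_INIT_POINTER(ctx->rings_rcu, NULL);
}
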
@@ -85,7 +85,7 @@ int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
	do {
		if (filter == &dummy_filter)
			return -EACCES;
-		ret = bpf_prog_run(filter->prog, &bpf_ctx);
+		ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx);
		if (!ret)
			return -EACCES;
		filter = filter->next;

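As I read it, bpf_prog_run_pin_on_cpu() keeps the calling task on one CPU for the duration of the program, so per-CPU state the program touches (stats, per-CPU maps) stays coherent. Roughly the open-coded equivalent, for illustration only:

#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/preempt.h>

/* Hypothetical open-coded stand-in for bpf_prog_run_pin_on_cpu(); the
 * real helper in include/linux/filter.h handles this internally.
 */
static u32 demo_run_filter_pinned(const struct bpf_prog *prog, const void *ctx)
{
	u32 ret;

	migrate_disable();		/* stay on this CPU ... */
	ret = bpf_prog_run(prog, ctx);
	migrate_enable();		/* ... until the program finishes */
	return ret;
}
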
@@ -76,11 +76,15 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
 {
	bool skip = false;
	struct io_ev_fd *ev_fd;
+	struct io_rings *rings;

-	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
-		return;
-
	guard(rcu)();

+	rings = rcu_dereference(ctx->rings_rcu);
+	if (!rings)
+		return;
+	if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
+		return;
	ev_fd = rcu_dereference(ctx->io_ev_fd);
	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call

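guard(rcu)() (from linux/cleanup.h) holds the RCU read lock for the remainder of the scope and drops it automatically on every return path, which matters here now that the function has several early returns. The explicit equivalent, as a sketch:

#include <linux/rcupdate.h>

struct io_rings;

/* What the guard-based version avoids: a manual unlock on each exit. */
static void demo_signal_explicit(struct io_rings __rcu **ringsp)
{
	struct io_rings *rings;

	rcu_read_lock();
	rings = rcu_dereference(*ringsp);
	if (!rings) {
		rcu_read_unlock();	/* easy to forget on new paths */
		return;
	}
	/* ... check cq_flags, signal the eventfd ... */
	rcu_read_unlock();
}
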
@@ -1745,7 +1745,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
	 * well as 2 contiguous entries.
	 */
	if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
-	    !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
+	    (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1)
		return io_init_fail_req(req, -EINVAL);
	/*
	 * A 128b operation on a mixed SQ uses two entries, so we have

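A worked example of why the physical index is the right thing to test: a 128b op occupies two contiguous 64b slots, so the first slot's array index must leave room for a second one. Per the pull message, the old head-based test didn't bound this correctly; the new test uses where sqe actually sits in the array. A stand-alone sketch of the corrected condition:

#include <stdbool.h>

/* Illustrative, stand-alone version of the new check: idx is the
 * physical index of the first 64b slot, computed in the patch as
 * (unsigned)(sqe - ctx->sq_sqes).
 */
static bool demo_sqe128_fits(unsigned int idx, unsigned int sq_entries)
{
	/* slots idx and idx + 1 must both be in bounds */
	return idx < sq_entries - 1;
}

/* e.g. sq_entries == 8: idx 6 fits (slots 6 and 7); idx 7 does not,
 * since slot 8 doesn't exist -- exactly what the >= test rejects.
 */
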
@@ -2066,6 +2066,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
	io_free_region(ctx->user, &ctx->sq_region);
	io_free_region(ctx->user, &ctx->ring_region);
	ctx->rings = NULL;
+	RCU_INIT_POINTER(ctx->rings_rcu, NULL);
	ctx->sq_sqes = NULL;
 }

@@ -2703,6 +2704,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
	if (ret)
		return ret;
	ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
+	rcu_assign_pointer(ctx->rings_rcu, rings);

	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset);

@@ -111,9 +111,18 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
-	list_add(&buf->list, &bl->buf_list);
-	bl->nbufs++;
+	/*
+	 * If the buffer list was upgraded to a ring-based one, or removed,
+	 * while the request was in-flight in io-wq, drop it.
+	 */
+	if (bl && !(bl->flags & IOBL_BUF_RING)) {
+		list_add(&buf->list, &bl->buf_list);
+		bl->nbufs++;
+	} else {
+		kfree(buf);
+	}
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->kbuf = NULL;

	io_ring_submit_unlock(ctx, issue_flags);
	return true;

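The race this guards against, reconstructed from the hunk's own comment (step numbering is illustrative):

/*
 * T0: request selects a legacy buffer from group bgid and goes async
 *     to io-wq, holding the buffer.
 * T1: userspace unregisters the legacy group and registers a ring
 *     mapped group under the same bgid (IOBL_BUF_RING now set), or
 *     removes the group entirely (lookup now returns NULL).
 * T2: the request finishes in io-wq and recycles its buffer; the old
 *     unconditional list_add() would push a legacy buffer onto a
 *     ring mapped group, so the new check frees it instead.
 */
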
@@ -202,7 +202,7 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
		return -EPERM;
	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
-	 * is true and we're not CAP_SYS_ADMIN.
+	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))

@@ -238,7 +238,7 @@ static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
-	 * is true and we're not CAP_SYS_ADMIN.
+	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))

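For clarity, the policy both hunks implement (mirroring seccomp's filter-install rule): installation is allowed when the task already runs with no_new_privs, or when it has CAP_SYS_ADMIN in its user namespace. As a truth table plus the check itself (the -EPERM return is implied by the surrounding code, not shown in the hunks):

/*
 *   task_no_new_privs | CAP_SYS_ADMIN | result
 *   ------------------+---------------+---------
 *   true              | either        | allowed
 *   false             | true          | allowed
 *   false             | false         | -EPERM
 */
if (!task_no_new_privs(current) &&
    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
	return -EPERM;
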
@@ -633,7 +633,15 @@ overflow:
	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

+	/*
+	 * Just mark any flag we may have missed and that the application
+	 * should act on unconditionally. Worst case it'll be an extra
+	 * syscall.
+	 */
+	atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
	ctx->rings = n.rings;
+	rcu_assign_pointer(ctx->rings_rcu, n.rings);
+
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);

@@ -642,6 +650,9 @@ overflow:
 out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
+	/* Wait for concurrent io_ctx_mark_taskrun() */
+	if (to_free == &o)
+		synchronize_rcu_expedited();
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)

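The free path above follows the classic RCU update-side shape: publish the new pointer, wait a grace period, then free the old object that lockless readers (io_ctx_mark_taskrun(), io_eventfd_signal()) may still be touching. A generic sketch, with hypothetical names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct io_rings;

static void demo_swap_and_free(struct io_rings __rcu **slot,
			       struct io_rings *new_rings,
			       struct io_rings *old_rings)
{
	rcu_assign_pointer(*slot, new_rings);	/* publish replacement */
	synchronize_rcu_expedited();		/* wait out existing readers */
	kfree(old_rings);			/* no reader can see it now */
}

The expedited variant is presumably chosen so the resize operation isn't stalled on a full ordinary grace period.
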
@@ -152,6 +152,21 @@ void tctx_task_work(struct callback_head *cb)
	WARN_ON_ONCE(ret);
 }

+/*
+ * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the
+ * RCU protected rings pointer to be safe against concurrent ring resizing.
+ */
+static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx)
+{
+	lockdep_assert_in_rcu_read_lock();
+
+	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) {
+		struct io_rings *rings = rcu_dereference(ctx->rings_rcu);
+
+		atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags);
+	}
+}
+
 void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 {
	struct io_ring_ctx *ctx = req->ctx;

@@ -206,8 +221,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
	 */

	if (!head) {
-		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
-			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+		io_ctx_mark_taskrun(ctx);
		if (ctx->has_evfd)
			io_eventfd_signal(ctx, false);
	}

@@ -231,6 +245,10 @@ void io_req_normal_work_add(struct io_kiocb *req)
	if (!llist_add(&req->io_task_work.node, &tctx->task_list))
		return;

+	/*
+	 * Doesn't need to use ->rings_rcu, as resizing isn't supported for
+	 * !DEFER_TASKRUN.
+	 */
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);