mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-22 07:27:12 +08:00
io_uring: allow registration of per-task restrictions
Currently io_uring supports restricting operations on a per-ring basis. To use those, the ring must be set up in a disabled state by setting IORING_SETUP_R_DISABLED. Then restrictions can be set for the ring, and the ring can then be enabled. This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd == -1, like the other "blind" register opcodes which work on the task rather than a specific ring. This allows registration of the same kind of restrictions as can be done on a specific ring, but with the task itself. Once done, any ring created will inherit these restrictions. If a restriction filter is registered with a task, then it's inherited on fork for its children. Children may only further restrict operations, not extend them. Inherited restrictions include both the classic IORING_REGISTER_RESTRICTIONS based restrictions, as well as the BPF filters that have been registered with the task via IORING_REGISTER_BPF_FILTER. Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
@@ -231,6 +231,8 @@ struct io_restriction {
|
|||||||
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
|
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
|
||||||
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
|
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
|
||||||
struct io_bpf_filters *bpf_filters;
|
struct io_bpf_filters *bpf_filters;
|
||||||
|
/* ->bpf_filters needs COW on modification */
|
||||||
|
bool bpf_filters_cow;
|
||||||
u8 sqe_flags_allowed;
|
u8 sqe_flags_allowed;
|
||||||
u8 sqe_flags_required;
|
u8 sqe_flags_required;
|
||||||
/* IORING_OP_* restrictions exist */
|
/* IORING_OP_* restrictions exist */
|
||||||
|
|||||||
@@ -808,6 +808,13 @@ struct io_uring_restriction {
|
|||||||
__u32 resv2[3];
|
__u32 resv2[3];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * Argument for IORING_REGISTER_RESTRICTIONS when it is issued against the
 * task (the "blind" register path) instead of against a specific ring.
 */
struct io_uring_task_restriction {
|
||||||
|
/* no flags are accepted: a non-zero value is rejected with -EINVAL */
__u16 flags;
|
||||||
|
/* number of entries in restrictions[] handed to the restriction parser */
__u16 nr_res;
|
||||||
|
/* reserved, must be all zero or registration fails with -EINVAL */
__u32 resv[3];
|
||||||
|
/* trailing variable-length array of individual restriction entries */
__DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
|
||||||
|
};
|
||||||
|
|
||||||
struct io_uring_clock_register {
|
struct io_uring_clock_register {
|
||||||
__u32 clockid;
|
__u32 clockid;
|
||||||
__u32 __resv[3];
|
__u32 __resv[3];
|
||||||
|
|||||||
@@ -249,13 +249,77 @@ static int io_uring_check_cbpf_filter(struct sock_filter *filter,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
|
||||||
|
{
|
||||||
|
if (!src->bpf_filters)
|
||||||
|
return;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
/*
|
||||||
|
* If the src filter is going away, just ignore it.
|
||||||
|
*/
|
||||||
|
if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
|
||||||
|
dst->bpf_filters = src->bpf_filters;
|
||||||
|
dst->bpf_filters_cow = true;
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Allocate a new struct io_bpf_filters. Used when a filter is cloned and
|
||||||
|
* modifications need to be made.
|
||||||
|
*/
|
||||||
|
static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
|
||||||
|
{
|
||||||
|
struct io_bpf_filters *filters;
|
||||||
|
struct io_bpf_filter *srcf;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
filters = io_new_bpf_filters();
|
||||||
|
if (IS_ERR(filters))
|
||||||
|
return filters;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Iterate filters from src and assign in destination. Grabbing
|
||||||
|
* a reference is enough, we don't need to duplicate the memory.
|
||||||
|
* This is safe because filters are only ever appended to the
|
||||||
|
* front of the list, hence the only memory ever touched inside
|
||||||
|
* a filter is the refcount.
|
||||||
|
*/
|
||||||
|
rcu_read_lock();
|
||||||
|
for (i = 0; i < IORING_OP_LAST; i++) {
|
||||||
|
srcf = rcu_dereference(src->bpf_filters->filters[i]);
|
||||||
|
if (!srcf) {
|
||||||
|
continue;
|
||||||
|
} else if (srcf == &dummy_filter) {
|
||||||
|
rcu_assign_pointer(filters->filters[i], &dummy_filter);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Getting a ref on the first node is enough, putting the
|
||||||
|
* filter and iterating nodes to free will stop on the first
|
||||||
|
* one that doesn't hit zero when dropping.
|
||||||
|
*/
|
||||||
|
if (!refcount_inc_not_zero(&srcf->refs))
|
||||||
|
goto err;
|
||||||
|
rcu_assign_pointer(filters->filters[i], srcf);
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
return filters;
|
||||||
|
err:
|
||||||
|
rcu_read_unlock();
|
||||||
|
__io_put_bpf_filters(filters);
|
||||||
|
return ERR_PTR(-EBUSY);
|
||||||
|
}
|
||||||
|
|
||||||
#define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST
|
#define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST
|
||||||
|
|
||||||
int io_register_bpf_filter(struct io_restriction *res,
|
int io_register_bpf_filter(struct io_restriction *res,
|
||||||
struct io_uring_bpf __user *arg)
|
struct io_uring_bpf __user *arg)
|
||||||
{
|
{
|
||||||
|
struct io_bpf_filters *filters, *old_filters = NULL;
|
||||||
struct io_bpf_filter *filter, *old_filter;
|
struct io_bpf_filter *filter, *old_filter;
|
||||||
struct io_bpf_filters *filters;
|
|
||||||
struct io_uring_bpf reg;
|
struct io_uring_bpf reg;
|
||||||
struct bpf_prog *prog;
|
struct bpf_prog *prog;
|
||||||
struct sock_fprog fprog;
|
struct sock_fprog fprog;
|
||||||
@@ -297,6 +361,17 @@ int io_register_bpf_filter(struct io_restriction *res,
|
|||||||
ret = PTR_ERR(filters);
|
ret = PTR_ERR(filters);
|
||||||
goto err_prog;
|
goto err_prog;
|
||||||
}
|
}
|
||||||
|
} else if (res->bpf_filters_cow) {
|
||||||
|
filters = io_bpf_filter_cow(res);
|
||||||
|
if (IS_ERR(filters)) {
|
||||||
|
ret = PTR_ERR(filters);
|
||||||
|
goto err_prog;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Stash old filters, we'll put them once we know we'll
|
||||||
|
* succeed. Until then, res->bpf_filters is left untouched.
|
||||||
|
*/
|
||||||
|
old_filters = res->bpf_filters;
|
||||||
}
|
}
|
||||||
|
|
||||||
filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
|
filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
|
||||||
@@ -306,6 +381,15 @@ int io_register_bpf_filter(struct io_restriction *res,
|
|||||||
}
|
}
|
||||||
refcount_set(&filter->refs, 1);
|
refcount_set(&filter->refs, 1);
|
||||||
filter->prog = prog;
|
filter->prog = prog;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Success - install the new filter set now. If we did COW, put
|
||||||
|
* the old filters as we're replacing them.
|
||||||
|
*/
|
||||||
|
if (old_filters) {
|
||||||
|
__io_put_bpf_filters(old_filters);
|
||||||
|
res->bpf_filters_cow = false;
|
||||||
|
}
|
||||||
res->bpf_filters = filters;
|
res->bpf_filters = filters;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ int io_register_bpf_filter(struct io_restriction *res,
|
|||||||
|
|
||||||
void io_put_bpf_filters(struct io_restriction *res);
|
void io_put_bpf_filters(struct io_restriction *res);
|
||||||
|
|
||||||
|
void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src);
|
||||||
|
|
||||||
static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
|
static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
|
||||||
struct io_kiocb *req)
|
struct io_kiocb *req)
|
||||||
{
|
{
|
||||||
@@ -37,6 +39,10 @@ static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
|
|||||||
static inline void io_put_bpf_filters(struct io_restriction *res)
|
static inline void io_put_bpf_filters(struct io_restriction *res)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
/*
 * No-op variant of io_bpf_filter_clone(): nothing is copied and @dst is not
 * touched. NOTE(review): presumably the fallback used when
 * CONFIG_IO_URING_BPF is disabled - confirm against the enclosing #if.
 */
static inline void io_bpf_filter_clone(struct io_restriction *dst,
				       struct io_restriction *src)
{
}
|
||||||
#endif /* CONFIG_IO_URING_BPF */
|
#endif /* CONFIG_IO_URING_BPF */
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -2880,6 +2880,32 @@ int io_prepare_config(struct io_ctx_config *config)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src)
|
||||||
|
{
|
||||||
|
memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op));
|
||||||
|
memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op));
|
||||||
|
dst->sqe_flags_allowed = src->sqe_flags_allowed;
|
||||||
|
dst->sqe_flags_required = src->sqe_flags_required;
|
||||||
|
dst->op_registered = src->op_registered;
|
||||||
|
dst->reg_registered = src->reg_registered;
|
||||||
|
|
||||||
|
io_bpf_filter_clone(dst, src);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
|
||||||
|
struct io_restriction *src)
|
||||||
|
{
|
||||||
|
struct io_restriction *dst = &ctx->restrictions;
|
||||||
|
|
||||||
|
io_restriction_clone(dst, src);
|
||||||
|
if (dst->bpf_filters)
|
||||||
|
WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
|
||||||
|
if (dst->op_registered)
|
||||||
|
ctx->op_restricted = 1;
|
||||||
|
if (dst->reg_registered)
|
||||||
|
ctx->reg_restricted = 1;
|
||||||
|
}
|
||||||
|
|
||||||
static __cold int io_uring_create(struct io_ctx_config *config)
|
static __cold int io_uring_create(struct io_ctx_config *config)
|
||||||
{
|
{
|
||||||
struct io_uring_params *p = &config->p;
|
struct io_uring_params *p = &config->p;
|
||||||
@@ -2940,6 +2966,13 @@ static __cold int io_uring_create(struct io_ctx_config *config)
|
|||||||
else
|
else
|
||||||
ctx->notify_method = TWA_SIGNAL;
|
ctx->notify_method = TWA_SIGNAL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the current task has restrictions enabled, then copy them to
|
||||||
|
* our newly created ring and mark it as registered.
|
||||||
|
*/
|
||||||
|
if (current->io_uring_restrict)
|
||||||
|
io_ctx_restriction_clone(ctx, current->io_uring_restrict);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is just grabbed for accounting purposes. When a process exits,
|
* This is just grabbed for accounting purposes. When a process exits,
|
||||||
* the mm is exited and dropped before the files, hence we need to hang
|
* the mm is exited and dropped before the files, hence we need to hang
|
||||||
|
|||||||
@@ -197,6 +197,7 @@ void io_task_refs_refill(struct io_uring_task *tctx);
|
|||||||
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
|
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
|
||||||
|
|
||||||
void io_activate_pollwq(struct io_ring_ctx *ctx);
|
void io_activate_pollwq(struct io_ring_ctx *ctx);
|
||||||
|
void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
|
||||||
|
|
||||||
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
|
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -190,6 +190,82 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Register io_uring restrictions on the current task rather than on a ring.
 *
 * @arg:     user pointer to a struct io_uring_task_restriction
 * @nr_args: must be exactly 1
 *
 * Returns 0 on success, in which case current->io_uring_restrict points at
 * the newly parsed restriction set. Errors:
 *   -EPERM   the task already has restrictions registered
 *   -EACCES  the task neither has no_new_privs set nor holds CAP_SYS_ADMIN
 *   -EINVAL  nr_args != 1, or flags/resv in the header are non-zero
 *   -EFAULT  the header could not be copied from userspace
 *   -ENOMEM  allocation of the restriction set failed
 *   or any negative error returned by io_parse_restrictions().
 */
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;
	/*
	 * Similar to seccomp, only allow registration if the task has
	 * no_new_privs set or is CAP_SYS_ADMIN in its user namespace.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;
	if (nr_args != 1)
		return -EINVAL;

	/* Only the fixed-size header is copied; the entries are parsed below */
	if (copy_from_user(&tres, ures, sizeof(tres)))
		return -EFAULT;

	if (tres.flags)
		return -EINVAL;
	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret < 0) {
		kfree(res);
		return ret;
	}
	/* Parsing succeeded - the set is now owned by the task */
	current->io_uring_restrict = res;
	return 0;
}
|
||||||
|
|
||||||
|
/*
 * Register a BPF filter on the current task rather than on a ring.
 *
 * @arg:     user pointer passed through to io_register_bpf_filter()
 * @nr_args: must be exactly 1
 *
 * The filter is added to the task's existing restriction set if it has one,
 * otherwise a fresh set is allocated and only attached to the task once the
 * filter was registered successfully.
 *
 * Returns 0 on success, -EACCES if the task neither has no_new_privs set nor
 * holds CAP_SYS_ADMIN, -EINVAL if nr_args != 1, -ENOMEM if a new restriction
 * set could not be allocated, or the error from io_register_bpf_filter().
 */
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	bool new_res = false;
	int ret;

	/*
	 * Similar to seccomp, only allow registration if the task has
	 * no_new_privs set or is CAP_SYS_ADMIN in its user namespace.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, setup a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
		new_res = true;
	}

	ret = io_register_bpf_filter(res, arg);
	if (ret) {
		/* Only free what was allocated here, never the task's own set */
		if (new_res)
			kfree(res);
		return ret;
	}
	/* Attach a freshly allocated set to the task now that it is valid */
	if (new_res)
		current->io_uring_restrict = res;
	return 0;
}
|
||||||
|
|
||||||
static int io_register_enable_rings(struct io_ring_ctx *ctx)
|
static int io_register_enable_rings(struct io_ring_ctx *ctx)
|
||||||
{
|
{
|
||||||
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
|
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
|
||||||
@@ -912,6 +988,10 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
|
|||||||
return io_uring_register_send_msg_ring(arg, nr_args);
|
return io_uring_register_send_msg_ring(arg, nr_args);
|
||||||
case IORING_REGISTER_QUERY:
|
case IORING_REGISTER_QUERY:
|
||||||
return io_query(arg, nr_args);
|
return io_query(arg, nr_args);
|
||||||
|
case IORING_REGISTER_RESTRICTIONS:
|
||||||
|
return io_register_restrictions_task(arg, nr_args);
|
||||||
|
case IORING_REGISTER_BPF_FILTER:
|
||||||
|
return io_register_bpf_filter_task(arg, nr_args);
|
||||||
}
|
}
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "io_uring.h"
|
#include "io_uring.h"
|
||||||
#include "tctx.h"
|
#include "tctx.h"
|
||||||
|
#include "bpf_filter.h"
|
||||||
|
|
||||||
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
|
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
|
||||||
struct task_struct *task)
|
struct task_struct *task)
|
||||||
@@ -66,6 +67,11 @@ void __io_uring_free(struct task_struct *tsk)
|
|||||||
kfree(tctx);
|
kfree(tctx);
|
||||||
tsk->io_uring = NULL;
|
tsk->io_uring = NULL;
|
||||||
}
|
}
|
||||||
|
if (tsk->io_uring_restrict) {
|
||||||
|
io_put_bpf_filters(tsk->io_uring_restrict);
|
||||||
|
kfree(tsk->io_uring_restrict);
|
||||||
|
tsk->io_uring_restrict = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__cold int io_uring_alloc_task_context(struct task_struct *task,
|
__cold int io_uring_alloc_task_context(struct task_struct *task,
|
||||||
@@ -356,5 +362,16 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
|
|||||||
|
|
||||||
int __io_uring_fork(struct task_struct *tsk)
|
int __io_uring_fork(struct task_struct *tsk)
|
||||||
{
|
{
|
||||||
|
struct io_restriction *res, *src = tsk->io_uring_restrict;
|
||||||
|
|
||||||
|
/* Don't leave it dangling on error */
|
||||||
|
tsk->io_uring_restrict = NULL;
|
||||||
|
|
||||||
|
res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
|
||||||
|
if (!res)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
tsk->io_uring_restrict = res;
|
||||||
|
io_restriction_clone(res, src);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user