Files
linux/io_uring/openclose.c
Al Viro 9fa3ec8458 allow incomplete imports of filenames
There are two filename-related problems in io_uring and its
interplay with audit.

Filenames are imported when request is submitted and used when
it is processed.  Unfortunately, the latter may very well
happen in a different thread.  In that case the reference to
filename is put into the wrong audit_context - that of submitting
thread, not the processing one.  The audit logic is called by
the latter, and it really wants to be able to find the names
in the audit_context of the current (== processing) thread.

Another related problem is the headache with refcounts -
normally all references to given struct filename are visible
only to one thread (the one that uses that struct filename).
io_uring violates that - an extra reference is stashed in
audit_context of submitter.  It gets dropped when submitter
returns to userland, which can happen simultaneously with
processing thread deciding to drop the reference it got.

We paper over that by making refcount atomic, but that means
pointless headache for everyone.

Solution: the notion of partially imported filenames.  Namely,
already copied from userland, but *not* exposed to audit yet.

io_uring can create that in submitter thread, and complete the
import (obtaining the usual reference to struct filename) in
processing thread.

Object: struct delayed_filename.

Primitives for working with it:

delayed_getname(&delayed_filename, user_string) - copies the name from
userland, returning 0 and stashing the address of (still incomplete)
struct filename in delayed_filename on success and returning -E... on
error.

delayed_getname_uflags(&delayed_filename, user_string, atflags) -
similar; it relates to delayed_getname() the same way getname_uflags()
relates to getname().

complete_getname(&delayed_filename) - completes the import of filename
stashed in delayed_filename and returns struct filename to caller,
emptying delayed_filename.

CLASS(filename_complete_delayed, name)(&delayed_filename) - variant of
CLASS(filename) with complete_getname() for constructor.

dismiss_delayed_filename(&delayed_filename) - destructor; drops whatever
might be stashed in delayed_filename, emptying it.

putname_to_delayed(&delayed_filename, name) - if name is shared, stashes
its copy into delayed_filename and drops the reference to name, otherwise
stashes the name itself in there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2026-01-13 15:18:07 -05:00

437 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/namei.h>
#include <linux/pipe_fs_i.h>
#include <linux/watch_queue.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "../fs/internal.h"
#include "filetable.h"
#include "io_uring.h"
#include "rsrc.h"
#include "openclose.h"
/*
 * Per-request state for the open handlers in this file. It is filled in by
 * io_openat_prep()/io_openat2_prep() via __io_openat_prep() and consumed by
 * io_openat2().
 */
struct io_open {
/*
 * Not referenced directly in this file.
 * NOTE(review): presumably has to stay the first member because the struct
 * is obtained through io_kiocb_to_cmd() - confirm before reordering.
 */
struct file *file;
/* directory fd read from sqe->fd; passed to do_filp_open() */
int dfd;
/*
 * Read from sqe->file_index. Non-zero means the opened file is installed
 * with io_fixed_fd_install() instead of getting a regular descriptor.
 */
u32 file_slot;
/* name copied from userland at prep time; the import is completed at issue */
struct delayed_filename filename;
/* built by build_open_how() or copied from the user's struct open_how */
struct open_how how;
/* rlimit(RLIMIT_NOFILE) sampled at prep time, for __get_unused_fd_flags() */
unsigned long nofile;
};
/*
 * Per-request state for the close handlers, filled in by io_close_prep().
 */
struct io_close {
/*
 * Not referenced directly in this file.
 * NOTE(review): presumably has to stay the first member because the struct
 * is obtained through io_kiocb_to_cmd() - confirm before reordering.
 */
struct file *file;
/* descriptor to close, read from sqe->fd; used when file_slot is zero */
int fd;
/*
 * Read from sqe->file_index. Non-zero selects io_close_fixed(), which
 * removes entry (file_slot - 1); prep rejects it together with a non-zero fd.
 */
u32 file_slot;
};
/*
 * Per-request state for io_install_fixed_fd(), filled in by
 * io_install_fixed_fd_prep().
 */
struct io_fixed_install {
/*
 * Not referenced directly in this file.
 * NOTE(review): presumably has to stay the first member because the struct
 * is obtained through io_kiocb_to_cmd() - confirm before reordering.
 */
struct file *file;
/* O_CLOEXEC, or 0 when IORING_FIXED_FD_NO_CLOEXEC was set; passed to receive_fd() */
unsigned int o_flags;
};
static bool io_openat_force_async(struct io_open *open)
{
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
* it'll always -EAGAIN. Note that we test for __O_TMPFILE because
* O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force
* async for.
*/
return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE);
}
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
const char __user *fname;
int ret;
if (unlikely(sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
/* open.how should be already initialised */
if (!(open->how.flags & O_PATH) && force_o_largefile())
open->how.flags |= O_LARGEFILE;
open->dfd = READ_ONCE(sqe->fd);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
ret = delayed_getname(&open->filename, fname);
if (unlikely(ret))
return ret;
req->flags |= REQ_F_NEED_CLEANUP;
open->file_slot = READ_ONCE(sqe->file_index);
if (open->file_slot && (open->how.flags & O_CLOEXEC))
return -EINVAL;
open->nofile = rlimit(RLIMIT_NOFILE);
if (io_openat_force_async(open))
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
/*
 * Prep variant that takes flags and mode straight from the SQE
 * (sqe->open_flags and sqe->len), turns them into a struct open_how with
 * build_open_how() and then runs the common prep.
 *
 * Returns whatever __io_openat_prep() returns.
 */
int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
	u64 sqe_mode = READ_ONCE(sqe->len);
	u64 sqe_flags = READ_ONCE(sqe->open_flags);

	open->how = build_open_how(sqe_flags, sqe_mode);

	return __io_openat_prep(req, sqe);
}
int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
struct open_how __user *how;
size_t len;
int ret;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
if (len < OPEN_HOW_SIZE_VER0)
return -EINVAL;
ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len);
if (ret)
return ret;
return __io_openat_prep(req, sqe);
}
/*
 * Perform the open described by the request's struct io_open.
 *
 * Completes the import of the name stashed at prep time, reserves a
 * descriptor (unless a fixed file slot was requested), opens the file with
 * do_filp_open() and installs it either as a regular fd or into the fixed
 * file slot.
 *
 * Returns -EAGAIN when a nonblocking attempt failed with -EAGAIN, the
 * application had not set RESOLVE_CACHED itself, and the name could be put
 * back into open->filename.  In every other case the result (new fd / value
 * from io_fixed_fd_install() / negative errno) is stored with
 * io_req_set_res() and IOU_COMPLETE is returned.
 */
int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
struct open_flags op;
struct file *file;
bool resolve_nonblock, nonblock_set;
bool fixed = !!open->file_slot;
/*
 * Completes the delayed import, emptying open->filename.
 * NOTE(review): 'name' is declared through CLASS(), so it is presumably
 * released automatically when it goes out of scope unless it is handed
 * off with no_free_ptr() - confirm against the class definition.
 */
CLASS(filename_complete_delayed, name)(&open->filename);
int ret;
ret = build_open_flags(&open->how, &op);
if (ret)
goto err;
/* remember what the application asked for before we add our own flags */
nonblock_set = op.open_flag & O_NONBLOCK;
resolve_nonblock = open->how.resolve & RESOLVE_CACHED;
if (issue_flags & IO_URING_F_NONBLOCK) {
/* force-async opens are not expected to be issued nonblocking */
WARN_ON_ONCE(io_openat_force_async(open));
op.lookup_flags |= LOOKUP_CACHED;
op.open_flag |= O_NONBLOCK;
}
if (!fixed) {
/* reserve a regular descriptor; on success ret holds the fd number */
ret = __get_unused_fd_flags(open->how.flags, open->nofile);
if (ret < 0)
goto err;
}
file = do_filp_open(open->dfd, name, &op);
if (IS_ERR(file)) {
/*
* We could hang on to this 'fd' on retrying, but seems like
* marginal gain for something that is now known to be a slower
* path. So just put it, and we'll get a new one when we retry.
*/
if (!fixed)
put_unused_fd(ret);
ret = PTR_ERR(file);
/* only retry if RESOLVE_CACHED wasn't already set by application */
if (ret == -EAGAIN && !resolve_nonblock &&
(issue_flags & IO_URING_F_NONBLOCK)) {
/*
 * Hand the name back to open->filename for the retry.  This exit
 * leaves REQ_F_NEED_CLEANUP set, unlike the err path below.
 */
ret = putname_to_delayed(&open->filename,
no_free_ptr(name));
if (likely(!ret))
return -EAGAIN;
}
goto err;
}
/* drop the O_NONBLOCK we added ourselves if the application did not ask for it */
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
file->f_flags &= ~O_NONBLOCK;
if (!fixed)
fd_install(ret, file);
else
ret = io_fixed_fd_install(req, issue_flags, file,
open->file_slot);
err:
/* open->filename no longer holds a name here, so no cleanup is needed */
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_COMPLETE;
}
/*
 * Thin wrapper: forwards the request unchanged to io_openat2() and returns
 * its result.
 * NOTE(review): presumably the issue handler paired with io_openat_prep(),
 * which already expressed the request as a struct open_how - confirm
 * against the opcode table.
 */
int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
return io_openat2(req, issue_flags);
}
/*
 * Drop whatever name is still stashed in the request's delayed filename,
 * leaving it empty.
 */
void io_open_cleanup(struct io_kiocb *req)
{
	struct io_open *open_data = io_kiocb_to_cmd(req, struct io_open);

	dismiss_delayed_filename(&open_data->filename);
}
/*
 * Remove the fixed file at @offset from @ctx, taking and releasing the
 * ring submit lock (as dictated by @issue_flags) around the removal.
 *
 * Returns the result of io_fixed_fd_remove().
 */
int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
		     unsigned int offset)
{
	int rc;

	io_ring_submit_lock(ctx, issue_flags);
	rc = io_fixed_fd_remove(ctx, offset);
	io_ring_submit_unlock(ctx, issue_flags);

	return rc;
}
/*
 * Close the fixed file named by the request.  The slot stored in the
 * request is one higher than the offset handed to __io_close_fixed().
 */
static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_close *cl = io_kiocb_to_cmd(req, struct io_close);
	unsigned int offset = cl->file_slot - 1;

	return __io_close_fixed(req->ctx, issue_flags, offset);
}
/*
 * Validate a close request and record its target.
 *
 * Returns -EINVAL if any of the SQE fields off/addr/len/rw_flags/buf_index
 * is set, or if both a fixed file slot and a non-zero fd are given, and
 * -EBADF for fixed-file requests.  Returns 0 otherwise.
 */
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_close *cl = io_kiocb_to_cmd(req, struct io_close);

	if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;
	if (req->flags & REQ_F_FIXED_FILE)
		return -EBADF;

	cl->fd = READ_ONCE(sqe->fd);
	cl->file_slot = READ_ONCE(sqe->file_index);

	/* a fixed slot and a non-zero descriptor cannot be combined */
	return (cl->file_slot && cl->fd) ? -EINVAL : 0;
}
/*
 * Close either a fixed file slot or a regular descriptor of the current
 * task.
 *
 * Returns -EAGAIN when called nonblocking on a file that has a ->flush()
 * method, so the close is redone from async context.  In every other case
 * the result is stored with io_req_set_res() and IOU_COMPLETE is returned.
 * The stored result is -EBADF if the descriptor is not open or refers to an
 * io_uring file.
 */
int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
	struct files_struct *files = current->files;
	struct io_close *cl = io_kiocb_to_cmd(req, struct io_close);
	struct file *filp;
	int res = -EBADF;

	if (cl->file_slot) {
		res = io_close_fixed(req, issue_flags);
		goto done;
	}

	spin_lock(&files->file_lock);
	filp = files_lookup_fd_locked(files, cl->fd);
	if (!filp || io_is_uring_fops(filp)) {
		spin_unlock(&files->file_lock);
		goto done;
	}

	/* if the file has a flush method, be safe and punt to async */
	if (filp->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
		spin_unlock(&files->file_lock);
		return -EAGAIN;
	}

	filp = file_close_fd_locked(files, cl->fd);
	spin_unlock(&files->file_lock);

	/* No ->flush() or already async, safely close from here */
	if (filp)
		res = filp_close(filp, current->files);

done:
	if (res < 0)
		req_set_fail(req);
	io_req_set_res(req, res, 0);
	return IOU_COMPLETE;
}
/*
 * Validate a request to install a fixed file as a regular descriptor and
 * work out the flags for the new descriptor.
 *
 * Returns -EINVAL if any of off/addr/len/buf_index/splice_fd_in/addr3 is
 * set or if unknown install flags are given, -EBADF if the request does not
 * use a fixed file, and -EPERM if the request carries REQ_F_CREDS.
 * Returns 0 otherwise.
 */
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_fixed_install *ifi;
	unsigned int install_flags;

	if (sqe->off || sqe->addr || sqe->len || sqe->buf_index ||
	    sqe->splice_fd_in || sqe->addr3)
		return -EINVAL;

	/* must be a fixed file */
	if (!(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	install_flags = READ_ONCE(sqe->install_fd_flags);
	if (install_flags & ~IORING_FIXED_FD_NO_CLOEXEC)
		return -EINVAL;

	/* ensure the task's creds are used when installing/receiving fds */
	if (req->flags & REQ_F_CREDS)
		return -EPERM;

	/* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */
	ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
	ifi->o_flags = (install_flags & IORING_FIXED_FD_NO_CLOEXEC) ? 0 : O_CLOEXEC;

	return 0;
}
/*
 * Install the request's file as a regular descriptor via receive_fd(),
 * using the flags chosen at prep time.  The value returned by receive_fd()
 * becomes the request's result; a negative value marks the request failed.
 *
 * Always returns IOU_COMPLETE.
 */
int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_fixed_install *ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
	int fd;

	fd = receive_fd(req->file, NULL, ifi->o_flags);
	if (fd < 0)
		req_set_fail(req);

	io_req_set_res(req, fd, 0);
	return IOU_COMPLETE;
}
/*
 * Per-request state for the pipe handlers, filled in by io_pipe_prep().
 */
struct io_pipe {
/*
 * Not referenced directly in this file.
 * NOTE(review): presumably has to stay the first member because the struct
 * is obtained through io_kiocb_to_cmd() - confirm before reordering.
 */
struct file *file;
/* user buffer (from sqe->addr) that receives the two resulting ints */
int __user *fds;
/* sqe->pipe_flags; passed to create_pipe_files() and __get_unused_fd_flags() */
int flags;
/* sqe->file_index; non-zero selects io_pipe_fixed() over io_pipe_fd() */
int file_slot;
/* rlimit(RLIMIT_NOFILE) sampled at prep time, for __get_unused_fd_flags() */
unsigned long nofile;
};
/*
 * Validate a pipe request and record its parameters.
 *
 * Returns -EINVAL if sqe->fd, sqe->off or sqe->addr3 is set, or if
 * sqe->pipe_flags contains anything other than O_CLOEXEC, O_NONBLOCK,
 * O_DIRECT and O_NOTIFICATION_PIPE.  Returns 0 otherwise.
 */
int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
	const int allowed = O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE;

	if (sqe->fd || sqe->off || sqe->addr3)
		return -EINVAL;

	p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr));
	p->flags = READ_ONCE(sqe->pipe_flags);
	if (p->flags & ~allowed)
		return -EINVAL;

	p->file_slot = READ_ONCE(sqe->file_index);
	p->nofile = rlimit(RLIMIT_NOFILE);

	return 0;
}
/*
 * Install both ends of a freshly created pipe into the fixed file table and
 * copy the two values returned by __io_fixed_fd_install() to the user's
 * buffer.
 *
 * files[i] is set to NULL once the corresponding install has succeeded, so
 * on return the caller only sees the files that were not installed.
 *
 * Returns 0 on success.  Returns -EINVAL if O_CLOEXEC was requested, the
 * error from __io_fixed_fd_install() if an install fails, or -EFAULT if the
 * copy to userland fails; in the latter two cases whatever was installed is
 * removed again.
 */
static int io_pipe_fixed(struct io_kiocb *req, struct file **files,
			 unsigned int issue_flags)
{
	struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
	struct io_ring_ctx *ctx = req->ctx;
	int ret, installed[2] = { -1, -1 };
	int want = p->file_slot;

	if (p->flags & O_CLOEXEC)
		return -EINVAL;

	io_ring_submit_lock(ctx, issue_flags);

	ret = __io_fixed_fd_install(ctx, files[0], want);
	if (ret < 0)
		goto undo;
	installed[0] = ret;
	files[0] = NULL;

	/*
	 * If a specific slot is given, next one will be used for
	 * the write side.
	 */
	if (want != IORING_FILE_INDEX_ALLOC)
		want++;

	ret = __io_fixed_fd_install(ctx, files[1], want);
	if (ret < 0)
		goto undo;
	installed[1] = ret;
	files[1] = NULL;

	io_ring_submit_unlock(ctx, issue_flags);

	if (copy_to_user(p->fds, installed, sizeof(installed))) {
		ret = -EFAULT;
		/* the undo path below runs under the submit lock */
		io_ring_submit_lock(ctx, issue_flags);
		goto undo;
	}
	return 0;

undo:
	if (installed[0] != -1)
		io_fixed_fd_remove(ctx, installed[0]);
	if (installed[1] != -1)
		io_fixed_fd_remove(ctx, installed[1]);
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
/*
 * Give both ends of a freshly created pipe regular descriptors.
 *
 * Two descriptors are reserved first and their numbers copied to the user's
 * buffer; the files are only installed once that copy has succeeded.  On
 * any failure the reserved descriptors are released and @files is left
 * untouched.
 *
 * Returns 0 on success, the error from __get_unused_fd_flags(), or -EFAULT
 * if the copy to userland fails.
 */
static int io_pipe_fd(struct io_kiocb *req, struct file **files)
{
	struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
	int ret, fds[2] = { -1, -1 };

	ret = __get_unused_fd_flags(p->flags, p->nofile);
	if (ret < 0)
		goto release;
	fds[0] = ret;

	ret = __get_unused_fd_flags(p->flags, p->nofile);
	if (ret < 0)
		goto release;
	fds[1] = ret;

	if (copy_to_user(p->fds, fds, sizeof(fds))) {
		ret = -EFAULT;
		goto release;
	}

	fd_install(fds[0], files[0]);
	fd_install(fds[1], files[1]);
	return 0;

release:
	if (fds[0] != -1)
		put_unused_fd(fds[0]);
	if (fds[1] != -1)
		put_unused_fd(fds[1]);
	return ret;
}
/*
 * Create a pipe and hand its two ends to the submitter, either through the
 * fixed file table (when a file slot was given) or as regular descriptors.
 *
 * If create_pipe_files() fails its error is returned directly, without
 * setting a result on the request.  Otherwise the helper's result is stored
 * with io_req_set_res(): on success IOU_COMPLETE is returned; on failure
 * the request is marked failed, any pipe file still held in files[] is
 * released with fput(), and the error is returned.
 */
int io_pipe(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
	struct file *files[2];
	int ret;

	ret = create_pipe_files(files, p->flags);
	if (ret)
		return ret;

	if (p->file_slot)
		ret = io_pipe_fixed(req, files, issue_flags);
	else
		ret = io_pipe_fd(req, files);

	io_req_set_res(req, ret, 0);
	if (ret) {
		req_set_fail(req);
		/* entries already installed by the helper have been set to NULL */
		if (files[0])
			fput(files[0]);
		if (files[1])
			fput(files[1]);
		return ret;
	}

	return IOU_COMPLETE;
}