Merge tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs fixes from Christian Brauner:

 - Fix an uninitialized variable in file_getattr().

   The flags_valid field wasn't initialized before calling
   vfs_fileattr_get(), triggering KMSAN uninit-value reports in fuse

 - Fix writeback wakeup and logging timeouts when DETECT_HUNG_TASK is
   not enabled.

   sysctl_hung_task_timeout_secs is 0 in that case causing spurious
   "waiting for writeback completion for more than 1 seconds" warnings

 - Fix a null-ptr-deref in do_statmount() when the mount is internal

 - Add missing kernel-doc description for the @private parameter in
   iomap_readahead()

 - Fix mount namespace creation to hold namespace_sem across the mount
   copy in create_new_namespace().

   The previous drop-and-reacquire pattern was fragile and failed to
   clean up mount propagation links if the real rootfs was a shared or
   dependent mount

 - Fix /proc mount iteration where m->index wasn't updated when
   m->show() overflowed, causing a restart to repeatedly show the same
   mount entry in a rapidly expanding mount table

 - Return EFSCORRUPTED instead of ENOSPC in minix_new_inode() when the
   inode number is out of range

 - Fix unshare(2) when CLONE_NEWNS is set and current->fs isn't shared.

   copy_mnt_ns() received the live fs_struct so if a subsequent
   namespace creation failed the rollback would leave pwd and root
   pointing to detached mounts. Always allocate a new fs_struct when
   CLONE_NEWNS is requested

 - fserror bug fixes:

    - Remove the unused fsnotify_sb_error() helper now that all callers
      have been converted to fserror_report_metadata()

    - Fix a lockdep splat in fserror_report() where igrab() takes
      inode::i_lock which can be held in IRQ context.

      Replace igrab() with a direct i_count bump since filesystems
      should not report inodes that are about to be freed or not yet
      exposed

 - Handle an error pointer returned by try_lookup_noperm() in procfs

 - Fix an integer overflow in ep_loop_check_proc() where recursive calls
   returning INT_MAX would overflow when +1 is added, breaking the
   recursion depth check

 - Fix a misleading break in pidfs

* tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  pidfs: avoid misleading break
  eventpoll: Fix integer overflow in ep_loop_check_proc()
  proc: Fix pointer error dereference
  fserror: fix lockdep complaint when igrabbing inode
  fsnotify: drop unused helper
  unshare: fix unshare_fs() handling
  minix: Correct errno in minix_new_inode
  namespace: fix proc mount iteration
  mount: hold namespace_sem across copy in create_new_namespace()
  iomap: Describe @private in iomap_readahead()
  statmount: Fix the null-ptr-deref in do_statmount()
  writeback: Fix wakeup and logging timeouts for !DETECT_HUNG_TASK
  fs: init flags_valid before calling vfs_fileattr_get
This commit is contained in:
Linus Torvalds
2026-02-25 10:34:23 -08:00
11 changed files with 142 additions and 90 deletions

View File

@@ -2061,7 +2061,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
* Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
* Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
* a loop or went too deep.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
@@ -2080,7 +2081,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
result = INT_MAX;
result = EP_MAX_NESTS+1;
else
result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
if (result > EP_MAX_NESTS)

View File

@@ -378,7 +378,7 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
struct path filepath __free(path_put) = {};
unsigned int lookup_flags = 0;
struct file_attr fattr;
struct file_kattr fa;
struct file_kattr fa = { .flags_valid = true }; /* hint only */
int error;
BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);

View File

@@ -198,10 +198,11 @@ static void wb_queue_work(struct bdi_writeback *wb,
static bool wb_wait_for_completion_cb(struct wb_completion *done)
{
unsigned long timeout = sysctl_hung_task_timeout_secs;
unsigned long waited_secs = (jiffies - done->wait_start) / HZ;
done->progress_stamp = jiffies;
if (waited_secs > sysctl_hung_task_timeout_secs)
if (timeout && (waited_secs > timeout))
pr_info("INFO: The task %s:%d has been waiting for writeback "
"completion for more than %lu seconds.",
current->comm, current->pid, waited_secs);
@@ -1954,6 +1955,7 @@ static long writeback_sb_inodes(struct super_block *sb,
.range_end = LLONG_MAX,
};
unsigned long start_time = jiffies;
unsigned long timeout = sysctl_hung_task_timeout_secs;
long write_chunk;
long total_wrote = 0; /* count both pages and inodes */
unsigned long dirtied_before = jiffies;
@@ -2040,9 +2042,8 @@ static long writeback_sb_inodes(struct super_block *sb,
__writeback_single_inode(inode, &wbc);
/* Report progress to inform the hung task detector of the progress. */
if (work->done && work->done->progress_stamp &&
(jiffies - work->done->progress_stamp) > HZ *
sysctl_hung_task_timeout_secs / 2)
if (work->done && work->done->progress_stamp && timeout &&
(jiffies - work->done->progress_stamp) > HZ * timeout / 2)
wake_up_all(work->done->waitq);
wbc_detach_inode(&wbc);

View File

@@ -624,6 +624,7 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
* iomap_readahead - Attempt to read pages from a file.
* @ops: The operations vector for the filesystem.
* @ctx: The ctx used for issuing readahead.
* @private: The filesystem-specific information for issuing iomap_iter.
*
* This function is for filesystems to call to implement their readahead
* address_space operation.

View File

@@ -69,11 +69,57 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
return folio_count;
}
static DEFINE_SPINLOCK(failed_ioend_lock);
static LIST_HEAD(failed_ioend_list);
/*
 * Work handler bound to failed_ioend_work: finishes every ioend that is
 * queued on failed_ioend_list at the time it runs.
 *
 * @work: the work item being executed; not referenced in the body.
 */
static void
iomap_fail_ioends(
struct work_struct *work)
{
struct iomap_ioend *ioend;
struct list_head tmp;
unsigned long flags;
/*
 * Move the whole pending list onto a local head while holding the lock,
 * so the ioends below are finished without failed_ioend_lock held.
 */
spin_lock_irqsave(&failed_ioend_lock, flags);
list_replace_init(&failed_ioend_list, &tmp);
spin_unlock_irqrestore(&failed_ioend_lock, flags);
/* Drain the private list one ioend at a time. */
while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
io_list))) {
list_del_init(&ioend->io_list);
iomap_finish_ioend_buffered(ioend);
/* Offer a reschedule point between ioends. */
cond_resched();
}
}
static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);
/*
 * Queue @ioend on failed_ioend_list so that it is finished later by
 * failed_ioend_work (iomap_fail_ioends()) rather than in the caller's
 * context.
 *
 * @ioend: the ioend to defer; it is linked through its io_list member.
 */
static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
{
unsigned long flags;
/*
 * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
 * in the fserror code. The caller no longer owns the ioend reference
 * after the spinlock drops.
 */
spin_lock_irqsave(&failed_ioend_lock, flags);
/*
 * Only kick the work when the list is currently empty; warn once if
 * schedule_work() returns false. Presumably a non-empty list means an
 * earlier caller already scheduled the work and it has not drained the
 * list yet -- confirm against other users of failed_ioend_list.
 */
if (list_empty(&failed_ioend_list))
WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
/* Enqueue while still holding the lock that the work handler takes. */
list_add_tail(&ioend->io_list, &failed_ioend_list);
spin_unlock_irqrestore(&failed_ioend_lock, flags);
}
/*
 * Handle a finished writeback bio: store the bio's status in the owning
 * ioend as an errno, then defer failed ioends to the workqueue path and
 * finish successful ones directly.
 *
 * NOTE(review): the name suggests this is installed as the bio's
 * completion callback; the registration is not visible here.
 *
 * @bio: the bio whose ioend is looked up via iomap_ioend_from_bio().
 */
static void ioend_writeback_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
ioend->io_error = blk_status_to_errno(bio->bi_status);
/* Failed I/O is handed to iomap_fail_ioend_buffered() instead. */
if (ioend->io_error) {
iomap_fail_ioend_buffered(ioend);
return;
}
iomap_finish_ioend_buffered(ioend);
}

View File

@@ -247,7 +247,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
j += i * bits_per_zone;
if (!j || j > sbi->s_ninodes) {
iput(inode);
return ERR_PTR(-ENOSPC);
return ERR_PTR(-EFSCORRUPTED);
}
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = j;

View File

@@ -1531,23 +1531,33 @@ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
struct mount *mnt;
down_read(&namespace_sem);
return mnt_find_id_at(p->ns, *pos);
mnt = mnt_find_id_at(p->ns, *pos);
if (mnt)
*pos = mnt->mnt_id_unique;
return mnt;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct mount *next = NULL, *mnt = v;
struct mount *mnt = v;
struct rb_node *node = rb_next(&mnt->mnt_node);
++*pos;
if (node) {
next = node_to_mount(node);
struct mount *next = node_to_mount(node);
*pos = next->mnt_id_unique;
return next;
}
return next;
/*
* No more mounts. Set pos past current mount's ID so that if
* iteration restarts, mnt_find_id_at() returns NULL.
*/
*pos = mnt->mnt_id_unique + 1;
return NULL;
}
static void m_stop(struct seq_file *m, void *v)
@@ -2791,7 +2801,8 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
}
static void lock_mount_exact(const struct path *path,
struct pinned_mountpoint *mp);
struct pinned_mountpoint *mp, bool copy_mount,
unsigned int copy_flags);
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
@@ -2799,7 +2810,10 @@ static void lock_mount_exact(const struct path *path,
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
#define LOCK_MOUNT_EXACT(mp, path) \
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
lock_mount_exact((path), &mp)
lock_mount_exact((path), &mp, false, 0)
#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
lock_mount_exact((path), &mp, true, (copy_flags))
static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
{
@@ -3073,16 +3087,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
return file;
}
DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
{
struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
struct path to_path __free(path_put) = {};
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct user_namespace *user_ns = current_user_ns();
struct mount *new_ns_root;
struct mnt_namespace *new_ns;
struct mount *new_ns_root, *old_ns_root;
struct path to_path;
struct mount *mnt;
unsigned int copy_flags = 0;
bool locked = false;
@@ -3094,71 +3105,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
if (IS_ERR(new_ns))
return ERR_CAST(new_ns);
scoped_guard(namespace_excl) {
new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
if (IS_ERR(new_ns_root))
return ERR_CAST(new_ns_root);
old_ns_root = ns->root;
to_path.mnt = &old_ns_root->mnt;
to_path.dentry = old_ns_root->mnt.mnt_root;
/*
* If the real rootfs had a locked mount on top of it somewhere
* in the stack, lock the new mount tree as well so it can't be
* exposed.
*/
mnt = ns->root;
while (mnt->overmount) {
mnt = mnt->overmount;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
locked = true;
}
VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);
LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
if (IS_ERR(mp.parent)) {
free_mnt_ns(new_ns);
return ERR_CAST(mp.parent);
}
new_ns_root = mp.parent;
/*
* If the real rootfs had a locked mount on top of it somewhere
* in the stack, lock the new mount tree as well so it can't be
* exposed.
*/
mnt = old_ns_root;
while (mnt->overmount) {
mnt = mnt->overmount;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
locked = true;
}
/*
* We dropped the namespace semaphore so we can actually lock
* the copy for mounting. The copied mount isn't attached to any
* mount namespace and it is thus excluded from any propagation.
* So realistically we're isolated and the mount can't be
* overmounted.
*/
/* Borrow the reference from clone_mnt(). */
to_path.mnt = &new_ns_root->mnt;
to_path.dentry = dget(new_ns_root->mnt.mnt_root);
/* Now lock for actual mounting. */
LOCK_MOUNT_EXACT(mp, &to_path);
if (unlikely(IS_ERR(mp.parent)))
return ERR_CAST(mp.parent);
/*
* We don't emulate unshare()ing a mount namespace. We stick to the
* restrictions of creating detached bind-mounts. It has a lot
* saner and simpler semantics.
* We don't emulate unshare()ing a mount namespace. We stick
* to the restrictions of creating detached bind-mounts. It
* has a lot saner and simpler semantics.
*/
mnt = __do_loopback(path, flags, copy_flags);
if (IS_ERR(mnt))
return ERR_CAST(mnt);
scoped_guard(mount_writer) {
if (IS_ERR(mnt)) {
emptied_ns = new_ns;
umount_tree(new_ns_root, 0);
return ERR_CAST(mnt);
}
if (locked)
mnt->mnt.mnt_flags |= MNT_LOCKED;
/*
* Now mount the detached tree on top of the copy of the
* real rootfs we created.
* now mount the detached tree on top of the copy
* of the real rootfs we created.
*/
attach_mnt(mnt, new_ns_root, mp.mp);
if (user_ns != ns->user_ns)
lock_mnt_tree(new_ns_root);
}
/* Add all mounts to the new namespace. */
for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
mnt_add_to_ns(new_ns, p);
for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
mnt_add_to_ns(new_ns, mnt);
new_ns->nr_mounts++;
}
new_ns->root = real_mount(no_free_ptr(to_path.mnt));
new_ns->root = new_ns_root;
ns_tree_add_raw(new_ns);
return no_free_ptr(new_ns);
return new_ns;
}
static struct file *open_new_namespace(struct path *path, unsigned int flags)
@@ -3840,16 +3843,20 @@ static int do_new_mount(const struct path *path, const char *fstype,
}
static void lock_mount_exact(const struct path *path,
struct pinned_mountpoint *mp)
struct pinned_mountpoint *mp, bool copy_mount,
unsigned int copy_flags)
{
struct dentry *dentry = path->dentry;
int err;
/* Assert that inode_lock() locked the correct inode. */
VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path));
inode_lock(dentry->d_inode);
namespace_lock();
if (unlikely(cant_mount(dentry)))
err = -ENOENT;
else if (path_overmounted(path))
else if (!copy_mount && path_overmounted(path))
err = -EBUSY;
else
err = get_mountpoint(dentry, mp);
@@ -3857,9 +3864,15 @@ static void lock_mount_exact(const struct path *path,
namespace_unlock();
inode_unlock(dentry->d_inode);
mp->parent = ERR_PTR(err);
} else {
mp->parent = real_mount(path->mnt);
return;
}
if (copy_mount)
mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags);
else
mp->parent = real_mount(path->mnt);
if (unlikely(IS_ERR(mp->parent)))
__unlock_mount(mp);
}
int finish_automount(struct vfsmount *__m, const struct path *path)
@@ -5678,6 +5691,8 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
s->mnt = mnt_file->f_path.mnt;
ns = real_mount(s->mnt)->mnt_ns;
if (IS_ERR(ns))
return PTR_ERR(ns);
if (!ns)
/*
* We can't set mount point and mnt_ns_id since we don't have a

View File

@@ -608,9 +608,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct user_namespace *user_ns;
user_ns = task_cred_xxx(task, user_ns);
if (!ns_ref_get(user_ns))
break;
ns_common = to_ns_common(user_ns);
if (ns_ref_get(user_ns))
ns_common = to_ns_common(user_ns);
}
#endif
break;
@@ -620,9 +619,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct pid_namespace *pid_ns;
pid_ns = task_active_pid_ns(task);
if (!ns_ref_get(pid_ns))
break;
ns_common = to_ns_common(pid_ns);
if (ns_ref_get(pid_ns))
ns_common = to_ns_common(pid_ns);
}
#endif
break;

View File

@@ -2128,6 +2128,9 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
ino_t ino = 1;
child = try_lookup_noperm(&qname, dir);
if (IS_ERR(child))
goto end_instantiate;
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);

View File

@@ -495,19 +495,6 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
fsnotify_dentry(dentry, mask);
}
static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
int error)
{
struct fs_error_report report = {
.error = error,
.inode = inode,
.sb = sb,
};
return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
NULL, NULL, NULL, 0);
}
static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt)
{
fsnotify_mnt(FS_MNT_ATTACH, ns, mnt);

View File

@@ -3085,7 +3085,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
if (fs->users == 1)
if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);