From cb184dd19154fc486fa3d9e02afe70a97e54e055 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Fri, 6 Feb 2026 14:20:28 +0800 Subject: [PATCH 01/13] fs: init flags_valid before calling vfs_fileattr_get syzbot reported an uninit-value bug in [1]. Similar to the "*get" context where the kernel's internal file_kattr structure is initialized before calling vfs_fileattr_get(), we should use the same mechanism when using fa. [1] BUG: KMSAN: uninit-value in fuse_fileattr_get+0xeb4/0x1450 fs/fuse/ioctl.c:517 fuse_fileattr_get+0xeb4/0x1450 fs/fuse/ioctl.c:517 vfs_fileattr_get fs/file_attr.c:94 [inline] __do_sys_file_getattr fs/file_attr.c:416 [inline] Local variable fa.i created at: __do_sys_file_getattr fs/file_attr.c:380 [inline] __se_sys_file_getattr+0x8c/0xbd0 fs/file_attr.c:372 Reported-by: syzbot+7c31755f2cea07838b0c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7c31755f2cea07838b0c Tested-by: syzbot+7c31755f2cea07838b0c@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Link: https://patch.msgid.link/tencent_B6C4583771D76766D71362A368696EC3B605@qq.com Signed-off-by: Christian Brauner --- fs/file_attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file_attr.c b/fs/file_attr.c index 42721427245a..0e9b4f737546 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -376,7 +376,7 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct path filepath __free(path_put) = {}; unsigned int lookup_flags = 0; struct file_attr fattr; - struct file_kattr fa; + struct file_kattr fa = { .flags_valid = true }; /* hint only */ int error; BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); From 9eed043d10f17301c1b5141e16bb98a85a8fd07e Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 3 Feb 2026 17:40:14 +0800 Subject: [PATCH 02/13] writeback: Fix wakeup and logging timeouts for !DETECT_HUNG_TASK Recent changes of fs-writeback cause such warnings if DETECT_HUNG_TASK is not enabled: INFO:
The task sync:1342 has been waiting for writeback completion for more than 1 seconds. The reason is that sysctl_hung_task_timeout_secs is 0 when DETECT_HUNG_TASK is not enabled, which causes the warning message even if the writeback lasts for only one second. Guarding the wakeup and logging with "#ifdef CONFIG_DETECT_HUNG_TASK" can eliminate the warning messages. But on the other hand, it is possible for sysctl_hung_task_timeout_secs to also be 0 when DETECT_HUNG_TASK is enabled. So let's just check the value of sysctl_hung_task_timeout_secs to decide whether to do the wakeup and logging. Fixes: 1888635532fb ("writeback: Wake up waiting tasks when finishing the writeback of a chunk.") Fixes: d6e621590764 ("writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs)") Signed-off-by: Huacai Chen Link: https://patch.msgid.link/20260203094014.2273240-1-chenhuacai@loongson.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 68228bf89b82..eb1ca86bf30a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -198,10 +198,11 @@ static void wb_queue_work(struct bdi_writeback *wb, static bool wb_wait_for_completion_cb(struct wb_completion *done) { + unsigned long timeout = sysctl_hung_task_timeout_secs; unsigned long waited_secs = (jiffies - done->wait_start) / HZ; done->progress_stamp = jiffies; - if (waited_secs > sysctl_hung_task_timeout_secs) + if (timeout && (waited_secs > timeout)) pr_info("INFO: The task %s:%d has been waiting for writeback " "completion for more than %lu seconds.", current->comm, current->pid, waited_secs); @@ -1955,6 +1956,7 @@ static long writeback_sb_inodes(struct super_block *sb, .range_end = LLONG_MAX, }; unsigned long start_time = jiffies; + unsigned long timeout = sysctl_hung_task_timeout_secs; long write_chunk; long total_wrote = 0; /* count both pages and inodes */ unsigned long
dirtied_before = jiffies; @@ -2041,9 +2043,8 @@ static long writeback_sb_inodes(struct super_block *sb, __writeback_single_inode(inode, &wbc); /* Report progress to inform the hung task detector of the progress. */ - if (work->done && work->done->progress_stamp && - (jiffies - work->done->progress_stamp) > HZ * - sysctl_hung_task_timeout_secs / 2) + if (work->done && work->done->progress_stamp && timeout && + (jiffies - work->done->progress_stamp) > HZ * timeout / 2) wake_up_all(work->done->waitq); wbc_detach_inode(&wbc); From 81f16c9778d730f573d0d565706bb7227e2405f4 Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Fri, 13 Feb 2026 18:30:06 +0800 Subject: [PATCH 03/13] statmount: Fix the null-ptr-deref in do_statmount() If the mount is internal, its mnt_ns will be MNT_NS_INTERNAL, which is defined as ERR_PTR(-EINVAL). So do_statmount() needs to check the mount's ns with IS_ERR() and return the error. Fixes: 0e5032237ee5 ("statmount: accept fd as a parameter") Reported-by: syzbot+9e03a9535ea65f687a44@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/698e287a.a70a0220.2c38d7.009e.GAE@google.com/ Signed-off-by: Qing Wang Link: https://patch.msgid.link/20260213103006.2472569-1-wangqing7171@gmail.com Reviewed-by: Bhavik Sachdev Signed-off-by: Christian Brauner --- fs/namespace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index a67cbe42746d..90700df65f0d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5678,6 +5678,8 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, s->mnt = mnt_file->f_path.mnt; ns = real_mount(s->mnt)->mnt_ns; + if (IS_ERR(ns)) + return PTR_ERR(ns); if (!ns) /* * We can't set mount point and mnt_ns_id since we don't have a From ac83896172798cf82ebc643cf555aa4cdd3a07da Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 13 Feb 2026 02:28:12 +0000 Subject: [PATCH 04/13] iomap: Describe @private in iomap_readahead() The kernel test robot reports the kernel-doc warning: ``` Warning:
fs/iomap/buffered-io.c:624 function parameter 'private' not described in 'iomap_readahead' ``` The former commit in "iomap: stash iomap read ctx in the private field of iomap_iter" has added a new parameter @private to iomap_readahead(), so let's describe the parameter. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601261111.vIL9rhgD-lkp@intel.com/ Fixes: 8806f279244b ("iomap: stash iomap read ctx in the private field of iomap_iter") Signed-off-by: Hongbo Li Link: https://patch.msgid.link/20260213022812.766187-1-lihongbo22@huawei.com Reviewed-by: Gao Xiang Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1fe19b4ee2f4..58887513b894 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -625,6 +625,7 @@ static int iomap_readahead_iter(struct iomap_iter *iter, * iomap_readahead - Attempt to read pages from a file. * @ops: The operations vector for the filesystem. * @ctx: The ctx used for issuing readahead. + * @private: The filesystem-specific information for issuing iomap_iter. * * This function is for filesystems to call to implement their readahead * address_space operation. From a41dbf5e004edbe1260883c43a8bd134d9cb0c1c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 14 Feb 2026 16:22:13 +0100 Subject: [PATCH 05/13] mount: hold namespace_sem across copy in create_new_namespace() Fix an oversight when creating a new mount namespace. If someone had the bright idea to make the real rootfs a shared or dependent mount and it is later copied the copy will become a peer of the old real rootfs mount or a dependent mount of it. The namespace semaphore is dropped and we use mount lock exact to lock the new real root mount. If that fails or the subsequent do_loopback() fails we rely on the copy of the real root mount to be cleaned up by path_put(). 
The problem is that this doesn't deal with mount propagation and will leave the mounts linked in the propagation lists. When creating a new mount namespace create_new_namespace() first acquires namespace_sem to clone the nullfs root, drops it, then reacquires it via LOCK_MOUNT_EXACT which takes inode_lock first to respect the inode_lock -> namespace_sem lock ordering. This drop-and-reacquire pattern is fragile and was the source of the propagation cleanup bug fixed in the preceding commit. Extend lock_mount_exact() with a copy_mount mode that clones the mount under the locks atomically. When copy_mount is true, path_overmounted() is skipped since we're copying the mount, not mounting on top of it - the nullfs root always has rootfs mounted on top so the check would always fail. If clone_mnt() fails after get_mountpoint() has pinned the mountpoint, __unlock_mount() is used to properly unpin the mountpoint and release both locks. This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which takes inode_lock and namespace_sem once and holds them throughout the clone and subsequent mount operations, eliminating the drop-and-reacquire pattern entirely. 
Reported-by: syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com Fixes: 9b8a0ba68246 ("mount: add OPEN_TREE_NAMESPACE") # mainline only Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com Signed-off-by: Christian Brauner --- fs/namespace.c | 117 +++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 57 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 90700df65f0d..022e59afcb5e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2791,7 +2791,8 @@ static inline void unlock_mount(struct pinned_mountpoint *m) } static void lock_mount_exact(const struct path *path, - struct pinned_mountpoint *mp); + struct pinned_mountpoint *mp, bool copy_mount, + unsigned int copy_flags); #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ @@ -2799,7 +2800,10 @@ static void lock_mount_exact(const struct path *path, #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) #define LOCK_MOUNT_EXACT(mp, path) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ - lock_mount_exact((path), &mp) + lock_mount_exact((path), &mp, false, 0) +#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + lock_mount_exact((path), &mp, true, (copy_flags)) static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { @@ -3073,16 +3077,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags) return file; } -DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *, - if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T)) - static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) { - struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL; - struct path to_path __free(path_put) = {}; struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct user_namespace *user_ns = current_user_ns(); - struct mount *new_ns_root; + struct 
mnt_namespace *new_ns; + struct mount *new_ns_root, *old_ns_root; + struct path to_path; struct mount *mnt; unsigned int copy_flags = 0; bool locked = false; @@ -3094,71 +3095,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in if (IS_ERR(new_ns)) return ERR_CAST(new_ns); - scoped_guard(namespace_excl) { - new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); - if (IS_ERR(new_ns_root)) - return ERR_CAST(new_ns_root); + old_ns_root = ns->root; + to_path.mnt = &old_ns_root->mnt; + to_path.dentry = old_ns_root->mnt.mnt_root; - /* - * If the real rootfs had a locked mount on top of it somewhere - * in the stack, lock the new mount tree as well so it can't be - * exposed. - */ - mnt = ns->root; - while (mnt->overmount) { - mnt = mnt->overmount; - if (mnt->mnt.mnt_flags & MNT_LOCKED) - locked = true; - } + VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type); + + LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags); + if (IS_ERR(mp.parent)) { + free_mnt_ns(new_ns); + return ERR_CAST(mp.parent); + } + new_ns_root = mp.parent; + + /* + * If the real rootfs had a locked mount on top of it somewhere + * in the stack, lock the new mount tree as well so it can't be + * exposed. + */ + mnt = old_ns_root; + while (mnt->overmount) { + mnt = mnt->overmount; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + locked = true; } /* - * We dropped the namespace semaphore so we can actually lock - * the copy for mounting. The copied mount isn't attached to any - * mount namespace and it is thus excluded from any propagation. - * So realistically we're isolated and the mount can't be - * overmounted. - */ - - /* Borrow the reference from clone_mnt(). */ - to_path.mnt = &new_ns_root->mnt; - to_path.dentry = dget(new_ns_root->mnt.mnt_root); - - /* Now lock for actual mounting. */ - LOCK_MOUNT_EXACT(mp, &to_path); - if (unlikely(IS_ERR(mp.parent))) - return ERR_CAST(mp.parent); - - /* - * We don't emulate unshare()ing a mount namespace. 
We stick to the - * restrictions of creating detached bind-mounts. It has a lot - * saner and simpler semantics. + * We don't emulate unshare()ing a mount namespace. We stick + * to the restrictions of creating detached bind-mounts. It + * has a lot saner and simpler semantics. */ mnt = __do_loopback(path, flags, copy_flags); - if (IS_ERR(mnt)) - return ERR_CAST(mnt); - scoped_guard(mount_writer) { + if (IS_ERR(mnt)) { + emptied_ns = new_ns; + umount_tree(new_ns_root, 0); + return ERR_CAST(mnt); + } + if (locked) mnt->mnt.mnt_flags |= MNT_LOCKED; /* - * Now mount the detached tree on top of the copy of the - * real rootfs we created. + * now mount the detached tree on top of the copy + * of the real rootfs we created. */ attach_mnt(mnt, new_ns_root, mp.mp); if (user_ns != ns->user_ns) lock_mnt_tree(new_ns_root); } - /* Add all mounts to the new namespace. */ - for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) { - mnt_add_to_ns(new_ns, p); + for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) { + mnt_add_to_ns(new_ns, mnt); new_ns->nr_mounts++; } - new_ns->root = real_mount(no_free_ptr(to_path.mnt)); + new_ns->root = new_ns_root; ns_tree_add_raw(new_ns); - return no_free_ptr(new_ns); + return new_ns; } static struct file *open_new_namespace(struct path *path, unsigned int flags) @@ -3840,16 +3833,20 @@ static int do_new_mount(const struct path *path, const char *fstype, } static void lock_mount_exact(const struct path *path, - struct pinned_mountpoint *mp) + struct pinned_mountpoint *mp, bool copy_mount, + unsigned int copy_flags) { struct dentry *dentry = path->dentry; int err; + /* Assert that inode_lock() locked the correct inode. 
*/ + VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path)); + inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) err = -ENOENT; - else if (path_overmounted(path)) + else if (!copy_mount && path_overmounted(path)) err = -EBUSY; else err = get_mountpoint(dentry, mp); @@ -3857,9 +3854,15 @@ static void lock_mount_exact(const struct path *path, namespace_unlock(); inode_unlock(dentry->d_inode); mp->parent = ERR_PTR(err); - } else { - mp->parent = real_mount(path->mnt); + return; } + + if (copy_mount) + mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags); + else + mp->parent = real_mount(path->mnt); + if (unlikely(IS_ERR(mp->parent))) + __unlock_mount(mp); } int finish_automount(struct vfsmount *__m, const struct path *path) From 4a403d7aa9074f527f064ef0806aaab38d14b07c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 29 Jan 2026 14:52:22 +0100 Subject: [PATCH 06/13] namespace: fix proc mount iteration The m->index isn't updated when m->show() overflows and retains its value before the current mount causing a restart to start at the same value. If that happens in short order due to a quickly expanding mount table this would cause the same mount to be shown again and again. Ensure that *pos always equals the mount id of the mount that was returned by start/next. On restart after overflow mnt_find_id_at(*pos) finds the exact mount. This should avoid duplicates, avoid skips and should handle concurrent modification just fine.
Cc: Fixes: 2eea9ce4310d8 ("mounts: keep list of mounts in an rbtree") Link: https://patch.msgid.link/20260129-geleckt-treuhand-4bb940acacd9@brauner Signed-off-by: Christian Brauner --- fs/namespace.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 022e59afcb5e..31483c66ace8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1531,23 +1531,33 @@ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; + struct mount *mnt; down_read(&namespace_sem); - return mnt_find_id_at(p->ns, *pos); + mnt = mnt_find_id_at(p->ns, *pos); + if (mnt) + *pos = mnt->mnt_id_unique; + return mnt; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct mount *next = NULL, *mnt = v; + struct mount *mnt = v; struct rb_node *node = rb_next(&mnt->mnt_node); - ++*pos; if (node) { - next = node_to_mount(node); + struct mount *next = node_to_mount(node); *pos = next->mnt_id_unique; + return next; } - return next; + + /* + * No more mounts. Set pos past current mount's ID so that if + * iteration restarts, mnt_find_id_at() returns NULL. + */ + *pos = mnt->mnt_id_unique + 1; + return NULL; } static void m_stop(struct seq_file *m, void *v) From ef0b64741a53e47ce8022c973099e969094aa536 Mon Sep 17 00:00:00 2001 From: Jori Koolstra Date: Mon, 1 Dec 2025 13:23:38 +0100 Subject: [PATCH 07/13] minix: Correct errno in minix_new_inode The cases (!j || j > sbi->s_ninodes) can never occur unless the filesystem is broken, so this should not return ENOSPC, but EFSCORRUPTED.
Signed-off-by: Jori Koolstra Link: https://patch.msgid.link/20251201122338.90568-1-jkoolstra@xs4all.nl Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/minix/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 7da66ca184f4..abec438330a7 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -247,7 +247,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode) j += i * bits_per_zone; if (!j || j > sbi->s_ninodes) { iput(inode); - return ERR_PTR(-ENOSPC); + return ERR_PTR(-EFSCORRUPTED); } inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; From 6c4b2243cb6c0755159bd567130d5e12e7b10d9f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 7 Feb 2026 08:25:24 +0000 Subject: [PATCH 08/13] unshare: fix unshare_fs() handling There's an unpleasant corner case in unshare(2), when we have a CLONE_NEWNS in flags and current->fs hadn't been shared at all; in that case copy_mnt_ns() gets passed current->fs instead of a private copy, which causes interesting warts in the proof of correctness. (Quoted from the review discussion: "I guess if private means fs->users == 1, the condition could still be true.") Unfortunately, it's worse than just a convoluted proof of correctness. Consider the case when we have CLONE_NEWCGROUP in addition to CLONE_NEWNS (and current->fs->users == 1). We pass current->fs to copy_mnt_ns(), all right. Suppose it succeeds and flips current->fs->{pwd,root} to corresponding locations in the new namespace. Now we proceed to copy_cgroup_ns(), which fails (e.g. with -ENOMEM). We call put_mnt_ns() on the namespace created by copy_mnt_ns(), it's destroyed and its mount tree is dissolved, but... current->fs->root and current->fs->pwd are both left pointing to now detached mounts. They are pinning those, so it's not a UAF, but it leaves the calling process with unshare(2) failing with -ENOMEM _and_ leaving it with pwd and root on detached isolated mounts. The last part is clearly a bug.
There is other fun related to that mess (races with pivot_root(), including the one between pivot_root() and fork(), of all things), but this one is easy to isolate and fix - treat CLONE_NEWNS as "allocate a new fs_struct even if it hadn't been shared in the first place". Sure, we could go for something like "if both CLONE_NEWNS *and* one of the things that might end up failing after copy_mnt_ns() call in create_new_namespaces() are set, force allocation of new fs_struct", but let's keep it simple - the cost of copy_fs_struct() is trivial. Another benefit is that copy_mnt_ns() with CLONE_NEWNS *always* gets a freshly allocated fs_struct, yet to be attached to anything. That seriously simplifies the analysis... FWIW, that bug had been there since the introduction of unshare(2) ;-/ Signed-off-by: Al Viro Link: https://patch.msgid.link/20260207082524.GE3183987@ZenIV Tested-by: Waiman Long Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index e832da9d15a4..65113a304518 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -3085,7 +3085,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; /* don't need lock here; in the worst case we'll do useless copy */ - if (fs->users == 1) + if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1) return 0; *new_fsp = copy_fs_struct(fs); From 249013e673fce3506c61063c7cbedd75b4c668d8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 22:09:21 -0800 Subject: [PATCH 09/13] fsnotify: drop unused helper Remove this helper now that all users have been converted to fserror_report_metadata as of 7.0-rc1. Cc: jack@suse.cz Cc: amir73il@gmail.com Signed-off-by: Darrick J. 
Wong Link: https://patch.msgid.link/177148129543.716249.980530449513340111.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 28a9cb13fbfa..079c18bcdbde 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -495,19 +495,6 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) fsnotify_dentry(dentry, mask); } -static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, - int error) -{ - struct fs_error_report report = { - .error = error, - .inode = inode, - .sb = sb, - }; - - return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, - NULL, NULL, NULL, 0); -} - static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); From 294f54f849d846f4643a67db9b41b63867dc8bfe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 22:09:37 -0800 Subject: [PATCH 10/13] fserror: fix lockdep complaint when igrabbing inode Christoph Hellwig reported a lockdep splat in generic/108: ================================ WARNING: inconsistent lock state 6.19.0+ #4827 Tainted: G N -------------------------------- inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. 
swapper/1/0 [HC1[1]:SC0[0]:HE0:SE1] takes: ffff88811ed1b140 (&sb->s_type->i_lock_key#33){?.+.}-{3:3}, at: igrab+0x1a/0xb0 {HARDIRQ-ON-W} state was registered at: lock_acquire+0xca/0x2c0 _raw_spin_lock+0x2e/0x40 unlock_new_inode+0x2c/0xc0 xfs_iget+0xcf4/0x1080 xfs_trans_metafile_iget+0x3d/0x100 xfs_metafile_iget+0x2b/0x50 xfs_mount_setup_metadir+0x20/0x60 xfs_mountfs+0x457/0xa60 xfs_fs_fill_super+0x6b3/0xa90 get_tree_bdev_flags+0x13c/0x1e0 vfs_get_tree+0x27/0xe0 vfs_cmd_create+0x54/0xe0 __do_sys_fsconfig+0x309/0x620 do_syscall_64+0x8b/0xf80 entry_SYSCALL_64_after_hwframe+0x76/0x7e irq event stamp: 139080 hardirqs last enabled at (139079): [] do_idle+0x1ec/0x270 hardirqs last disabled at (139080): [] common_interrupt+0x19/0xe0 softirqs last enabled at (139032): [] __irq_exit_rcu+0xc3/0x120 softirqs last disabled at (139025): [] __irq_exit_rcu+0xc3/0x120 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&sb->s_type->i_lock_key#33); lock(&sb->s_type->i_lock_key#33); *** DEADLOCK *** 1 lock held by swapper/1/0: #0: ffff8881052c81a0 (&vblk->vqs[i].lock){-.-.}-{3:3}, at: virtblk_done+0x4b/0x110 stack backtrace: CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Tainted: G N 6.19.0+ #4827 PREEMPT(full) Tainted: [N]=TEST Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x5b/0x80 print_usage_bug.part.0+0x22c/0x2c0 mark_lock+0xa6f/0xe90 __lock_acquire+0x10b6/0x25e0 lock_acquire+0xca/0x2c0 _raw_spin_lock+0x2e/0x40 igrab+0x1a/0xb0 fserror_report+0x135/0x260 iomap_finish_ioend_buffered+0x170/0x210 clone_endio+0x8f/0x1c0 blk_update_request+0x1e4/0x4d0 blk_mq_end_request+0x1b/0x100 virtblk_done+0x6f/0x110 vring_interrupt+0x59/0x80 __handle_irq_event_percpu+0x8a/0x2e0 handle_irq_event+0x33/0x70 handle_edge_irq+0xdd/0x1e0 __common_interrupt+0x6f/0x180 common_interrupt+0xb7/0xe0 It looks like the concern here is that inode::i_lock is sometimes taken in IRQ context, 
and sometimes it is held when going to IRQ context, though it's a little difficult to tell since I think this is a kernel from after the actual 6.19 release but before 7.0-rc1. Either way, we don't need to take i_lock, because filesystems should not report files to fserror if they're about to be freed or have not yet been exposed to other threads, because the resulting fsnotify report will be meaningless. Therefore, bump inode::i_count directly and clarify the preconditions on the inode being passed in. Link: https://lore.kernel.org/linux-fsdevel/aY7BndIgQg3ci_6s@infradead.org/ Reported-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/177148129564.716249.3069780698231701540.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/ioend.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index e4d57cb969f1..4d1ef8a2cee9 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -69,11 +69,57 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) return folio_count; } +static DEFINE_SPINLOCK(failed_ioend_lock); +static LIST_HEAD(failed_ioend_list); + +static void +iomap_fail_ioends( + struct work_struct *work) +{ + struct iomap_ioend *ioend; + struct list_head tmp; + unsigned long flags; + + spin_lock_irqsave(&failed_ioend_lock, flags); + list_replace_init(&failed_ioend_list, &tmp); + spin_unlock_irqrestore(&failed_ioend_lock, flags); + + while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, + io_list))) { + list_del_init(&ioend->io_list); + iomap_finish_ioend_buffered(ioend); + cond_resched(); + } +} + +static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends); + +static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend) +{ + unsigned long flags; + + /* + * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions + * in the fserror code. 
The caller no longer owns the ioend reference + * after the spinlock drops. + */ + spin_lock_irqsave(&failed_ioend_lock, flags); + if (list_empty(&failed_ioend_list)) + WARN_ON_ONCE(!schedule_work(&failed_ioend_work)); + list_add_tail(&ioend->io_list, &failed_ioend_list); + spin_unlock_irqrestore(&failed_ioend_lock, flags); +} + static void ioend_writeback_end_bio(struct bio *bio) { struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); ioend->io_error = blk_status_to_errno(bio->bi_status); + if (ioend->io_error) { + iomap_fail_ioend_buffered(ioend); + return; + } + iomap_finish_ioend_buffered(ioend); } From f6a495484a27150fb85f943e1a7464da88c2a797 Mon Sep 17 00:00:00 2001 From: Ethan Tidmore Date: Thu, 19 Feb 2026 16:10:01 -0600 Subject: [PATCH 11/13] proc: Fix pointer error dereference The function try_lookup_noperm() can return an error pointer. Add check for error pointer. Detected by Smatch: fs/proc/base.c:2148 proc_fill_cache() error: 'child' dereferencing possible ERR_PTR() Fixes: 1df98b8bbcca ("proc_fill_cache(): clean up, get rid of pointless find_inode_number() use") Signed-off-by: Ethan Tidmore Link: https://patch.msgid.link/20260219221001.1117135-1-ethantidmore06@gmail.com Signed-off-by: Christian Brauner --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 4eec684baca9..4c863d17dfb4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2128,6 +2128,9 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, ino_t ino = 1; child = try_lookup_noperm(&qname, dir); + if (IS_ERR(child)) + goto end_instantiate; + if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); From fdcfce93073d990ed4b71752e31ad1c1d6e9d58b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 23 Feb 2026 20:59:33 +0100 Subject: [PATCH 12/13] eventpoll: Fix integer overflow in ep_loop_check_proc() If a recursive call to ep_loop_check_proc() hits the `result = INT_MAX`, an integer 
overflow will occur in the calling ep_loop_check_proc() at `result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1)`, breaking the recursion depth check. Fix it by using a different placeholder value that can't lead to an overflow. Reported-by: Guenter Roeck Fixes: f2e467a48287 ("eventpoll: Fix semi-unbounded recursion") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Link: https://patch.msgid.link/20260223-epoll-int-overflow-v1-1-452f35132224@google.com Signed-off-by: Christian Brauner --- fs/eventpoll.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6c36d9dc6926..d20917b03161 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2061,7 +2061,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. + * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found + * a loop or went too deep. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { @@ -2080,7 +2081,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) - result = INT_MAX; + result = EP_MAX_NESTS+1; else result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); if (result > EP_MAX_NESTS) From 4a1ddb0f1c48c2b56f21d8b5200e2e29adf4c1df Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Feb 2026 12:09:00 +0100 Subject: [PATCH 13/13] pidfs: avoid misleading break The break would only break out of the scoped_guard() loop, not the switch statement. It still works correctly as is, of course, but let's avoid the confusion.
Reported-by: David Lechner Link: https://lore.kernel.org/cd2153f1-098b-463c-bbc1-5c6ca9ef1f12@baylibre.com Signed-off-by: Christian Brauner --- fs/pidfs.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 1e20e36e0ed5..21f9f011a957 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -577,9 +577,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct user_namespace *user_ns; user_ns = task_cred_xxx(task, user_ns); - if (!ns_ref_get(user_ns)) - break; - ns_common = to_ns_common(user_ns); + if (ns_ref_get(user_ns)) + ns_common = to_ns_common(user_ns); } #endif break; @@ -589,9 +588,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct pid_namespace *pid_ns; pid_ns = task_active_pid_ns(task); - if (!ns_ref_get(pid_ns)) - break; - ns_common = to_ns_common(pid_ns); + if (ns_ref_get(pid_ns)) + ns_common = to_ns_common(pid_ns); } #endif break;