From 23e490336467fcdaf95e1efcf8f58067b59f647b Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sat, 1 Mar 2025 11:43:56 +0100
Subject: [PATCH 1/6] fs: predict no error in close()

The vast majority of the time the system call returns 0. Letting the
compiler know shortens the routine (119 -> 116) and the fast path.

Disasm starting at the call to __fput_sync():

before:
<+55>: call 0xffffffff816b0da0 <__fput_sync>
<+60>: lea 0x201(%rbx),%eax
<+66>: cmp $0x1,%eax
<+69>: jbe 0xffffffff816ab707 <__x64_sys_close+103>
<+71>: mov %ebx,%edx
<+73>: movslq %ebx,%rax
<+76>: and $0xfffffffd,%edx
<+79>: cmp $0xfffffdfc,%edx
<+85>: mov $0xfffffffffffffffc,%rdx
<+92>: cmove %rdx,%rax
<+96>: pop %rbx
<+97>: pop %rbp
<+98>: jmp 0xffffffff82242fa0 <__x86_return_thunk>
<+103>: mov $0xfffffffffffffffc,%rax
<+110>: jmp 0xffffffff816ab700 <__x64_sys_close+96>
<+112>: mov $0xfffffffffffffff7,%rax
<+119>: jmp 0xffffffff816ab700 <__x64_sys_close+96>

after:
<+56>: call 0xffffffff816b0da0 <__fput_sync>
<+61>: xor %eax,%eax
<+63>: test %ebp,%ebp
<+65>: jne 0xffffffff816ab6ea <__x64_sys_close+74>
<+67>: pop %rbx
<+68>: pop %rbp
<+69>: jmp 0xffffffff82242fa0 <__x86_return_thunk> # the jmp out
<+74>: lea 0x201(%rbp),%edx
<+80>: mov $0xfffffffffffffffc,%rax
<+87>: cmp $0x1,%edx
<+90>: jbe 0xffffffff816ab6e3 <__x64_sys_close+67>
<+92>: mov %ebp,%edx
<+94>: and $0xfffffffd,%edx
<+97>: cmp $0xfffffdfc,%edx
<+103>: cmovne %rbp,%rax
<+107>: jmp 0xffffffff816ab6e3 <__x64_sys_close+67>
<+109>: mov $0xfffffffffffffff7,%rax
<+116>: jmp 0xffffffff816ab6e3 <__x64_sys_close+67>

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250301104356.246031-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 fs/open.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index 932e5a6de63b..a8a5f843e3cf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1578,11 +1578,14 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 	 */
 	__fput_sync(file);
 
+	if (likely(retval == 0))
+		return 0;
+
 	/* can't restart close syscall because file table entry was cleared */
-	if (unlikely(retval == -ERESTARTSYS ||
-		     retval == -ERESTARTNOINTR ||
-		     retval == -ERESTARTNOHAND ||
-		     retval == -ERESTART_RESTARTBLOCK))
+	if (retval == -ERESTARTSYS ||
+	    retval == -ERESTARTNOINTR ||
+	    retval == -ERESTARTNOHAND ||
+	    retval == -ERESTART_RESTARTBLOCK)
 		retval = -EINTR;
 
 	return retval;

From e83588458f656417d9b7ac45baf1c7b45790059b Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 5 Mar 2025 13:36:41 +0100
Subject: [PATCH 2/6] file: add fput and file_ref_put routines optimized for
 use when closing a fd

The vast majority of the time, closing a file descriptor also operates
on the last reference, where a regular fput usage will result in 2
atomics. This can be changed to only suffer 1.

See commentary above file_ref_put_close() for more information.

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250305123644.554845-2-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 fs/file.c                | 41 ++++++++++++----------
 fs/file_table.c          | 74 +++++++++++++++++++++++++++-------------
 fs/internal.h            |  3 ++
 include/linux/file_ref.h | 34 ++++++++++++++++++
 4 files changed, 111 insertions(+), 41 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index d868cdb95d1e..3fef798b96e5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,6 +26,28 @@
 
 #include "internal.h"
 
+bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+{
+	/*
+	 * If the reference count was already in the dead zone, then this
+	 * put() operation is imbalanced. Warn, put the reference count back to
+	 * DEAD and tell the caller to not deconstruct the object.
+	 */
+	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+		return false;
+	}
+
+	/*
+	 * This is a put() operation on a saturated refcount. Restore the
+	 * mean saturation value and tell the caller to not deconstruct the
+	 * object.
+	 */
+	if (cnt > FILE_REF_MAXREF)
+		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+	return false;
+}
+
 /**
  * __file_ref_put - Slowpath of file_ref_put()
  * @ref: Pointer to the reference count
@@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
 		return true;
 	}
 
-	/*
-	 * If the reference count was already in the dead zone, then this
-	 * put() operation is imbalanced. Warn, put the reference count back to
-	 * DEAD and tell the caller to not deconstruct the object.
-	 */
-	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
-		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
-		return false;
-	}
-
-	/*
-	 * This is a put() operation on a saturated refcount. Restore the
-	 * mean saturation value and tell the caller to not deconstruct the
-	 * object.
-	 */
-	if (cnt > FILE_REF_MAXREF)
-		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
-	return false;
+	return __file_ref_put_badval(ref, cnt);
 }
 EXPORT_SYMBOL_GPL(__file_ref_put);
 
diff --git a/fs/file_table.c b/fs/file_table.c
index f0291a66f9db..18debb7bd285 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -495,30 +495,36 @@ void flush_delayed_fput(void)
 }
 EXPORT_SYMBOL_GPL(flush_delayed_fput);
 
+static void __fput_deferred(struct file *file)
+{
+	struct task_struct *task = current;
+
+	if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+		file_free(file);
+		return;
+	}
+
+	if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+		init_task_work(&file->f_task_work, ____fput);
+		if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
+			return;
+		/*
+		 * After this task has run exit_task_work(),
+		 * task_work_add() will fail. Fall through to delayed
+		 * fput to avoid leaking *file.
+		 */
+	}
+
+	if (llist_add(&file->f_llist, &delayed_fput_list))
+		schedule_delayed_work(&delayed_fput_work, 1);
+}
+
 void fput(struct file *file)
 {
-	if (file_ref_put(&file->f_ref)) {
-		struct task_struct *task = current;
-
-		if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
-			file_free(file);
-			return;
-		}
-		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
-			init_task_work(&file->f_task_work, ____fput);
-			if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
-				return;
-			/*
-			 * After this task has run exit_task_work(),
-			 * task_work_add() will fail. Fall through to delayed
-			 * fput to avoid leaking *file.
-			 */
-		}
-
-		if (llist_add(&file->f_llist, &delayed_fput_list))
-			schedule_delayed_work(&delayed_fput_work, 1);
-	}
+	if (unlikely(file_ref_put(&file->f_ref)))
+		__fput_deferred(file);
 }
+EXPORT_SYMBOL(fput);
 
 /*
  * synchronous analog of fput(); for kernel threads that might be needed
@@ -533,10 +539,32 @@ void __fput_sync(struct file *file)
 	if (file_ref_put(&file->f_ref))
 		__fput(file);
 }
-
-EXPORT_SYMBOL(fput);
 EXPORT_SYMBOL(__fput_sync);
 
+/*
+ * Equivalent to __fput_sync(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close_sync(struct file *file)
+{
+	if (likely(file_ref_put_close(&file->f_ref)))
+		__fput(file);
+}
+
+/*
+ * Equivalent to fput(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close(struct file *file)
+{
+	if (file_ref_put_close(&file->f_ref))
+		__fput_deferred(file);
+}
+
 void __init files_init(void)
 {
 	struct kmem_cache_args args = {
diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..05c817f39a28 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -118,6 +118,9 @@ static inline void put_file_access(struct file *file)
 	}
 }
 
+void fput_close_sync(struct file *);
+void fput_close(struct file *);
+
 /*
  * super.c
  */
diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h
index 9b3a8d9b17ab..6ef92d765a66 100644
--- a/include/linux/file_ref.h
+++ b/include/linux/file_ref.h
@@ -61,6 +61,7 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
 	atomic_long_set(&ref->refcnt, cnt - 1);
 }
 
+bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt);
 bool __file_ref_put(file_ref_t *ref, unsigned long cnt);
 
 /**
@@ -160,6 +161,39 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
 	return __file_ref_put(ref, cnt);
 }
 
+/**
+ * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF
+ * @ref: Pointer to the reference count
+ *
+ * Semantically it is equivalent to calling file_ref_put(), but it trades lower
+ * performance in face of other CPUs also modifying the refcount for higher
+ * performance when this happens to be the last reference.
+ *
+ * For the last reference file_ref_put() issues 2 atomics. One to drop the
+ * reference and another to transition it to FILE_REF_DEAD. This routine does
+ * the work in one step, but in order to do it has to pre-read the variable which
+ * decreases scalability.
+ *
+ * Use with close() et al, stick to file_ref_put() by default.
+ */
+static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref)
+{
+	long old, new;
+
+	old = atomic_long_read(&ref->refcnt);
+	do {
+		if (unlikely(old < 0))
+			return __file_ref_put_badval(ref, old);
+
+		if (old == FILE_REF_ONEREF)
+			new = FILE_REF_DEAD;
+		else
+			new = old - 1;
+	} while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new));
+
+	return new == FILE_REF_DEAD;
+}
+
 /**
  * file_ref_read - Read the number of file references
  * @ref: Pointer to the reference count

From 3e46a92a27c2927fcef996ba06cbe299da629c28 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 5 Mar 2025 13:36:42 +0100
Subject: [PATCH 3/6] fs: use fput_close_sync() in close()

This bumps open+close rate by 1% on Sapphire Rapids by eliding one
atomic. It would be higher if it was not for several other slowdowns
of the same nature.

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250305123644.554845-3-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 fs/open.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/open.c b/fs/open.c
index a8a5f843e3cf..47615d0e1d9c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1576,7 +1576,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 	 * We're returning to user space. Don't bother
 	 * with any delayed fput() cases.
 	 */
-	__fput_sync(file);
+	fput_close_sync(file);
 
 	if (likely(retval == 0))
 		return 0;

From a914bd93f3edfedcdd59deb615e8dd1b3643cac5 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 5 Mar 2025 13:36:43 +0100
Subject: [PATCH 4/6] fs: use fput_close() in filp_close()

When tracing refcounts seen during a kernel build, this is a wash:

@[kprobe:filp_close]:
[0]                32195 |@@@@@@@@@@                                          |
[1]               164567 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|

I verified the vast majority of the skew comes from do_close_on_exec(),
which could be changed to use a different variant instead.

Even without changing that, the 19.5% of calls which got here can still
save the extra atomic. Calls here are borderline non-existent compared
to fput (over 3.2 mln!), so they should not negatively affect
scalability.

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250305123644.554845-4-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 fs/open.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/open.c b/fs/open.c
index 47615d0e1d9c..39a89df7c37e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1550,7 +1550,7 @@ int filp_close(struct file *filp, fl_owner_t id)
 	int retval;
 
 	retval = filp_flush(filp, id);
-	fput(filp);
+	fput_close(filp);
 
 	return retval;
 }

From 606623de503f4eaa4ca505650bbd2da1f5fb5e6a Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 5 Mar 2025 13:36:44 +0100
Subject: [PATCH 5/6] fs: use fput_close() in path_openat()

This bumps failing open rate by 1.7% on Sapphire Rapids by avoiding one
atomic.

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250305123644.554845-5-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 fs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index 3ab9440c5b93..355a37097769 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3995,7 +3995,7 @@ static struct file *path_openat(struct nameidata *nd,
 		WARN_ON(1);
 		error = -EINVAL;
 	}
-	fput(file);
+	fput_close(file);
 	if (error == -EOPENSTALE) {
 		if (flags & LOOKUP_RCU)
 			error = -ECHILD;

From 5370b43e4bcf60049bfd44db83ba8d2ec43d0152 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 19 Mar 2025 22:58:01 +0100
Subject: [PATCH 6/6] fs: reduce work in fdget_pos()

1. predict the file was found
2. explicitly compare the ref to "one", ignoring the dead zone

The latter arguably improves the behavior to begin with. Suppose the
count turned bad -- the previously used ref routine is going to check
for it and return 0, indicating the count does not necessitate taking
->f_pos_lock. But there very well may be several users. I.e., not
paying for special-casing the dead zone improves semantics.

While here, spell out each condition in a dedicated if statement. This
has no effect on generated code.

Sizes are as follows (in bytes; gcc 13, x86-64):

stock:        321
likely():     298
likely()+ref: 280

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250319215801.1870660-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 fs/file.c                | 11 ++++++++---
 include/linux/file_ref.h | 14 ++++++++++++++
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 3fef798b96e5..134274446bf6 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1183,8 +1183,13 @@ struct fd fdget_raw(unsigned int fd)
  */
 static inline bool file_needs_f_pos_lock(struct file *file)
 {
-	return (file->f_mode & FMODE_ATOMIC_POS) &&
-		(file_count(file) > 1 || file->f_op->iterate_shared);
+	if (!(file->f_mode & FMODE_ATOMIC_POS))
+		return false;
+	if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
+		return true;
+	if (file->f_op->iterate_shared)
+		return true;
+	return false;
 }
 
 struct fd fdget_pos(unsigned int fd)
@@ -1192,7 +1197,7 @@ struct fd fdget_pos(unsigned int fd)
 	struct fd f = fdget(fd);
 	struct file *file = fd_file(f);
 
-	if (file && file_needs_f_pos_lock(file)) {
+	if (likely(file) && file_needs_f_pos_lock(file)) {
 		f.word |= FDPUT_POS_UNLOCK;
 		mutex_lock(&file->f_pos_lock);
 	}
diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h
index 6ef92d765a66..7db62fbc0500 100644
--- a/include/linux/file_ref.h
+++ b/include/linux/file_ref.h
@@ -208,4 +208,18 @@ static inline unsigned long file_ref_read(file_ref_t *ref)
 	return c >= FILE_REF_RELEASED ? 0 : c + 1;
 }
 
+/*
+ * __file_ref_read_raw - Return the value stored in ref->refcnt
+ * @ref: Pointer to the reference count
+ *
+ * Return: The raw value found in the counter
+ *
+ * A hack for file_needs_f_pos_lock(), you probably want to use
+ * file_ref_read() instead.
+ */
+static inline unsigned long __file_ref_read_raw(file_ref_t *ref)
+{
+	return atomic_long_read(&ref->refcnt);
+}
+
 #endif