From 05e65c14ea59a401cec4284e9d612f9d5dc1b3f8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Nov 2025 21:52:01 +0000 Subject: [PATCH 01/61] f2fs: support large folio for immutable non-compressed case This patch enables large folio for limited case where we can get the high-order memory allocation. It supports the encrypted and fsverity files, which are essential for Android environment. How to test: - dd if=/dev/zero of=/mnt/test/test bs=1G count=4 - f2fs_io setflags immutable /mnt/test/test - echo 3 > /proc/sys/vm/drop_caches : to reload inode with large folio - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 43 +++++ fs/f2fs/data.c | 247 +++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 16 ++ fs/f2fs/file.c | 4 + fs/f2fs/inode.c | 6 +- 5 files changed, 306 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index cb90d1ae82d0..9b3b835a174e 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -1033,3 +1033,46 @@ the reserved space back to F2FS for its own use. So, the key idea is, user can do any file operations on /dev/vdc, and reclaim the space after the use, while the space is counted as /data. That doesn't require modifying partition size and filesystem format. + +Per-file Read-Only Large Folio Support +-------------------------------------- + +F2FS implements large folio support on the read path to leverage high-order +page allocation for significant performance gains. To minimize code complexity, +this support is currently excluded from the write path, which requires handling +complex optimizations such as compression and block allocation modes. + +This optional feature is triggered only when a file's immutable bit is set. 
+Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached +file with write permissions, even immediately after clearing the bit. Write +access is only restored once the cached inode is dropped. The usage flow is +demonstrated below: + +.. code-block:: + + # f2fs_io setflags immutable /data/testfile_read_seq + + /* flush and reload the inode to enable the large folio */ + # sync && echo 3 > /proc/sys/vm/drop_caches + + /* mmap(MAP_POPULATE) + mlock() */ + # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq + + /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */ + # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq + + /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */ + # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq + + # f2fs_io clearflags immutable /data/testfile_read_seq + + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq + Failed to open /mnt/test/test: Operation not supported + + /* flush and reload the inode to disable the large folio */ + # sync && echo 3 > /proc/sys/vm/drop_caches + + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq + Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us + + # rm /data/testfile_read_seq diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c30e69392a62..2a6ae274b8de 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -31,9 +31,15 @@ static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *ffs_entry_slab; static mempool_t *bio_post_read_ctx_pool; static struct bio_set f2fs_bioset; +struct f2fs_folio_state { + spinlock_t state_lock; + unsigned int read_pages_pending; +}; + #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE int __init f2fs_init_bioset(void) @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct folio_iter fi; struct bio_post_read_ctx *ctx = bio->bi_private; + unsigned long flags; 
bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; + unsigned nr_pages = fi.length >> PAGE_SHIFT; + bool finished = true; - if (f2fs_is_compressed_page(folio)) { + if (!folio_test_large(folio) && + f2fs_is_compressed_page(folio)) { if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(folio, true, 0, in_task); @@ -150,8 +160,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) continue; } - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); - folio_end_read(folio, bio->bi_status == BLK_STS_OK); + if (folio_test_large(folio)) { + struct f2fs_folio_state *ffs = folio->private; + + spin_lock_irqsave(&ffs->state_lock, flags); + ffs->read_pages_pending -= nr_pages; + finished = !ffs->read_pages_pending; + spin_unlock_irqrestore(&ffs->state_lock, flags); + } + + while (nr_pages--) + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); + + if (finished) + folio_end_read(folio, bio->bi_status == BLK_STS_OK); } if (ctx) @@ -509,6 +531,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { + if (!bio) + return; + WARN_ON_ONCE(!is_read_io(bio_op(bio))); trace_f2fs_submit_read_bio(sbi->sb, type, bio); @@ -1200,11 +1225,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, struct dnode_of_data dn; struct folio *folio; int err; - +retry: folio = f2fs_grab_cache_folio(mapping, index, for_write); if (IS_ERR(folio)) return folio; + if (folio_test_large(folio)) { + pgoff_t folio_index = mapping_align_index(mapping, index); + + f2fs_folio_put(folio, true); + invalidate_inode_pages2_range(mapping, folio_index, + folio_index + folio_nr_pages(folio) - 1); + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + goto retry; + } + if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, @@ -2332,6 +2367,179 
@@ out: } #endif +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio) +{ + struct f2fs_folio_state *ffs = folio->private; + + if (ffs) + return ffs; + + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL); + + spin_lock_init(&ffs->state_lock); + folio_attach_private(folio, ffs); + return ffs; +} + +static void ffs_detach_free(struct folio *folio) +{ + struct f2fs_folio_state *ffs; + + if (!folio_test_large(folio)) { + folio_detach_private(folio); + return; + } + + ffs = folio_detach_private(folio); + if (!ffs) + return; + + WARN_ON_ONCE(ffs->read_pages_pending != 0); + kmem_cache_free(ffs_entry_slab, ffs); +} + +static int f2fs_read_data_large_folio(struct inode *inode, + struct readahead_control *rac, struct folio *folio) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct f2fs_map_blocks map = {0, }; + pgoff_t index, offset; + unsigned max_nr_pages = rac ? readahead_count(rac) : + folio_nr_pages(folio); + unsigned nrpages; + struct f2fs_folio_state *ffs; + int ret = 0; + + if (!IS_IMMUTABLE(inode)) + return -EOPNOTSUPP; + + if (f2fs_compressed_file(inode)) + return -EOPNOTSUPP; + + map.m_seg_type = NO_CHECK_TYPE; + + if (rac) + folio = readahead_folio(rac); +next_folio: + if (!folio) + goto out; + + index = folio->index; + offset = 0; + ffs = NULL; + nrpages = folio_nr_pages(folio); + + for (; nrpages; nrpages--) { + sector_t block_nr; + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & F2FS_MAP_MAPPED) && + index > map.m_lblk && + index < (map.m_lblk + map.m_len)) + goto got_it; + + /* + * Then do more f2fs_map_blocks() calls until we are + * done with this page. 
+ */ + memset(&map, 0, sizeof(map)); + map.m_seg_type = NO_CHECK_TYPE; + map.m_lblk = index; + map.m_len = max_nr_pages; + + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); + if (ret) + goto err_out; +got_it: + if ((map.m_flags & F2FS_MAP_MAPPED)) { + block_nr = map.m_pblk + index - map.m_lblk; + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC_ENHANCE_READ)) { + ret = -EFSCORRUPTED; + goto err_out; + } + } else { + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE); + if (f2fs_need_verity(inode, index) && + !fsverity_verify_page(folio_file_page(folio, + index))) { + ret = -EIO; + goto err_out; + } + continue; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, + last_block_in_bio, block_nr) || + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) { +submit_and_realloc: + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + bio = NULL; + } + if (bio == NULL) + bio = f2fs_grab_read_bio(inode, block_nr, + max_nr_pages, + f2fs_ra_op_flags(rac), + index, false); + + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. + */ + f2fs_wait_on_block_writeback(inode, block_nr); + + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE, + offset << PAGE_SHIFT)) + goto submit_and_realloc; + + if (folio_test_large(folio)) { + ffs = ffs_find_or_alloc(folio); + + /* set the bitmap to wait */ + spin_lock_irq(&ffs->state_lock); + ffs->read_pages_pending++; + spin_unlock_irq(&ffs->state_lock); + } + + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); + last_block_in_bio = block_nr; + index++; + offset++; + } + if (rac) { + folio = readahead_folio(rac); + goto next_folio; + } +err_out: + /* Nothing was submitted. 
*/ + if (!bio) { + if (!ret) + folio_mark_uptodate(folio); + folio_unlock(folio); + return ret; + } + + if (ret) { + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + + /* Wait bios and clear uptodate. */ + folio_lock(folio); + folio_clear_uptodate(folio); + folio_unlock(folio); + } +out: + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + return ret; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -2357,9 +2565,13 @@ static int f2fs_mpage_readpages(struct inode *inode, pgoff_t index; #endif unsigned nr_pages = rac ? readahead_count(rac) : 1; + struct address_space *mapping = rac ? rac->mapping : folio->mapping; unsigned max_nr_pages = nr_pages; int ret = 0; + if (mapping_large_folio_support(mapping)) + return f2fs_read_data_large_folio(inode, rac, folio); + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { index = rac ? readahead_index(rac) : folio->index; @@ -2450,8 +2662,7 @@ next_page: } #endif } - if (bio) - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } @@ -3735,7 +3946,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) f2fs_remove_dirty_inode(inode); } } - folio_detach_private(folio); + + if (offset || length != folio_size(folio)) + return; + + folio_cancel_dirty(folio); + ffs_detach_free(folio); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) @@ -3744,7 +3960,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - folio_detach_private(folio); + ffs_detach_free(folio); return true; } @@ -4150,12 +4366,25 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - return bio_entry_slab ? 
0 : -ENOMEM; + + if (!bio_entry_slab) + return -ENOMEM; + + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab", + sizeof(struct f2fs_folio_state)); + + if (!ffs_entry_slab) { + kmem_cache_destroy(bio_entry_slab); + return -ENOMEM; + } + + return 0; } void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(ffs_entry_slab); } static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20edbb99b814..53cbce96f126 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4917,6 +4917,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_quota_file(struct inode *inode) +{ +#ifdef CONFIG_QUOTA + int i; + + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode))) + return false; + + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino) + return true; + } +#endif + return false; +} + static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7047ca6b98d..e75e61ac50d7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; + if (mapping_large_folio_support(inode->i_mapping) && + filp->f_mode & FMODE_WRITE) + return -EOPNOTSUPP; + err = fsverity_file_open(inode, filp); if (err) return err; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 38b8994bc1b2..921fb02c0f49 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (ret) goto bad_inode; make_now: + f2fs_set_inode_flags(inode); + if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); @@ -618,6 +620,9 @@ 
make_now: inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) && + !f2fs_quota_file(inode)) + mapping_set_folio_min_order(inode->i_mapping, 0); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; @@ -638,7 +643,6 @@ make_now: ret = -EIO; goto bad_inode; } - f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); From 903c6e95bc9a4a3556d37e727853fc0ffb7f3acb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Nov 2025 22:00:37 +0000 Subject: [PATCH 02/61] f2fs: add a tracepoint to see large folio read submission For example, 1327.539878: f2fs_preload_pages_start: dev = (252,16), ino = 14, i_size = 4294967296 start: 0, end: 8191 1327.539878: page_cache_sync_ra: dev=252:16 ino=e index=0 req_count=8192 order=9 size=0 async_size=0 ra_pages=4096 mmap_miss=0 prev_pos=-1 1327.539879: page_cache_ra_order: dev=252:16 ino=e index=0 order=9 size=4096 async_size=2048 ra_pages=4096 1327.541895: f2fs_readpages: dev = (252,16), ino = 14, start = 0 nrpage = 4096 1327.541930: f2fs_lookup_extent_tree_start: dev = (252,16), ino = 14, pgofs = 0, type = Read 1327.541931: f2fs_lookup_read_extent_tree_end: dev = (252,16), ino = 14, pgofs = 0, read_ext_info(fofs: 0, len: 1048576, blk: 4221440) 1327.541931: f2fs_map_blocks: dev = (252,16), ino = 14, file offset = 0, start blkaddr = 0x406a00, len = 0x1000, flags = 2, seg_type = 8, may_create = 0, multidevice = 0, flag = 0, err = 0 1327.541989: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 0, nr_pages = 512, dirty = 0, uptodate = 0 1327.542012: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 512, nr_pages = 512, dirty = 0, uptodate = 0 1327.542036: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 1024, nr_pages = 512, dirty = 0, uptodate = 0 1327.542080: f2fs_read_folio: dev = 
(252,16), ino = 14, DATA, FILE, index = 1536, nr_pages = 512, dirty = 0, uptodate = 0 1327.542127: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 2048, nr_pages = 512, dirty = 0, uptodate = 0 1327.542151: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 2560, nr_pages = 512, dirty = 0, uptodate = 0 1327.542196: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 3072, nr_pages = 512, dirty = 0, uptodate = 0 1327.542219: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 3584, nr_pages = 512, dirty = 0, uptodate = 0 1327.542239: f2fs_submit_read_bio: dev = (252,16)/(252,16), rw = READ(R), DATA, sector = 33771520, size = 16777216 1327.542269: page_cache_sync_ra: dev=252:16 ino=e index=4096 req_count=8192 order=9 size=4096 async_size=2048 ra_pages=4096 mmap_miss=0 prev_pos=-1 1327.542289: page_cache_ra_order: dev=252:16 ino=e index=4096 order=9 size=4096 async_size=2048 ra_pages=4096 1327.544485: f2fs_readpages: dev = (252,16), ino = 14, start = 4096 nrpage = 4096 1327.544521: f2fs_lookup_extent_tree_start: dev = (252,16), ino = 14, pgofs = 4096, type = Read 1327.544521: f2fs_lookup_read_extent_tree_end: dev = (252,16), ino = 14, pgofs = 4096, read_ext_info(fofs: 0, len: 1048576, blk: 4221440) 1327.544522: f2fs_map_blocks: dev = (252,16), ino = 14, file offset = 4096, start blkaddr = 0x407a00, len = 0x1000, flags = 2, seg_type = 8, may_create = 0, multidevice = 0, flag = 0, err = 0 1327.544550: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 4096, nr_pages = 512, dirty = 0, uptodate = 0 1327.544575: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 4608, nr_pages = 512, dirty = 0, uptodate = 0 1327.544601: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 5120, nr_pages = 512, dirty = 0, uptodate = 0 1327.544647: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 5632, nr_pages = 512, dirty = 0, uptodate = 0 1327.544692: f2fs_read_folio: dev = (252,16), ino = 14, 
DATA, FILE, index = 6144, nr_pages = 512, dirty = 0, uptodate = 0 1327.544734: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 6656, nr_pages = 512, dirty = 0, uptodate = 0 1327.544777: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 7168, nr_pages = 512, dirty = 0, uptodate = 0 1327.544805: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 7680, nr_pages = 512, dirty = 0, uptodate = 0 1327.544826: f2fs_submit_read_bio: dev = (252,16)/(252,16), rw = READ(R), DATA, sector = 33804288, size = 16777216 1327.544852: f2fs_preload_pages_end: dev = (252,16), ino = 14, i_size = 4294967296 start: 8192, end: 8191 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + include/trace/events/f2fs.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2a6ae274b8de..12bf4b6e0075 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2514,6 +2514,7 @@ submit_and_realloc: index++; offset++; } + trace_f2fs_read_folio(folio, DATA); if (rac) { folio = readahead_folio(rac); goto next_folio; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index df4017dcc701..635dcabcf1c7 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1358,6 +1358,7 @@ DECLARE_EVENT_CLASS(f2fs__folio, __field(int, type) __field(int, dir) __field(pgoff_t, index) + __field(pgoff_t, nrpages) __field(int, dirty) __field(int, uptodate) ), @@ -1368,16 +1369,18 @@ DECLARE_EVENT_CLASS(f2fs__folio, __entry->type = type; __entry->dir = S_ISDIR(folio->mapping->host->i_mode); __entry->index = folio->index; + __entry->nrpages= folio_nr_pages(folio); __entry->dirty = folio_test_dirty(folio); __entry->uptodate = folio_test_uptodate(folio); ), - TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, " + TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, nr_pages = %lu, " "dirty = %d, uptodate = %d", show_dev_ino(__entry), show_block_type(__entry->type), 
show_file_type(__entry->dir), (unsigned long)__entry->index, + (unsigned long)__entry->nrpages, __entry->dirty, __entry->uptodate) ); @@ -1403,6 +1406,13 @@ DEFINE_EVENT(f2fs__folio, f2fs_readpage, TP_ARGS(folio, type) ); +DEFINE_EVENT(f2fs__folio, f2fs_read_folio, + + TP_PROTO(struct folio *folio, int type), + + TP_ARGS(folio, type) +); + DEFINE_EVENT(f2fs__folio, f2fs_set_page_dirty, TP_PROTO(struct folio *folio, int type), From 4a210a5be279bfd5514dac3f5ef2c737cd984e84 Mon Sep 17 00:00:00 2001 From: Joanne Chang Date: Fri, 12 Dec 2025 08:40:34 +0000 Subject: [PATCH 03/61] f2fs: improve check for enough free sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check for enough sections in segment.h has the following issues: 1. has_not_enough_free_secs() should return "enough secs" when "free_secs >= upper_secs", not just strictly greater. Conversely, it should only return "not enough secs" when "free_secs < lower_secs", not when they are equal. This accounts for the possibility that blocks can fit within curseg without requiring an additional free section. 2. __get_secs_required() currently separates the needed space to section and block parts, checking them against free sections and curseg, respectively. This does not consider the case where curseg cannot hold the whole block part, but excess free sections beyond the section part can accommodate some of the block part. 3. has_curseg_enough_space() only checks CURSEG_HOT_DATA for dentry blocks, but when active_logs=6, they may be placed in WARM and COLD sections. Also, the current logic does not consider that dentry and data blocks can be put in the same section when active_logs=2 or 6. This patch modifies the three functions to address the above issues: 1. Rename has_curseg_enough_space() to get_additional_blocks_required(). Calculate the minimum node, dentry, and data blocks curseg can accommodate.
Then subtract these from the total required blocks of respective type to determine the worst-case number of blocks that must be placed in free sections. 2. In __get_secs_required(), get the number of blocks needing new sections from the new get_additional_blocks_required(). Return the upper bound of necessary free sections for these blocks. For active_logs=2 or 6, dentry blocks are combined with data blocks. 3. In has_not_enough_free_secs(), get the required sections from __get_secs_required(), and return “not enough secs” if “free_secs < required_secs”. Signed-off-by: Joanne Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.h | 99 ++++++++++++++++++++++------------------------- 2 files changed, 47 insertions(+), 54 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 384fa7e2085b..6afd57fa5387 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2000,7 +2000,7 @@ retry: goto stop; } - __get_secs_required(sbi, NULL, &upper_secs, NULL); + upper_secs = __get_secs_required(sbi); /* * Write checkpoint to reclaim prefree segments. 
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 07dcbcbeb7c6..20daaccb34a5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -621,97 +621,90 @@ static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, - unsigned int node_blocks, unsigned int data_blocks, - unsigned int dent_blocks) +static inline void get_additional_blocks_required(struct f2fs_sb_info *sbi, + unsigned int *total_node_blocks, unsigned int *total_data_blocks, + unsigned int *total_dent_blocks, bool separate_dent) { - unsigned int segno, left_blocks, blocks; + unsigned int segno, left_blocks; int i; + unsigned int min_free_node_blocks = CAP_BLKS_PER_SEC(sbi); + unsigned int min_free_dent_blocks = CAP_BLKS_PER_SEC(sbi); + unsigned int min_free_data_blocks = CAP_BLKS_PER_SEC(sbi); /* check current data/node sections in the worst case. */ for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; if (unlikely(segno == NULL_SEGNO)) - return false; + return; left_blocks = get_left_section_blocks(sbi, i, segno); - blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks; - if (blocks > left_blocks) - return false; + if (i > CURSEG_COLD_DATA) + min_free_node_blocks = min(min_free_node_blocks, left_blocks); + else if (i == CURSEG_HOT_DATA && separate_dent) + min_free_dent_blocks = left_blocks; + else + min_free_data_blocks = min(min_free_data_blocks, left_blocks); } - /* check current data section for dentry blocks. */ - segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; - - if (unlikely(segno == NULL_SEGNO)) - return false; - - left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno); - - if (dent_blocks > left_blocks) - return false; - return true; + *total_node_blocks = (*total_node_blocks > min_free_node_blocks) ? 
+ *total_node_blocks - min_free_node_blocks : 0; + *total_dent_blocks = (*total_dent_blocks > min_free_dent_blocks) ? + *total_dent_blocks - min_free_dent_blocks : 0; + *total_data_blocks = (*total_data_blocks > min_free_data_blocks) ? + *total_data_blocks - min_free_data_blocks : 0; } /* - * calculate needed sections for dirty node/dentry and call - * has_curseg_enough_space, please note that, it needs to account - * dirty data as well in lfs mode when checkpoint is disabled. + * call get_additional_blocks_required to calculate dirty blocks + * needing to be placed in free sections, please note that, it + * needs to account dirty data as well in lfs mode when checkpoint + * is disabled. */ -static inline void __get_secs_required(struct f2fs_sb_info *sbi, - unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) +static inline int __get_secs_required(struct f2fs_sb_info *sbi) { unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int total_data_blocks = 0; - unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); - unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); - unsigned int data_secs = 0; - unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); - unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); - unsigned int data_blocks = 0; + bool separate_dent = true; - if (f2fs_lfs_mode(sbi)) { + if (f2fs_lfs_mode(sbi)) total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); - data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); - data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); + + /* + * When active_logs != 4, dentry blocks and data blocks can be + * mixed in the same logs, so check their space together. 
+ */ + if (F2FS_OPTION(sbi).active_logs != 4) { + total_data_blocks += total_dent_blocks; + total_dent_blocks = 0; + separate_dent = false; } - if (lower_p) - *lower_p = node_secs + dent_secs + data_secs; - if (upper_p) - *upper_p = node_secs + dent_secs + data_secs + - (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + - (data_blocks ? 1 : 0); - if (curseg_p) - *curseg_p = has_curseg_enough_space(sbi, - node_blocks, data_blocks, dent_blocks); + get_additional_blocks_required(sbi, &total_node_blocks, &total_dent_blocks, + &total_data_blocks, separate_dent); + + return DIV_ROUND_UP(total_node_blocks, CAP_BLKS_PER_SEC(sbi)) + + DIV_ROUND_UP(total_dent_blocks, CAP_BLKS_PER_SEC(sbi)) + + DIV_ROUND_UP(total_data_blocks, CAP_BLKS_PER_SEC(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - unsigned int free_secs, lower_secs, upper_secs; - bool curseg_space; + unsigned int free_secs, required_secs; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); - free_secs = free_sections(sbi) + freed; - lower_secs += needed + reserved_sections(sbi); - upper_secs += needed + reserved_sections(sbi); + required_secs = needed + reserved_sections(sbi) + + __get_secs_required(sbi); - if (free_secs > upper_secs) - return false; - if (free_secs <= lower_secs) - return true; - return !curseg_space; + return free_secs < required_secs; } static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, From 3250bd41d95ccdaa157ad5f128c1596e353ee7e0 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Wed, 17 Dec 2025 15:45:29 +0800 Subject: [PATCH 04/61] f2fs: remove some redundant codes in f2fs_quota_enable 1. qf_inum has been got and checked in its caller f2fs_enable_quotas 2. f2fs_sb_has_quota_ino has been checked in all its callers 3.
use sbi cleanup F2FS_SB(sb) Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c4c225e09dc4..036ba9cc799c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3222,19 +3222,12 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) } static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, - unsigned int flags) + unsigned int flags, unsigned long qf_inum) { struct inode *qf_inode; - unsigned long qf_inum; unsigned long qf_flag = F2FS_QUOTA_DEFAULT_FL; int err; - BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); - - qf_inum = f2fs_qf_ino(sb, type); - if (!qf_inum) - return -EPERM; - qf_inode = f2fs_iget(sb, qf_inum); if (IS_ERR(qf_inode)) { f2fs_err(F2FS_SB(sb), "Bad quota inode %u:%lu", type, qf_inum); @@ -3267,7 +3260,7 @@ static int f2fs_enable_quotas(struct super_block *sb) test_opt(sbi, PRJQUOTA), }; - if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) { f2fs_err(sbi, "quota file may be corrupted, skip loading it"); return 0; } @@ -3279,14 +3272,13 @@ static int f2fs_enable_quotas(struct super_block *sb) if (qf_inum) { err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | - (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0), qf_inum); if (err) { f2fs_err(sbi, "Failed to enable quota tracking (type=%d, err=%d). 
Please run fsck to fix.", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); - set_sbi_flag(F2FS_SB(sb), - SBI_QUOTA_NEED_REPAIR); + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); return err; } } From 3cb396a2c7905c3daed0b6b2c5806a95386f4581 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 Dec 2025 10:51:04 +0800 Subject: [PATCH 05/61] f2fs: fix to do sanity check on nat entry of quota inode As Zhiguo reported, nat entry of quota inode could be corrupted: "ino/block_addr=NULL_ADDR in nid=4 entry" We'd better to do sanity check on quota inode to detect and record nat.blk_addr inconsistency, so that we can have a chance to repair it w/ later fsck. Reported-by: Zhiguo Niu Signed-off-by: Chao Yu Reviewed-by: Zhiguo Niu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 11 +++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 53cbce96f126..291a694fdaf0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4917,16 +4917,16 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } -static inline bool f2fs_quota_file(struct inode *inode) +static inline bool f2fs_quota_file(struct f2fs_sb_info *sbi, nid_t ino) { #ifdef CONFIG_QUOTA int i; - if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode))) + if (!f2fs_sb_has_quota_ino(sbi)) return false; for (i = 0; i < MAXQUOTAS; i++) { - if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino) + if (f2fs_qf_ino(sbi->sb, i) == ino) return true; } #endif diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 921fb02c0f49..d1270b25ad7d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -621,7 +621,7 @@ make_now: inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) && - !f2fs_quota_file(inode)) + !f2fs_quota_file(sbi, inode->i_ino)) mapping_set_folio_min_order(inode->i_mapping, 0); } else if 
(S_ISDIR(inode->i_mode)) { inode->i_op = &f2fs_dir_inode_operations; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 482a362f2625..3a80da524739 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -643,6 +643,17 @@ sanity_check: return -EFSCORRUPTED; } + if (unlikely(f2fs_quota_file(sbi, ni->nid) && + !__is_valid_data_blkaddr(ni->blk_addr))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "f2fs_get_node_info of %pS: inconsistent nat entry from qf_ino, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + __builtin_return_address(0), + ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + } + /* cache nat entry */ if (need_cache) cache_nat_entry(sbi, nid, &ne); From 761dac9073cd67d4705a94cd1af674945a117f4c Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Fri, 26 Dec 2025 10:56:04 +0800 Subject: [PATCH 06/61] f2fs: fix to add gc count stat in f2fs_gc_range It missed the stat count in f2fs_gc_range. Cc: stable@kernel.org Fixes: 9bf1dcbdfdc8 ("f2fs: fix to account gc stats correctly") Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6afd57fa5387..58b291d19f06 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2096,6 +2096,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + stat_inc_gc_call_count(sbi, FOREGROUND); for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) { struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), From 86c1cf0578c59c8e68185d86d03be846bcaef0e2 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Sat, 27 Dec 2025 17:19:06 +0800 Subject: [PATCH 07/61] f2fs: clean up the force parameter in __submit_merged_write_cond() The force parameter in __submit_merged_write_cond is redundant, where `force == true` implies `inode == NULL && folio == NULL && ino == 0` is true, and `force == 
false` implies `inode != NULL || folio != NULL || ino != 0` is true. Thus, this patch replaces the force parameter with a stack variable force. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 12bf4b6e0075..d4ef26beadbc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -664,10 +664,11 @@ unlock_out: static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, - nid_t ino, enum page_type type, bool force) + nid_t ino, enum page_type type) { enum temp_type temp; bool ret = true; + bool force = !inode && !folio && !ino; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { if (!force) { @@ -689,14 +690,14 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true); + __submit_merged_write_cond(sbi, NULL, NULL, 0, type); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, folio, ino, type, false); + __submit_merged_write_cond(sbi, inode, folio, ino, type); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) From db1a8a7813f74968f79bd510fd5f0ae866bf8efd Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Sat, 27 Dec 2025 17:46:10 +0800 Subject: [PATCH 08/61] f2fs: return immediately after submitting the specified folio in __submit_merged_write_cond f2fs_folio_wait_writeback ensures the folio write is submitted to the block layer via __submit_merged_write_cond, then waits for the folio to complete. Other I/O submissions are irrelevant to f2fs_folio_wait_writeback. Thus, if the folio write bio is already submitted, the function can return immediately. 
This patch adds a writeback parameter to __submit_merged_write_cond(), which signals an immediate return after submitting the target folio, and the wait-for-writeback path can use this parameter. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 ++++++++++++++++++---- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 2 +- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d4ef26beadbc..471e52c6c1e0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -664,7 +664,7 @@ unlock_out: static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, - nid_t ino, enum page_type type) + nid_t ino, enum page_type type, bool writeback) { enum temp_type temp; bool ret = true; @@ -679,8 +679,16 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, ret = __has_merged_page(io->bio, inode, folio, ino); f2fs_up_read(&io->io_rwsem); } - if (ret) + if (ret) { __f2fs_submit_merged_write(sbi, type, temp); + /* + * For the waiting-writeback case, if the bio owned by the + * folio is already submitted, we do not need to submit + * other types of bios. + */ + if (writeback) + break; + } /* TODO: use HOT temp only for meta pages now.
*/ if (type >= META) @@ -690,14 +698,20 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __submit_merged_write_cond(sbi, NULL, NULL, 0, type); + __submit_merged_write_cond(sbi, NULL, NULL, 0, type, false); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, folio, ino, type); + __submit_merged_write_cond(sbi, inode, folio, ino, type, false); +} + +void f2fs_submit_merged_write_folio(struct f2fs_sb_info *sbi, + struct folio *folio, enum page_type type) +{ + __submit_merged_write_cond(sbi, NULL, folio, 0, type, true); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 291a694fdaf0..5d81de7cee70 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4052,6 +4052,8 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, nid_t ino, enum page_type type); +void f2fs_submit_merged_write_folio(struct f2fs_sb_info *sbi, + struct folio *folio, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct folio *folio); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c26424f47686..c0c5b7075b04 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4240,7 +4240,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type); + f2fs_submit_merged_write_folio(sbi, folio, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { From 9609dd704725a40cd63d915f2ab6c44248a44598 Mon Sep 
17 00:00:00 2001 From: Yongpeng Yang Date: Sat, 27 Dec 2025 15:34:31 +0800 Subject: [PATCH 09/61] f2fs: remove non-uptodate folio from the page cache in move_data_block During data movement, move_data_block acquires a file folio without triggering a file read. Such folios are typically not uptodate, so they need to be removed from the page cache after GC completes. This patch marks folio with the PG_dropbehind flag and uses folio_end_dropbehind to remove folio from the page cache. Signed-off-by: Yunlei He Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 58b291d19f06..d889f7d9a70f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1453,7 +1453,11 @@ up_out: put_out: f2fs_put_dnode(&dn); out: - f2fs_folio_put(folio, true); + if (!folio_test_uptodate(folio)) + __folio_set_dropbehind(folio); + folio_unlock(folio); + folio_end_dropbehind(folio); + folio_put(folio); return err; } From 572b1c6f2ade7afe687a385caccb717081ada070 Mon Sep 17 00:00:00 2001 From: ZhaoYueNan Date: Tue, 30 Dec 2025 14:26:48 +0800 Subject: [PATCH 10/61] f2fs: Update the default value of the documentation ckpt_thread_ioprio The commit 8a2d9f00d has been updated to set its default value to "rt,3", fixing the outdated default value in the F2FS documentation. Signed-off-by: ZhaoYueNan Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 770470e0598b..c39a85e84b6b 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -520,7 +520,7 @@ What: /sys/fs/f2fs//ckpt_thread_ioprio Date: January 2021 Contact: "Daeho Jeong" Description: Give a way to change checkpoint merge daemon's io priority.
- Its default value is "be,3", which means "BE" I/O class and + Its default value is "rt,3", which means "RT" I/O class and I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. From 7ec199117c32543e0fa8787a6eedd9126523a8d4 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 30 Dec 2025 09:38:45 -0800 Subject: [PATCH 11/61] f2fs: flush plug periodically during GC to maximize readahead effect During the garbage collection process, F2FS submits readahead I/Os for valid blocks. However, since the GC loop runs within a single plug scope without intermediate flushing, these readahead I/Os often accumulate in the block layer's plug list instead of being dispatched to the device immediately. Consequently, when the GC thread attempts to lock the page later, the I/O might not have completed (or even started), leading to a "read try and wait" scenario. This negates the benefit of readahead and causes unnecessary delays in GC latency. This patch addresses this issue by introducing an intermediate blk_finish_plug() and blk_start_plug() pair within the GC loop. This forces the dispatch of pending I/Os, ensuring that readahead pages are fetched in time, thereby reducing GC latency. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d889f7d9a70f..ba66d8bc9b5f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1031,7 +1031,8 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * ignore that. 
*/ static int gc_node_segment(struct f2fs_sb_info *sbi, - struct f2fs_summary *sum, unsigned int segno, int gc_type) + struct f2fs_summary *sum, unsigned int segno, int gc_type, + struct blk_plug *plug) { struct f2fs_summary *entry; block_t start_addr; @@ -1100,8 +1101,11 @@ next_step: stat_inc_node_blk_count(sbi, 1, gc_type); } - if (++phase < 3) + if (++phase < 3) { + blk_finish_plug(plug); + blk_start_plug(plug); goto next_step; + } if (fggc) atomic_dec(&sbi->wb_sync_req[NODE]); @@ -1539,7 +1543,7 @@ out: */ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type, - bool force_migrate) + bool force_migrate, struct blk_plug *plug) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -1707,8 +1711,11 @@ next_step: } } - if (++phase < 5) + if (++phase < 5) { + blk_finish_plug(plug); + blk_start_plug(plug); goto next_step; + } return submitted; } @@ -1857,11 +1864,11 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, */ if (type == SUM_TYPE_NODE) submitted += gc_node_segment(sbi, sum->entries, - cur_segno, gc_type); + cur_segno, gc_type, &plug); else submitted += gc_data_segment(sbi, sum->entries, gc_list, cur_segno, - gc_type, force_migrate); + gc_type, force_migrate, &plug); stat_inc_gc_seg_count(sbi, data_type, gc_type); sbi->gc_reclaimed_segs[sbi->gc_mode]++; From 79b3cebc70fcadf914d3ad1ae59d59cc62a47c46 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:16 +0800 Subject: [PATCH 12/61] f2fs: add lock elapsed time trace facility for f2fs rwsemphore This patch adds lock elapsed time trace facility for f2fs rwsemphore. 
If total elapsed time of critical region covered by lock exceeds a threshold, it will print tracepoint to dump information of lock related context, including: - thread information - CPU/IO priority - lock information - elapsed time - total time - running time (depend on CONFIG_64BIT) - runnable time (depend on CONFIG_SCHED_INFO and CONFIG_SCHEDSTATS) - io sleep time (depend on CONFIG_TASK_DELAY_ACCT and /proc/sys/kernel/task_delayacct) - other time (by default other time will account nonio sleep time, but, if above kconfig is not defined, other time will include runnable time and/or io sleep time as well) output: f2fs_lock_elapsed_time: dev = (254,52), comm: sh, pid: 13855, prio: 120, ioprio_class: 2, ioprio_data: 4, lock_name: cp_rwsem, lock_type: rlock, total: 1000, running: 993, runnable: 2, io_sleep: 0, other: 5 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 106 ++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 51 +++++++++++++++-- include/trace/events/f2fs.h | 68 +++++++++++++++++++++++ 3 files changed, 221 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 300664269eb6..bc6058a3122b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include "f2fs.h" #include "node.h" @@ -21,6 +24,109 @@ #include "iostat.h" #include +static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts) +{ + ts->total_time = ktime_get(); +#ifdef CONFIG_64BIT + ts->running_time = current->se.sum_exec_runtime; +#endif +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS) + ts->runnable_time = current->sched_info.run_delay; +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + if (current->delays) + ts->io_sleep_time = current->delays->blkio_delay; +#endif +} + +static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc) +{ + lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled(); +
if (!lc->lock_trace) + return; + + get_lock_elapsed_time(&lc->ts); +} + +static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc, bool is_write) +{ + struct f2fs_time_stat tts; + unsigned long long total_time; + unsigned long long running_time = 0; + unsigned long long runnable_time = 0; + unsigned long long io_sleep_time = 0; + unsigned long long other_time = 0; + unsigned npm = NSEC_PER_MSEC; + + if (!lc->lock_trace) + return; + + get_lock_elapsed_time(&tts); + + total_time = div_u64(tts.total_time - lc->ts.total_time, npm); + if (total_time <= MAX_LOCK_ELAPSED_TIME) + return; + +#ifdef CONFIG_64BIT + running_time = div_u64(tts.running_time - lc->ts.running_time, npm); +#endif +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS) + runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm); +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm); +#endif + if (total_time > running_time + io_sleep_time + runnable_time) + other_time = total_time - running_time - + io_sleep_time - runnable_time; + + trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current, + get_current_ioprio(), total_time, running_time, + runnable_time, io_sleep_time, other_time); +} + +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + f2fs_down_read(sem); + trace_lock_elapsed_time_start(sem, lc); +} + +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + if (!f2fs_down_read_trylock(sem)) + return 0; + trace_lock_elapsed_time_start(sem, lc); + return 1; +} + +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + f2fs_up_read(sem); + trace_lock_elapsed_time_end(sem, lc, false); +} + +void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + f2fs_down_write(sem); + trace_lock_elapsed_time_start(sem, lc); +} + +int 
f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + if (!f2fs_down_write_trylock(sem)) + return 0; + trace_lock_elapsed_time_start(sem, lc); + return 1; +} + +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + f2fs_up_write(sem); + trace_lock_elapsed_time_end(sem, lc, true); +} + #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) static struct kmem_cache *ino_entry_slab; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5d81de7cee70..fc3f532972ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -173,6 +173,10 @@ enum device_allocation_policy { ALLOCATE_FORWARD_FROM_HINT, }; +enum f2fs_lock_name { + LOCK_NAME_NONE, +}; + /* * An implementation of an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock @@ -181,6 +185,8 @@ enum device_allocation_policy { */ struct f2fs_rwsem { + struct f2fs_sb_info *sbi; + enum f2fs_lock_name name; struct rw_semaphore internal_rwsem; #ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_queue_head_t read_waiters; @@ -1409,6 +1415,24 @@ struct f2fs_gc_control { unsigned int nr_free_secs; /* # of free sections to do GC */ }; +struct f2fs_time_stat { + unsigned long long total_time; /* total wall clock time */ +#ifdef CONFIG_64BIT + unsigned long long running_time; /* running time */ +#endif +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS) + unsigned long long runnable_time; /* runnable(including preempted) time */ +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + unsigned long long io_sleep_time; /* IO sleep time */ +#endif +}; + +struct f2fs_lock_context { + struct f2fs_time_stat ts; + bool lock_trace; +}; + /* * For s_flag in struct f2fs_sb_info * Modification on enum should be synchronized with s_flag array @@ -1525,6 +1549,9 @@ enum f2fs_lookup_mode { LOOKUP_AUTO, }; +/* a threshold of maximum elapsed time in critical region to print tracepoint */ +#define 
MAX_LOCK_ELAPSED_TIME 500 + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -2263,16 +2290,22 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } -#define init_f2fs_rwsem(sem) \ +#define init_f2fs_rwsem(sem) __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE) +#define init_f2fs_rwsem_trace __init_f2fs_rwsem + +#define __init_f2fs_rwsem(sem, sbi, name) \ do { \ static struct lock_class_key __key; \ \ - __init_f2fs_rwsem((sem), #sem, &__key); \ + do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name); \ } while (0) -static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, - const char *sem_name, struct lock_class_key *key) +static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key, + struct f2fs_sb_info *sbi, enum f2fs_lock_name name) { + sem->sbi = sbi; + sem->name = name; __init_rwsem(&sem->internal_rwsem, sem_name, key); #ifdef CONFIG_F2FS_UNFAIR_RWSEM init_waitqueue_head(&sem->read_waiters); @@ -2341,6 +2374,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem) #endif } +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc); +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); +void f2fs_down_write_trace(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc); +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc); +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); + static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { unsigned long flags; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 635dcabcf1c7..9a852a16df9c 100644 --- 
a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -184,6 +184,10 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); { CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" }, \ { CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" }) +#define show_lock_name(lock) \ + __print_symbolic(lock, \ + { LOCK_NAME_NONE, "none" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -2452,6 +2456,70 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, TP_ARGS(inode, offset, bytes) ); +TRACE_EVENT(f2fs_lock_elapsed_time, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int ioprio, + unsigned long long total_time, + unsigned long long running_time, + unsigned long long runnable_time, + unsigned long long io_sleep_time, + unsigned long long other_time), + + TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time, + runnable_time, io_sleep_time, other_time), + + TP_STRUCT__entry( + __field(dev_t, dev) + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, prio) + __field(int, ioprio_class) + __field(int, ioprio_data) + __field(unsigned int, lock_name) + __field(bool, is_write) + __field(unsigned long long, total_time) + __field(unsigned long long, running_time) + __field(unsigned long long, runnable_time) + __field(unsigned long long, io_sleep_time) + __field(unsigned long long, other_time) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + __entry->ioprio_data = IOPRIO_PRIO_DATA(ioprio); + __entry->lock_name = lock_name; + __entry->is_write = is_write; + __entry->total_time = total_time; + __entry->running_time = running_time; + __entry->runnable_time = runnable_time; + __entry->io_sleep_time = io_sleep_time; + __entry->other_time = other_time; + ), + + TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, " + 
"ioprio_class: %d, ioprio_data: %d, lock_name: %s, " + "lock_type: %s, total: %llu, running: %llu, " + "runnable: %llu, io_sleep: %llu, other: %llu", + show_dev(__entry->dev), + __entry->comm, + __entry->pid, + __entry->prio, + __entry->ioprio_class, + __entry->ioprio_data, + show_lock_name(__entry->lock_name), + __entry->is_write ? "wlock" : "rlock", + __entry->total_time, + __entry->running_time, + __entry->runnable_time, + __entry->io_sleep_time, + __entry->other_time) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From e4b75621fc439399b94c4265cb54d2bda1177397 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:17 +0800 Subject: [PATCH 13/61] f2fs: sysfs: introduce max_lock_elapsed_time This patch add a new sysfs node in /sys/fs/f2fs//max_lock_elapsed_time. This is a threshold, once a thread enters critical region that lock covers, total elapsed time exceeds this threshold, f2fs will print tracepoint to dump information of related context. This sysfs entry can be used to control the value of threshold, by default, the value is 500 ms. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index c39a85e84b6b..648ddd0d59f6 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -939,3 +939,11 @@ Description: Controls write priority in multi-devices setups. 
A value of 0 means allocate_section_policy = 1 Prioritize writing to section before allocate_section_hint allocate_section_policy = 2 Prioritize writing to section after allocate_section_hint =========================== ========================================================== + +What: /sys/fs/f2fs//max_lock_elapsed_time +Date: December 2025 +Contact: "Chao Yu" +Description: This is a threshold, once a thread enters critical region that lock covers, total + elapsed time exceeds this threshold, f2fs will print tracepoint to dump information + of related context. This sysfs entry can be used to control the value of threshold, + by default, the value is 500 ms. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bc6058a3122b..61bcf227d8ca 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -66,7 +66,7 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem, get_lock_elapsed_time(&tts); total_time = div_u64(tts.total_time - lc->ts.total_time, npm); - if (total_time <= MAX_LOCK_ELAPSED_TIME) + if (total_time <= sem->sbi->max_lock_elapsed_time) return; #ifdef CONFIG_64BIT diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc3f532972ed..7f2675ed7f86 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1951,6 +1951,9 @@ struct f2fs_sb_info { /* carve out reserved_blocks from total blocks */ bool carve_out; + /* max elapsed time threshold in critical region that lock covered */ + unsigned long long max_lock_elapsed_time; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 036ba9cc799c..8fe1ac8a609c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4295,6 +4295,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); + 
sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c42f4f979d13..e03cba5a9d70 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1219,6 +1219,7 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); F2FS_SBI_GENERAL_RW_ATTR(carve_out); F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); +F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1422,6 +1423,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reserved_pin_section), ATTR_LIST(allocate_section_hint), ATTR_LIST(allocate_section_policy), + ATTR_LIST(max_lock_elapsed_time), NULL, }; ATTRIBUTE_GROUPS(f2fs); From 66e9e0d55d117a7de2c00a9a06fb943ead56e1c2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:18 +0800 Subject: [PATCH 14/61] f2fs: trace elapsed time for cp_rwsem lock Use f2fs_{down,up}_read_trace for cp_rwsem to trace lock elapsed time. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 28 ++++++++++++++++ fs/f2fs/compress.c | 12 ++++--- fs/f2fs/data.c | 47 ++++++++++++++++----------- fs/f2fs/f2fs.h | 33 +++---------------- fs/f2fs/file.c | 65 ++++++++++++++++++++++--------------- fs/f2fs/gc.c | 5 +-- fs/f2fs/inline.c | 10 +++--- fs/f2fs/inode.c | 10 +++--- fs/f2fs/namei.c | 65 +++++++++++++++++++++---------------- fs/f2fs/segment.c | 10 +++--- fs/f2fs/super.c | 7 ++-- fs/f2fs/xattr.c | 5 +-- include/trace/events/f2fs.h | 2 +- 13 files changed, 173 insertions(+), 126 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 61bcf227d8ca..dfd54cba1b35 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -127,6 +127,34 @@ void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) trace_lock_elapsed_time_end(sem, lc, true); } +void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc) +{ + f2fs_down_read_trace(&sbi->cp_rwsem, lc); +} + +int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc) +{ + if (time_to_inject(sbi, FAULT_LOCK_OP)) + return 0; + + return f2fs_down_read_trylock_trace(&sbi->cp_rwsem, lc); +} + +void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc) +{ + f2fs_up_read_trace(&sbi->cp_rwsem, lc); +} + +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) +{ + f2fs_down_write(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) +{ + f2fs_up_write(&sbi->cp_rwsem); +} + #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) static struct kmem_cache *ino_entry_slab; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 7b68bf22989d..3155d30b2448 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1290,6 +1290,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, struct dnode_of_data dn; struct node_info ni; struct compress_io_ctx *cic; + struct f2fs_lock_context lc; pgoff_t 
start_idx = start_idx_of_cluster(cc); unsigned int last_index = cc->cluster_size - 1; loff_t psize; @@ -1309,7 +1310,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, * the below discard race condition. */ f2fs_down_read(&sbi->node_write); - } else if (!f2fs_trylock_op(sbi)) { + } else if (!f2fs_trylock_op(sbi, &lc)) { goto out_free; } @@ -1435,7 +1436,7 @@ unlock_continue: if (quota_inode) f2fs_up_read(&sbi->node_write); else - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); spin_lock(&fi->i_size_lock); if (fi->last_disk_size < psize) @@ -1464,7 +1465,7 @@ out_unlock_op: if (quota_inode) f2fs_up_read(&sbi->node_write); else - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); out_free: for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); @@ -1511,6 +1512,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, { struct address_space *mapping = cc->inode->i_mapping; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct f2fs_lock_context lc; int submitted, compr_blocks, i; int ret = 0; @@ -1529,7 +1531,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, /* overwrite compressed cluster w/ normal cluster */ if (compr_blocks > 0) - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); for (i = 0; i < cc->cluster_size; i++) { struct folio *folio; @@ -1585,7 +1587,7 @@ continue_unlock: out: if (compr_blocks > 0) - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_balance_fs(sbi, true); return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 471e52c6c1e0..73fcafbc8191 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1466,34 +1466,39 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return 0; } -static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) +static void f2fs_map_lock(struct f2fs_sb_info *sbi, + struct f2fs_lock_context *lc, + int flag) { f2fs_down_read(&sbi->cp_enable_rwsem); if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_down_read(&sbi->node_change); else - 
f2fs_lock_op(sbi); + f2fs_lock_op(sbi, lc); } -static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) +static void f2fs_map_unlock(struct f2fs_sb_info *sbi, + struct f2fs_lock_context *lc, + int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_up_read(&sbi->node_change); else - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, lc); f2fs_up_read(&sbi->cp_enable_rwsem); } int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_lock_context lc; int err = 0; - f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, &dn->data_blkaddr)) err = f2fs_reserve_block(dn, index); - f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_unlock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); return err; } @@ -1584,6 +1589,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; int mode = map->m_may_create ? 
ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; @@ -1622,7 +1628,7 @@ next_dnode: if (map->m_may_create) { if (f2fs_lfs_mode(sbi)) f2fs_balance_fs(sbi, true); - f2fs_map_lock(sbi, flag); + f2fs_map_lock(sbi, &lc, flag); } /* When reading holes, we need its node page */ @@ -1788,7 +1794,7 @@ skip: f2fs_put_dnode(&dn); if (map->m_may_create) { - f2fs_map_unlock(sbi, flag); + f2fs_map_unlock(sbi, &lc, flag); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -1835,7 +1841,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { - f2fs_map_unlock(sbi, flag); + f2fs_map_unlock(sbi, &lc, flag); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -2865,6 +2871,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct inode *inode = folio->mapping->host; struct dnode_of_data dn; struct node_info ni; + struct f2fs_lock_context lc; bool ipu_force = false; bool atomic_commit; int err = 0; @@ -2890,7 +2897,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) } /* Deadlock due to between page->lock and f2fs_lock_op */ - if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi, &lc)) return -EAGAIN; err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE); @@ -2931,7 +2938,7 @@ got_it: folio_start_writeback(folio); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) - f2fs_unlock_op(fio->sbi); + f2fs_unlock_op(fio->sbi, &lc); err = f2fs_inplace_write_data(fio); if (err) { if (fscrypt_inode_uses_fs_layer_crypto(inode)) @@ -2945,7 +2952,7 @@ got_it: } if (fio->need_lock == LOCK_RETRY) { - if (!f2fs_trylock_op(fio->sbi)) { + if (!f2fs_trylock_op(fio->sbi, &lc)) { err = -EAGAIN; goto out_writepage; } @@ -2977,7 +2984,7 @@ out_writepage: f2fs_put_dnode(&dn); out: if (fio->need_lock == LOCK_REQ) - f2fs_unlock_op(fio->sbi); + f2fs_unlock_op(fio->sbi, &lc); return err; } @@ -3570,6 +3577,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, 
struct inode *inode = folio->mapping->host; pgoff_t index = folio->index; struct dnode_of_data dn; + struct f2fs_lock_context lc; struct folio *ifolio; bool locked = false; int flag = F2FS_GET_BLOCK_PRE_AIO; @@ -3586,10 +3594,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode)) { if (pos + len > MAX_INLINE_DATA(inode)) flag = F2FS_GET_BLOCK_DEFAULT; - f2fs_map_lock(sbi, flag); + f2fs_map_lock(sbi, &lc, flag); locked = true; } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_map_lock(sbi, flag); + f2fs_map_lock(sbi, &lc, flag); locked = true; } @@ -3633,7 +3641,7 @@ restart: if (!err && dn.data_blkaddr != NULL_ADDR) goto out; f2fs_put_dnode(&dn); - f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; @@ -3647,7 +3655,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_map_unlock(sbi, flag); + f2fs_map_unlock(sbi, &lc, flag); return err; } @@ -3683,10 +3691,11 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; + struct f2fs_lock_context lc; struct folio *ifolio; int err = 0; - f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); if (IS_ERR(ifolio)) { @@ -3704,7 +3713,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, f2fs_put_dnode(&dn); unlock_out: - f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_unlock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7f2675ed7f86..58244bb87fef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -175,6 +175,7 @@ enum device_allocation_policy { enum f2fs_lock_name { LOCK_NAME_NONE, + LOCK_NAME_CP_RWSEM, }; /* @@ -2417,33 +2418,6 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, return 
(cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } -static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) -{ - f2fs_down_read(&sbi->cp_rwsem); -} - -static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) -{ - if (time_to_inject(sbi, FAULT_LOCK_OP)) - return 0; - return f2fs_down_read_trylock(&sbi->cp_rwsem); -} - -static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) -{ - f2fs_up_read(&sbi->cp_rwsem); -} - -static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) -{ - f2fs_down_write(&sbi->cp_rwsem); -} - -static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) -{ - f2fs_up_write(&sbi->cp_rwsem); -} - static inline int __get_cp_reason(struct f2fs_sb_info *sbi) { int reason = CP_SYNC; @@ -3770,7 +3744,7 @@ void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_remove_donate_inode(struct inode *inode); void f2fs_evict_inode(struct inode *inode); -void f2fs_handle_failed_inode(struct inode *inode); +void f2fs_handle_failed_inode(struct inode *inode, struct f2fs_lock_context *lc); /* * namei.c @@ -4037,6 +4011,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) /* * checkpoint.c */ +void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); +int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); +void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e75e61ac50d7..1cdbbc2e1005 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -774,6 +774,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; + struct f2fs_lock_context lc; pgoff_t free_from; int count = 0, err = 0; struct folio *ifolio; @@ -792,7 +793,7 @@ int 
f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto free_partial; if (lock) - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); if (IS_ERR(ifolio)) { @@ -843,7 +844,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); free_partial: /* lastly zero out the first data page */ if (!err) @@ -1118,11 +1119,13 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { - f2fs_lock_op(sbi); + struct f2fs_lock_context lc; + + f2fs_lock_op(sbi, &lc); err = dquot_transfer(idmap, inode, attr); if (err) { set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } /* @@ -1132,7 +1135,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); } if (attr->ia_valid & ATTR_SIZE) { @@ -1216,15 +1219,16 @@ static int fill_zero(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct folio *folio; + struct f2fs_lock_context lc; if (!len) return 0; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); folio = f2fs_get_new_data_folio(inode, NULL, index, false); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -1307,6 +1311,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (pg_start < pg_end) { loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; f2fs_balance_fs(sbi, true); @@ -1318,9 +1323,9 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache_range(inode, blk_start, blk_end - 1); - f2fs_lock_op(sbi); + 
f2fs_lock_op(sbi, &lc); ret = f2fs_truncate_hole(inode, pg_start, pg_end); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1552,6 +1557,7 @@ roll_back: static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; pgoff_t nrpages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); pgoff_t start = offset >> PAGE_SHIFT; pgoff_t end = (offset + len) >> PAGE_SHIFT; @@ -1565,11 +1571,11 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_zero_post_eof_page(inode, offset + len, false); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); f2fs_drop_extent_tree(inode); truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1717,6 +1723,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; + struct f2fs_lock_context lc; unsigned int end_offset; pgoff_t end; @@ -1727,12 +1734,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, (loff_t)index << PAGE_SHIFT, ((loff_t)pg_end << PAGE_SHIFT) - 1); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; @@ -1744,7 +1751,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(mapping); 
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1827,17 +1834,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { + struct f2fs_lock_context lc; + nr = idx - pg_start; if (nr > delta) nr = delta; idx -= nr; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); f2fs_drop_extent_tree(inode); ret = __exchange_data_block(inode, inode, idx, idx + delta, nr, false); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); } filemap_invalidate_unlock(mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -3093,6 +3102,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, struct inode *src = file_inode(file_in); struct inode *dst = file_inode(file_out); struct f2fs_sb_info *sbi = F2FS_I_SB(src); + struct f2fs_lock_context lc; size_t olen = len, dst_max_i_size = 0; size_t dst_osize; int ret; @@ -3188,7 +3198,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out_src; } - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); ret = __exchange_data_block(src, dst, F2FS_BYTES_TO_BLK(pos_in), F2FS_BYTES_TO_BLK(pos_out), F2FS_BYTES_TO_BLK(len), false); @@ -3199,7 +3209,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, else if (dst_osize != dst->i_size) f2fs_i_size_write(dst, dst_osize); } - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (src != dst) f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); @@ -3367,6 +3377,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; + struct f2fs_lock_context lc; kprojid_t kprojid; int err; @@ -3397,7 +3408,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) return err; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_transfer_project_quota(inode, kprojid); if (err) goto out_unlock; @@ -3406,7 +3417,7 @@ static 
int f2fs_ioc_setproject(struct inode *inode, __u32 projid) inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } #else @@ -3839,6 +3850,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; pgoff_t page_idx = 0, last_idx; unsigned int released_blocks = 0; int ret; @@ -3893,12 +3905,12 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) struct dnode_of_data dn; pgoff_t end_offset, count; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); if (ret) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret == -ENOENT) { page_idx = f2fs_get_next_page_offset(&dn, page_idx); @@ -3916,7 +3928,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret < 0) break; @@ -4069,14 +4081,15 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) while (page_idx < last_idx) { struct dnode_of_data dn; + struct f2fs_lock_context lc; pgoff_t end_offset, count; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); if (ret) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret == -ENOENT) { page_idx = f2fs_get_next_page_offset(&dn, page_idx); @@ -4094,7 +4107,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret < 0) break; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ba66d8bc9b5f..8999829a9559 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ 
-2263,6 +2263,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); __u64 old_block_count, shrunk_blocks; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + struct f2fs_lock_context lc; unsigned int secs; int err = 0; __u32 rem; @@ -2312,7 +2313,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) } /* stop CP to protect MAIN_SEC in free_segment_range */ - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2327,7 +2328,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) err = free_segment_range(sbi, secs, true); out_unlock: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_up_write(&sbi->gc_lock); out_drop_write: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e5c6a08b7e4f..0a1052d5ee62 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -218,6 +218,7 @@ int f2fs_convert_inline_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; + struct f2fs_lock_context lc; struct folio *ifolio, *folio; int err = 0; @@ -235,7 +236,7 @@ int f2fs_convert_inline_inode(struct inode *inode) if (IS_ERR(folio)) return PTR_ERR(folio); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); if (IS_ERR(ifolio)) { @@ -250,7 +251,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_put_dnode(&dn); out: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_folio_put(folio, true); @@ -597,13 +598,14 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct folio *ifolio; struct f2fs_filename fname; + struct f2fs_lock_context lc; void *inline_dentry = NULL; int err = 0; if (!f2fs_has_inline_dentry(dir)) return 0; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_setup_filename(dir, &dentry->d_name, 0, &fname); if (err) @@ -628,7 
+630,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) out_fname: f2fs_free_filename(&fname); out: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d1270b25ad7d..b8cf1fab6391 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -910,9 +910,11 @@ retry: err = -EIO; if (!err) { - f2fs_lock_op(sbi); + struct f2fs_lock_context lc; + + f2fs_lock_op(sbi, &lc); err = f2fs_remove_inode_page(inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (err == -ENOENT) { err = 0; @@ -1009,7 +1011,7 @@ out_clear: } /* caller should call f2fs_lock_op() */ -void f2fs_handle_failed_inode(struct inode *inode) +void f2fs_handle_failed_inode(struct inode *inode, struct f2fs_lock_context *lc) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; @@ -1058,7 +1060,7 @@ void f2fs_handle_failed_inode(struct inode *inode) } out: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, lc); /* iput will drop the inode object */ iput(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 043d20516a21..e360f08a9586 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -354,6 +354,7 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; nid_t ino = 0; int err; @@ -376,11 +377,11 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, ino); @@ -392,7 +393,7 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, f2fs_balance_fs(sbi, true); return 0; out: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return err; } @@ -401,6 +402,7 @@ 
static int f2fs_link(struct dentry *old_dentry, struct inode *dir, { struct inode *inode = d_inode(old_dentry); struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; int err; if (unlikely(f2fs_cp_error(sbi))) @@ -427,11 +429,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_inode_flag(inode, FI_INC_LINK); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); d_instantiate(dentry, inode); @@ -441,7 +443,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, out: clear_inode_flag(inode, FI_INC_LINK); iput(inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } @@ -545,6 +547,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; + struct f2fs_lock_context lc; struct folio *folio; int err; @@ -581,15 +584,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_acquire_orphan_inode(sbi); if (err) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_folio_put(folio, false); goto out; } f2fs_delete_entry(de, folio, dir, inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
Eventually we'll want avoid @@ -632,6 +635,7 @@ static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; size_t len = strlen(symname); struct fscrypt_str disk_link; @@ -662,11 +666,11 @@ static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out_f2fs_handle_failed_inode; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, inode->i_ino); err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); @@ -701,7 +705,7 @@ err_out: goto out_free_encrypted_link; out_f2fs_handle_failed_inode: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); @@ -712,6 +716,7 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; int err; @@ -732,11 +737,11 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); set_inode_flag(inode, FI_INC_LINK); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out_fail; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, inode->i_ino); @@ -750,7 +755,7 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, out_fail: clear_inode_flag(inode, FI_INC_LINK); - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return ERR_PTR(err); } @@ -767,6 +772,7 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, 
umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; int err = 0; @@ -786,11 +792,11 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, inode->i_ino); @@ -802,7 +808,7 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, f2fs_balance_fs(sbi, true); return 0; out: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return err; } @@ -811,6 +817,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode, struct f2fs_filename *fname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; int err; @@ -831,7 +838,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_acquire_orphan_inode(sbi); if (err) goto out; @@ -860,7 +867,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. 
*/ - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); unlock_new_inode(inode); if (new_inode) @@ -872,7 +879,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, release_out: f2fs_release_orphan_inode(sbi); out: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return err; } @@ -920,6 +927,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + struct f2fs_lock_context lc; bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; @@ -1008,7 +1016,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_acquire_orphan_inode(sbi); if (err) @@ -1031,11 +1039,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } else { f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(new_dentry, old_inode); if (err) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); goto out_dir; } @@ -1084,7 +1092,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, TRANS_DIR_INO); } - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1093,7 +1101,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, return 0; put_out_dir: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_folio_put(new_folio, false); out_dir: if (old_dir_entry) @@ -1115,6 +1123,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct folio *old_folio, *new_folio; struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; struct f2fs_dir_entry *old_entry, *new_entry; + struct f2fs_lock_context lc; int old_nlink = 0, new_nlink = 0; int err; @@ -1194,7 +1203,7 @@ static int f2fs_cross_rename(struct inode *old_dir, 
struct dentry *old_dentry, f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); /* update ".." directory entry info of old dentry */ if (old_dir_entry) @@ -1247,7 +1256,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); } - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c0c5b7075b04..e4a8daf433a8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -400,6 +400,7 @@ int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_lock_context lc; int err; err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -407,11 +408,11 @@ int f2fs_commit_atomic_write(struct inode *inode) return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = __f2fs_commit_atomic_write(inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; @@ -3362,13 +3363,14 @@ int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) { + struct f2fs_lock_context lc; int err; bool gc_required = true; retry: - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8fe1ac8a609c..d70567b48d12 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3325,6 +3325,7 @@ int f2fs_do_quota_sync(struct super_block *sb, int type) * that userspace sees the changes. 
*/ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + struct f2fs_lock_context lc; if (type != -1 && cnt != type) continue; @@ -3344,13 +3345,13 @@ int f2fs_do_quota_sync(struct super_block *sb, int type) * block_operation * f2fs_down_read(quota_sem) */ - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); f2fs_down_read(&sbi->quota_sem); ret = f2fs_quota_sync_file(sbi, cnt); f2fs_up_read(&sbi->quota_sem); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (!f2fs_sb_has_quota_ino(sbi)) inode_unlock(dqopt->files[cnt]); @@ -4898,7 +4899,7 @@ try_onemore: init_f2fs_rwsem(&sbi->node_write); init_f2fs_rwsem(&sbi->node_change); spin_lock_init(&sbi->stat_lock); - init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem_trace(&sbi->cp_rwsem, sbi, LOCK_NAME_CP_RWSEM); init_f2fs_rwsem(&sbi->cp_enable_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index b4e5c406632f..941dc62a6d6f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -804,6 +804,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct folio *ifolio, int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; int err; if (unlikely(f2fs_cp_error(sbi))) @@ -821,11 +822,11 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, size, ifolio, flags); f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, NULL, flags); f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_update_time(sbi, REQ_TIME); return err; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 9a852a16df9c..f4f13ddbe104 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -186,7 +186,7 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); #define show_lock_name(lock) \ __print_symbolic(lock, \ - { LOCK_NAME_NONE, 
"none" }) + { LOCK_NAME_CP_RWSEM, "cp_rwsem" }) struct f2fs_sb_info; struct f2fs_io_info; From f9f93602512bceb28865942abfab54021e3a3d86 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:19 +0800 Subject: [PATCH 15/61] f2fs: trace elapsed time for node_change lock Use f2fs_{down,up}_read_trace for node_change to trace lock elapsed time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 2 +- include/trace/events/f2fs.h | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 73fcafbc8191..5469547142e7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1472,7 +1472,7 @@ static void f2fs_map_lock(struct f2fs_sb_info *sbi, { f2fs_down_read(&sbi->cp_enable_rwsem); if (flag == F2FS_GET_BLOCK_PRE_AIO) - f2fs_down_read(&sbi->node_change); + f2fs_down_read_trace(&sbi->node_change, lc); else f2fs_lock_op(sbi, lc); } @@ -1482,7 +1482,7 @@ static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) - f2fs_up_read(&sbi->node_change); + f2fs_up_read_trace(&sbi->node_change, lc); else f2fs_unlock_op(sbi, lc); f2fs_up_read(&sbi->cp_enable_rwsem); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 58244bb87fef..8f6a255f9e57 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -176,6 +176,7 @@ enum device_allocation_policy { enum f2fs_lock_name { LOCK_NAME_NONE, LOCK_NAME_CP_RWSEM, + LOCK_NAME_NODE_CHANGE, }; /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d70567b48d12..247638b98cfb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4897,7 +4897,7 @@ try_onemore: mutex_init(&sbi->writepages); init_f2fs_rwsem(&sbi->cp_global_sem); init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); + init_f2fs_rwsem_trace(&sbi->node_change, sbi, LOCK_NAME_NODE_CHANGE); spin_lock_init(&sbi->stat_lock); init_f2fs_rwsem_trace(&sbi->cp_rwsem, sbi, LOCK_NAME_CP_RWSEM); 
init_f2fs_rwsem(&sbi->cp_enable_rwsem); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index f4f13ddbe104..d472f47eedec 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -186,7 +186,8 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); #define show_lock_name(lock) \ __print_symbolic(lock, \ - { LOCK_NAME_CP_RWSEM, "cp_rwsem" }) + { LOCK_NAME_CP_RWSEM, "cp_rwsem" }, \ + { LOCK_NAME_NODE_CHANGE, "node_change" }) struct f2fs_sb_info; struct f2fs_io_info; From bb28b66875cca72fcb62ee572fb32e0d4267a5f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:20 +0800 Subject: [PATCH 16/61] f2fs: trace elapsed time for node_write lock Use f2fs_{down,up}_read_trace for node_write to trace lock elapsed time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 +++--- fs/f2fs/data.c | 6 ++++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 9 +++++---- fs/f2fs/super.c | 2 +- include/trace/events/f2fs.h | 3 ++- 6 files changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 3155d30b2448..316bc3e6d2d4 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1309,7 +1309,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. 
*/ - f2fs_down_read(&sbi->node_write); + f2fs_down_read_trace(&sbi->node_write, &lc); } else if (!f2fs_trylock_op(sbi, &lc)) { goto out_free; } @@ -1434,7 +1434,7 @@ unlock_continue: f2fs_put_dnode(&dn); if (quota_inode) - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); else f2fs_unlock_op(sbi, &lc); @@ -1463,7 +1463,7 @@ out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: if (quota_inode) - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); else f2fs_unlock_op(sbi, &lc); out_free: diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5469547142e7..79455d7acba5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3064,19 +3064,21 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, write: /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || quota_inode) { + struct f2fs_lock_context lc; + /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. 
*/ if (quota_inode) - f2fs_down_read(&sbi->node_write); + f2fs_down_read_trace(&sbi->node_write, &lc); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (quota_inode) - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); goto done; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8f6a255f9e57..a31394f1b493 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -177,6 +177,7 @@ enum f2fs_lock_name { LOCK_NAME_NONE, LOCK_NAME_CP_RWSEM, LOCK_NAME_NODE_CHANGE, + LOCK_NAME_NODE_WRITE, }; /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3a80da524739..d378549010e6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1738,6 +1738,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted .io_type = io_type, .io_wbc = wbc, }; + struct f2fs_lock_context lc; unsigned int seq; trace_f2fs_writepage(folio, NODE); @@ -1767,13 +1768,13 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; - f2fs_down_read(&sbi->node_write); + f2fs_down_read_trace(&sbi->node_write, &lc); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { folio_clear_uptodate(folio); dec_page_count(sbi, F2FS_DIRTY_NODES); - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); folio_unlock(folio); return true; } @@ -1781,7 +1782,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted if (__is_valid_data_blkaddr(ni.blk_addr) && !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); goto redirty_out; } @@ -1801,7 +1802,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio)); dec_page_count(sbi, F2FS_DIRTY_NODES); - f2fs_up_read(&sbi->node_write); + 
f2fs_up_read_trace(&sbi->node_write, &lc); folio_unlock(folio); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 247638b98cfb..8cd519bb3c97 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4896,7 +4896,7 @@ try_onemore: init_f2fs_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem_trace(&sbi->node_write, sbi, LOCK_NAME_NODE_WRITE); init_f2fs_rwsem_trace(&sbi->node_change, sbi, LOCK_NAME_NODE_CHANGE); spin_lock_init(&sbi->stat_lock); init_f2fs_rwsem_trace(&sbi->cp_rwsem, sbi, LOCK_NAME_CP_RWSEM); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index d472f47eedec..e5cfb8ad0d5e 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -187,7 +187,8 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); #define show_lock_name(lock) \ __print_symbolic(lock, \ { LOCK_NAME_CP_RWSEM, "cp_rwsem" }, \ - { LOCK_NAME_NODE_CHANGE, "node_change" }) + { LOCK_NAME_NODE_CHANGE, "node_change" }, \ + { LOCK_NAME_NODE_WRITE, "node_write" }) struct f2fs_sb_info; struct f2fs_io_info; From e605302c14ffda051dc7fbc5f27e1fecc9f681e3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 18:34:12 +0800 Subject: [PATCH 17/61] f2fs: trace elapsed time for gc_lock lock Use f2fs_{down,up}_write_trace for gc_lock to trace lock elapsed time. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++++---- fs/f2fs/f2fs.h | 22 ++++++++++++---------- fs/f2fs/file.c | 13 +++++++------ fs/f2fs/gc.c | 23 +++++++++++++---------- fs/f2fs/segment.c | 11 ++++++----- fs/f2fs/super.c | 14 ++++++++------ include/trace/events/f2fs.h | 3 ++- 7 files changed, 54 insertions(+), 42 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index dfd54cba1b35..da7bcfa2a178 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1930,11 +1930,12 @@ void f2fs_destroy_checkpoint_caches(void) static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) { struct cp_control cpc = { .reason = CP_SYNC, }; + struct f2fs_lock_context lc; int err; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); err = f2fs_write_checkpoint(sbi, &cpc); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); return err; } @@ -2022,11 +2023,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) cpc.reason = __get_cp_reason(sbi); if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC || sbi->umount_lock_holder == current) { + struct f2fs_lock_context lc; int ret; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); ret = f2fs_write_checkpoint(sbi, &cpc); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a31394f1b493..3f3faa5e8ff1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -178,6 +178,7 @@ enum f2fs_lock_name { LOCK_NAME_CP_RWSEM, LOCK_NAME_NODE_CHANGE, LOCK_NAME_NODE_WRITE, + LOCK_NAME_GC_LOCK, }; /* @@ -1408,16 +1409,6 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; -struct f2fs_gc_control { - unsigned int victim_segno; /* target victim segment number */ - int init_gc_type; /* FG_GC or BG_GC */ - bool no_bg_gc; /* check the space and stop bg_gc */ - bool should_migrate_blocks; /* should 
migrate blocks */ - bool err_gc_skipped; /* return EAGAIN if GC skipped */ - bool one_time; /* require one time GC in one migration unit */ - unsigned int nr_free_secs; /* # of free sections to do GC */ -}; - struct f2fs_time_stat { unsigned long long total_time; /* total wall clock time */ #ifdef CONFIG_64BIT @@ -1436,6 +1427,17 @@ struct f2fs_lock_context { bool lock_trace; }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + bool one_time; /* require one time GC in one migration unit */ + unsigned int nr_free_secs; /* # of free sections to do GC */ + struct f2fs_lock_context lc; /* lock context for gc_lock */ +}; + /* * For s_flag in struct f2fs_sb_info * Modification on enum should be synchronized with s_flag array diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1cdbbc2e1005..ce291f152bc3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1928,7 +1928,7 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, sbi->reserved_pin_section)) { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err && err != -ENODATA) { @@ -2779,12 +2779,13 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, + &gc_control.lc)) { ret = -EBUSY; goto out; } } else { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); } gc_control.init_gc_type = sync ? 
FG_GC : BG_GC; @@ -2824,12 +2825,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) do_more: if (!range->sync) { - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &gc_control.lc)) { ret = -EBUSY; goto out; } } else { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); } gc_control.victim_segno = GET_SEGNO(sbi, range->start); @@ -3320,7 +3321,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &gc_control.lc)) { ret = -EBUSY; goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8999829a9559..391e66064c7e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -102,21 +102,22 @@ static int gc_thread_func(void *data) if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); goto do_gc; } if (foreground) { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); goto do_gc; - } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + } else if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, + &gc_control.lc)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &gc_control.lc); stat_io_skip_bggc_count(sbi); goto next; } @@ -125,7 +126,8 @@ static int gc_thread_func(void *data) if (has_enough_free_blocks(sbi, gc_th->no_zoned_gc_percent)) { wait_ms = gc_th->no_gc_sleep_time; - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, + &gc_control.lc); goto next; } if (wait_ms == gc_th->no_gc_sleep_time) @@ -2046,7 +2048,7 @@ stop: 
reserved_segments(sbi), prefree_segments(sbi)); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &gc_control->lc); put_gc_inode(&gc_list); @@ -2264,6 +2266,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) __u64 old_block_count, shrunk_blocks; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; struct f2fs_lock_context lc; + struct f2fs_lock_context glc; unsigned int secs; int err = 0; __u32 rem; @@ -2307,7 +2310,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &glc)) { err = -EAGAIN; goto out_drop_write; } @@ -2329,7 +2332,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) out_unlock: f2fs_unlock_op(sbi, &lc); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &glc); out_drop_write: mnt_drop_write_file(filp); if (err) @@ -2346,7 +2349,7 @@ out_drop_write: return -EROFS; } - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &glc); f2fs_down_write(&sbi->cp_global_sem); spin_lock(&sbi->stat_lock); @@ -2396,7 +2399,7 @@ recover_out: } out_err: f2fs_up_write(&sbi->cp_global_sem); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &glc); thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e4a8daf433a8..776b0df828ed 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -462,7 +462,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) .should_migrate_blocks = false, .err_gc_skipped = false, .nr_free_secs = 1 }; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } @@ -3373,10 +3373,10 @@ retry: f2fs_unlock_op(sbi, &lc); if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { - 
f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1, true, ZONED_PIN_SEC_REQUIRED_COUNT); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); gc_required = false; if (!err) @@ -3496,6 +3496,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; + struct f2fs_lock_context lc; unsigned long long trimmed = 0; int err = 0; bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); @@ -3528,10 +3529,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); if (err) goto out; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8cd519bb3c97..da3316528d1b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2559,6 +2559,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; + struct f2fs_lock_context lc; unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; @@ -2588,7 +2589,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .no_bg_gc = true, .nr_free_secs = 1 }; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { @@ -2612,7 +2613,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } skip_gc: - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); stat_inc_cp_call_count(sbi, TOTAL_CALL); @@ -2625,7 +2626,7 @@ skip_gc: spin_unlock(&sbi->stat_lock); 
out_unlock: - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); restore_flag: sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ @@ -2638,6 +2639,7 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; long long start, writeback, lock, sync_inode, end; int ret; + struct f2fs_lock_context lc; f2fs_info(sbi, "%s start, meta: %lld, node: %lld, data: %lld", __func__, @@ -2672,12 +2674,12 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) sync_inode = ktime_get(); - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); f2fs_info(sbi, "%s sync_fs, meta: %lld, imeta: %lld, node: %lld, dents: %lld, qdata: %lld", __func__, @@ -4893,7 +4895,7 @@ try_onemore: sbi->sb = sb; /* initialize locks within allocated memory */ - init_f2fs_rwsem(&sbi->gc_lock); + init_f2fs_rwsem_trace(&sbi->gc_lock, sbi, LOCK_NAME_GC_LOCK); mutex_init(&sbi->writepages); init_f2fs_rwsem(&sbi->cp_global_sem); init_f2fs_rwsem_trace(&sbi->node_write, sbi, LOCK_NAME_NODE_WRITE); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index e5cfb8ad0d5e..bf353e7e024d 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -188,7 +188,8 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); __print_symbolic(lock, \ { LOCK_NAME_CP_RWSEM, "cp_rwsem" }, \ { LOCK_NAME_NODE_CHANGE, "node_change" }, \ - { LOCK_NAME_NODE_WRITE, "node_write" }) + { LOCK_NAME_NODE_WRITE, "node_write" }, \ + { LOCK_NAME_GC_LOCK, "gc_lock" }) struct f2fs_sb_info; struct f2fs_io_info; From ce9fe67c9cdb21a0321f8ea37b725b3258d2b3cd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:22 +0800 Subject: [PATCH 18/61] f2fs: trace elapsed time for cp_global_sem lock 
Use f2fs_{down,up}_write_trace for cp_global_sem to trace lock elapsed time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++++---- fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 5 +++-- fs/f2fs/recovery.c | 5 +++-- fs/f2fs/super.c | 2 +- include/trace/events/f2fs.h | 3 ++- 6 files changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index da7bcfa2a178..86656231ce83 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -513,6 +513,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct f2fs_lock_context lc; long diff, written; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -525,13 +526,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) + if (!f2fs_down_write_trylock_trace(&sbi->cp_global_sem, &lc)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write_trace(&sbi->cp_global_sem, &lc); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -1780,6 +1781,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_lock_context lc; unsigned long long ckpt_ver; int err = 0; @@ -1794,7 +1796,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) - f2fs_down_write(&sbi->cp_global_sem); + f2fs_down_write_trace(&sbi->cp_global_sem, &lc); stat_cp_time(cpc, CP_TIME_LOCK); @@ -1884,7 
+1886,7 @@ stop: trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT); out: if (cpc->reason != CP_RESIZE) - f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write_trace(&sbi->cp_global_sem, &lc); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f3faa5e8ff1..dc08f6c06810 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -179,6 +179,7 @@ enum f2fs_lock_name { LOCK_NAME_NODE_CHANGE, LOCK_NAME_NODE_WRITE, LOCK_NAME_GC_LOCK, + LOCK_NAME_CP_GLOBAL, }; /* diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 391e66064c7e..1538f5b0a644 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2267,6 +2267,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; struct f2fs_lock_context lc; struct f2fs_lock_context glc; + struct f2fs_lock_context clc; unsigned int secs; int err = 0; __u32 rem; @@ -2350,7 +2351,7 @@ out_drop_write: } f2fs_down_write_trace(&sbi->gc_lock, &glc); - f2fs_down_write(&sbi->cp_global_sem); + f2fs_down_write_trace(&sbi->cp_global_sem, &clc); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2398,7 +2399,7 @@ recover_out: spin_unlock(&sbi->stat_lock); } out_err: - f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write_trace(&sbi->cp_global_sem, &clc); f2fs_up_write_trace(&sbi->gc_lock, &glc); thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); return err; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index c3415ebb9f50..39f6e9830a9c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -875,6 +875,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) LIST_HEAD(inode_list); LIST_HEAD(tmp_inode_list); LIST_HEAD(dir_list); + struct f2fs_lock_context lc; int err; int ret = 0; unsigned long s_flags = sbi->sb->s_flags; @@ -888,7 +889,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) f2fs_info(sbi, "recover fsync data on readonly fs"); /* prevent checkpoint */ - f2fs_down_write(&sbi->cp_global_sem); 
+ f2fs_down_write_trace(&sbi->cp_global_sem, &lc); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only, &new_inode); @@ -932,7 +933,7 @@ skip: if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); - f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write_trace(&sbi->cp_global_sem, &lc); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index da3316528d1b..be01eacbe5fa 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4897,7 +4897,7 @@ try_onemore: /* initialize locks within allocated memory */ init_f2fs_rwsem_trace(&sbi->gc_lock, sbi, LOCK_NAME_GC_LOCK); mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem_trace(&sbi->cp_global_sem, sbi, LOCK_NAME_CP_GLOBAL); init_f2fs_rwsem_trace(&sbi->node_write, sbi, LOCK_NAME_NODE_WRITE); init_f2fs_rwsem_trace(&sbi->node_change, sbi, LOCK_NAME_NODE_CHANGE); spin_lock_init(&sbi->stat_lock); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index bf353e7e024d..859de7c8d1c7 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -189,7 +189,8 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); { LOCK_NAME_CP_RWSEM, "cp_rwsem" }, \ { LOCK_NAME_NODE_CHANGE, "node_change" }, \ { LOCK_NAME_NODE_WRITE, "node_write" }, \ - { LOCK_NAME_GC_LOCK, "gc_lock" }) + { LOCK_NAME_GC_LOCK, "gc_lock" }, \ + { LOCK_NAME_CP_GLOBAL, "cp_global" }) struct f2fs_sb_info; struct f2fs_io_info; From 67972c2b89749356bc9823bd58f7f14b28e681e4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:23 +0800 Subject: [PATCH 19/61] f2fs: trace elapsed time for io_rwsem lock Use f2fs_{down,up}_{read,write}_trace for io_rwsem to trace lock elapsed time. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 +++++++++++------- fs/f2fs/f2fs.h | 1 + include/trace/events/f2fs.h | 3 ++- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 79455d7acba5..7dee58fbfc0b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -620,7 +620,8 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) for (j = HOT; j < n; j++) { struct f2fs_bio_info *io = &sbi->write_io[i][j]; - init_f2fs_rwsem(&io->io_rwsem); + init_f2fs_rwsem_trace(&io->io_rwsem, sbi, + LOCK_NAME_IO_RWSEM); io->sbi = sbi; io->bio = NULL; io->last_block_in_bio = 0; @@ -644,8 +645,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; + struct f2fs_lock_context lc; - f2fs_down_write(&io->io_rwsem); + f2fs_down_write_trace(&io->io_rwsem, &lc); if (!io->bio) goto unlock_out; @@ -659,7 +661,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, } __submit_merged_bio(io); unlock_out: - f2fs_up_write(&io->io_rwsem); + f2fs_up_write_trace(&io->io_rwsem, &lc); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -674,10 +676,11 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, if (!force) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; + struct f2fs_lock_context lc; - f2fs_down_read(&io->io_rwsem); + f2fs_down_read_trace(&io->io_rwsem, &lc); ret = __has_merged_page(io->bio, inode, folio, ino); - f2fs_up_read(&io->io_rwsem); + f2fs_up_read_trace(&io->io_rwsem, &lc); } if (ret) { __f2fs_submit_merged_write(sbi, type, temp); @@ -987,11 +990,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct folio *bio_folio; + struct f2fs_lock_context lc; enum count_type type; 
f2fs_bug_on(sbi, is_read_io(fio->op)); - f2fs_down_write(&io->io_rwsem); + f2fs_down_write_trace(&io->io_rwsem, &lc); next: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { @@ -1073,7 +1077,7 @@ out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); - f2fs_up_write(&io->io_rwsem); + f2fs_up_write_trace(&io->io_rwsem, &lc); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc08f6c06810..dc9f834bbe0d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -180,6 +180,7 @@ enum f2fs_lock_name { LOCK_NAME_NODE_WRITE, LOCK_NAME_GC_LOCK, LOCK_NAME_CP_GLOBAL, + LOCK_NAME_IO_RWSEM, }; /* diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 859de7c8d1c7..c3b6b509472f 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -190,7 +190,8 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); { LOCK_NAME_NODE_CHANGE, "node_change" }, \ { LOCK_NAME_NODE_WRITE, "node_write" }, \ { LOCK_NAME_GC_LOCK, "gc_lock" }, \ - { LOCK_NAME_CP_GLOBAL, "cp_global" }) + { LOCK_NAME_CP_GLOBAL, "cp_global" }, \ + { LOCK_NAME_IO_RWSEM, "io_rwsem" }) struct f2fs_sb_info; struct f2fs_io_info; From b5da276ae6abe95767c3e1eb72f39e0ef8df7d22 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:24 +0800 Subject: [PATCH 20/61] f2fs: clean up w/ __f2fs_schedule_timeout() No logic changes. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc9f834bbe0d..1d7ad3b860e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4988,8 +4988,7 @@ static inline void f2fs_io_schedule_timeout_killable(long timeout) while (timeout) { if (fatal_signal_pending(current)) return; - set_current_state(TASK_UNINTERRUPTIBLE); - io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT, true); if (timeout <= DEFAULT_SCHEDULE_TIMEOUT) return; timeout -= DEFAULT_SCHEDULE_TIMEOUT; From da90b6715567e900a3c5d112dfaf8f385b343edc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:25 +0800 Subject: [PATCH 21/61] f2fs: fix to use jiffies based precision for DEFAULT_SCHEDULE_TIMEOUT The timeout parameter of {io,}_schedule_timeout() has jiffies precision, so converting the value with msecs_to_jiffies(x) loses precision.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1d7ad3b860e0..a2b5439e7f08 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -681,8 +681,8 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ -/* IO/non-IO congestion wait timeout value, default: 1ms */ -#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(1)) +/* IO/non-IO congestion wait timeout value, default: 1 jiffies */ +#define DEFAULT_SCHEDULE_TIMEOUT 1 /* timeout value injected, default: 1000ms */ #define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) From 6fa116053951d5785ef1a0b060858843e663a31a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:26 +0800 Subject: [PATCH 22/61] f2fs: fix timeout precision of f2fs_io_schedule_timeout_killable() Sometimes, f2fs_io_schedule_timeout_killable(HZ) may delay for about 2 seconds, this is because __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT) may delay for about 2 * DEFAULT_SCHEDULE_TIMEOUT due to its precision, but we only account the delay as DEFAULT_SCHEDULE_TIMEOUT as below, fix it. f2fs_io_schedule_timeout_killable() .. 
timeout -= DEFAULT_SCHEDULE_TIMEOUT; Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a2b5439e7f08..95b7a8e3669c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4985,13 +4985,12 @@ static inline void __f2fs_schedule_timeout(long timeout, bool io) static inline void f2fs_io_schedule_timeout_killable(long timeout) { - while (timeout) { + unsigned long last_time = jiffies + timeout; + + while (jiffies < last_time) { if (fatal_signal_pending(current)) return; __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT, true); - if (timeout <= DEFAULT_SCHEDULE_TIMEOUT) - return; - timeout -= DEFAULT_SCHEDULE_TIMEOUT; } } From 7a127c80b0eec7649b6df14c12e53f859dddbe52 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:27 +0800 Subject: [PATCH 23/61] f2fs: rename FAULT_TIMEOUT to FAULT_ATOMIC_TIMEOUT No logic changes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- Documentation/filesystems/f2fs.rst | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 648ddd0d59f6..de5a80124e04 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -741,7 +741,7 @@ Description: Support configuring fault injection type, should be FAULT_BLKADDR_CONSISTENCE 0x00080000 FAULT_NO_SEGMENT 0x00100000 FAULT_INCONSISTENT_FOOTER 0x00200000 - FAULT_TIMEOUT 0x00400000 (1000ms) + FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 =========================== ========== diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 9b3b835a174e..2d71efa7db7a 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -215,7 
+215,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_BLKADDR_CONSISTENCE 0x00080000 FAULT_NO_SEGMENT 0x00100000 FAULT_INCONSISTENT_FOOTER 0x00200000 - FAULT_TIMEOUT 0x00400000 (1000ms) + FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 =========================== ========== mode=%s Control block allocation mode which supports "adaptive" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 95b7a8e3669c..533440f15e20 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -63,7 +63,7 @@ enum { FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, FAULT_INCONSISTENT_FOOTER, - FAULT_TIMEOUT, + FAULT_ATOMIC_TIMEOUT, FAULT_VMALLOC, FAULT_MAX, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 776b0df828ed..469d37c37398 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -371,7 +371,7 @@ next: } out: - if (time_to_inject(sbi, FAULT_TIMEOUT)) + if (time_to_inject(sbi, FAULT_ATOMIC_TIMEOUT)) f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); if (ret) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index be01eacbe5fa..1b7fc2bd3bb7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -67,7 +67,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", [FAULT_NO_SEGMENT] = "no free segment", [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", - [FAULT_TIMEOUT] = "timeout", + [FAULT_ATOMIC_TIMEOUT] = "atomic timeout", [FAULT_VMALLOC] = "vmalloc", }; From c56254e2e04216839699937a04aac18c585e833e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:28 +0800 Subject: [PATCH 24/61] f2fs: introduce FAULT_LOCK_TIMEOUT This patch introduce a new fault type FAULT_LOCK_TIMEOUT, it can be used to inject timeout into lock duration. 
The timeout type can be set via /sys/fs/f2fs//inject_lock_timeout Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 1 + Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 5 files changed, 7 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index de5a80124e04..4b0bec3c0746 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -743,6 +743,7 @@ Description: Support configuring fault injection type, should be FAULT_INCONSISTENT_FOOTER 0x00200000 FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 + FAULT_LOCK_TIMEOUT 0x01000000 (1000ms) =========================== ========== What: /sys/fs/f2fs//discard_io_aware_gran diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 2d71efa7db7a..33d2166ac6b7 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -217,6 +217,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_INCONSISTENT_FOOTER 0x00200000 FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 + FAULT_LOCK_TIMEOUT 0x01000000 (1000ms) =========================== ========== mode=%s Control block allocation mode which supports "adaptive" and "lfs".
In "lfs" mode, there should be no random diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 86656231ce83..f2ab5ba8fb6a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -63,6 +63,9 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem, if (!lc->lock_trace) return; + if (time_to_inject(sem->sbi, FAULT_LOCK_TIMEOUT)) + f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); + get_lock_elapsed_time(&tts); total_time = div_u64(tts.total_time - lc->ts.total_time, npm); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 533440f15e20..68fa9fd9cd5d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -65,6 +65,7 @@ enum { FAULT_INCONSISTENT_FOOTER, FAULT_ATOMIC_TIMEOUT, FAULT_VMALLOC, + FAULT_LOCK_TIMEOUT, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1b7fc2bd3bb7..30e6cf027ba7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -69,6 +69,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", [FAULT_ATOMIC_TIMEOUT] = "atomic timeout", [FAULT_VMALLOC] = "vmalloc", + [FAULT_LOCK_TIMEOUT] = "lock timeout", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, From d36de29f4bb59b24e57ff22403baae6fc7e89bd8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Jan 2026 10:07:29 +0800 Subject: [PATCH 25/61] f2fs: sysfs: introduce inject_lock_timeout This patch adds a new sysfs node in /sys/fs/f2fs//inject_lock_timeout, it relies on CONFIG_F2FS_FAULT_INJECTION kernel config. It can be used to simulate different type of timeout in lock duration. 
========== =============================== Flag_Value Flag_Description ========== =============================== 0x00000000 No timeout (default) 0x00000001 Simulate running time 0x00000002 Simulate IO type sleep time 0x00000003 Simulate Non-IO type sleep time 0x00000004 Simulate runnable time ========== =============================== Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 14 +++++++ fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 22 +++++++++-- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 49 +++++++++++++++++++++++++ fs/f2fs/sysfs.c | 9 +++++ 6 files changed, 93 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4b0bec3c0746..ca9ed3b44b31 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -948,3 +948,17 @@ Description: This is a threshold, once a thread enters critical region that lock elapsed time exceeds this threshold, f2fs will print tracepoint to dump information of related context. This sysfs entry can be used to control the value of threshold, by default, the value is 500 ms. 
+ +What: /sys/fs/f2fs//inject_lock_timeout +Date: December 2025 +Contact: "Chao Yu" +Description: This sysfs entry can be used to change the type of injected timeout: + ========== =============================== + Flag_Value Flag_Description + ========== =============================== + 0x00000000 No timeout (default) + 0x00000001 Simulate running time + 0x00000002 Simulate IO type sleep time + 0x00000003 Simulate Non-IO type sleep time + 0x00000004 Simulate runnable time + ========== =============================== diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f2ab5ba8fb6a..3dfc83a0813e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -64,7 +64,7 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem, return; if (time_to_inject(sem->sbi, FAULT_LOCK_TIMEOUT)) - f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); + f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true); get_lock_elapsed_time(&tts); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 68fa9fd9cd5d..399514187280 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -73,7 +73,8 @@ enum { enum fault_option { FAULT_RATE = 1, /* only update fault rate */ FAULT_TYPE = 2, /* only update fault type */ - FAULT_ALL = 4, /* reset all fault injection options/stats */ + FAULT_TIMEOUT = 4, /* only update fault timeout type */ + FAULT_ALL = 8, /* reset all fault injection options/stats */ }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -83,6 +84,7 @@ struct f2fs_fault_info { unsigned int inject_type; /* Used to account total count of injection for each type */ unsigned int inject_count[FAULT_MAX]; + unsigned int inject_lock_timeout; /* inject lock timeout */ }; extern const char *f2fs_fault_name[FAULT_MAX]; @@ -184,6 +186,15 @@ enum f2fs_lock_name { LOCK_NAME_IO_RWSEM, }; +enum f2fs_timeout_type { + TIMEOUT_TYPE_NONE, + TIMEOUT_TYPE_RUNNING, + TIMEOUT_TYPE_IO_SLEEP, + TIMEOUT_TYPE_NONIO_SLEEP, + TIMEOUT_TYPE_RUNNABLE, + TIMEOUT_TYPE_MAX, +}; + /* * An implementation of
an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock @@ -4927,6 +4938,7 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) #ifdef CONFIG_F2FS_FAULT_INJECTION extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, enum fault_option fo); +extern void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi); #else static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, @@ -4934,6 +4946,10 @@ static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, { return 0; } +static inline void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi) +{ + return; +} #endif static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) @@ -4984,14 +5000,14 @@ static inline void __f2fs_schedule_timeout(long timeout, bool io) #define f2fs_schedule_timeout(timeout) \ __f2fs_schedule_timeout(timeout, false) -static inline void f2fs_io_schedule_timeout_killable(long timeout) +static inline void f2fs_schedule_timeout_killable(long timeout, bool io) { unsigned long last_time = jiffies + timeout; while (jiffies < last_time) { if (fatal_signal_pending(current)) return; - __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT, true); + __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT, io); } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 469d37c37398..587ae3b4bfd8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -372,7 +372,7 @@ next: out: if (time_to_inject(sbi, FAULT_ATOMIC_TIMEOUT)) - f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); + f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true); if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 30e6cf027ba7..4573bac94793 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -97,8 +97,57 @@ int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned 
long rate, f2fs_info(sbi, "build fault injection type: 0x%lx", type); } + if (fo & FAULT_TIMEOUT) { + if (type >= TIMEOUT_TYPE_MAX) + return -EINVAL; + ffi->inject_lock_timeout = (unsigned int)type; + f2fs_info(sbi, "build fault timeout injection type: 0x%lx", type); + } + return 0; } + +static void inject_timeout(struct f2fs_sb_info *sbi) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + enum f2fs_timeout_type type = ffi->inject_lock_timeout; + unsigned long start_time = jiffies; + unsigned long timeout = HZ; + + switch (type) { + case TIMEOUT_TYPE_RUNNING: + while (!time_after(jiffies, start_time + timeout)) { + if (fatal_signal_pending(current)) + return; + ; + } + break; + case TIMEOUT_TYPE_IO_SLEEP: + f2fs_schedule_timeout_killable(timeout, true); + break; + case TIMEOUT_TYPE_NONIO_SLEEP: + f2fs_schedule_timeout_killable(timeout, false); + break; + case TIMEOUT_TYPE_RUNNABLE: + while (!time_after(jiffies, start_time + timeout)) { + if (fatal_signal_pending(current)) + return; + schedule(); + } + break; + default: + return; + } +} + +void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi) +{ + struct f2fs_lock_context lc; + + f2fs_lock_op(sbi, &lc); + inject_timeout(sbi); + f2fs_unlock_op(sbi, &lc); +} #endif /* f2fs-wide shrinker description */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e03cba5a9d70..c32e4996b335 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -35,6 +35,7 @@ enum { #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ + FAULT_INFO_TIMEOUT, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ CPRC_INFO, /* struct ckpt_req_control */ @@ -529,6 +530,12 @@ out: return -EINVAL; return count; } + if (a->struct_type == FAULT_INFO_TIMEOUT) { + if (f2fs_build_fault_attr(sbi, 0, t, FAULT_TIMEOUT)) + return -EINVAL; + f2fs_simulate_lock_timeout(sbi); + return count; + } #endif if (a->struct_type == RESERVED_BLOCKS) { 
spin_lock(&sbi->stat_lock); @@ -1233,6 +1240,7 @@ STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]); #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate); FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type); +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TIMEOUT, inject_lock_timeout); #endif /* RESERVED_BLOCKS ATTR */ @@ -1362,6 +1370,7 @@ static struct attribute *f2fs_attrs[] = { #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), + ATTR_LIST(inject_lock_timeout), #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), From 00feea1dfcea2cc7c22e58b6325f72637c6ea217 Mon Sep 17 00:00:00 2001 From: Nanzhe Zhao Date: Mon, 5 Jan 2026 23:30:57 +0800 Subject: [PATCH 26/61] f2fs: Zero f2fs_folio_state on allocation f2fs_folio_state is attached to folio->private and is expected to start with read_pages_pending == 0. However, the structure was allocated from ffs_entry_slab without being fully initialized, which can leave read_pages_pending with stale values. Allocate the object with __GFP_ZERO so all fields are reliably zeroed at creation time. 
Signed-off-by: Nanzhe Zhao Reviewed-by: Barry Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7dee58fbfc0b..b5b39a788ee5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2399,7 +2399,8 @@ static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio) if (ffs) return ffs; - ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL); + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, + GFP_NOIO | __GFP_ZERO, true, NULL); spin_lock_init(&ffs->state_lock); folio_attach_private(folio, ffs); From c0c589fa1d17fc13b3be1a4dd2ec62266c2a0659 Mon Sep 17 00:00:00 2001 From: Nanzhe Zhao Date: Mon, 5 Jan 2026 23:30:58 +0800 Subject: [PATCH 27/61] f2fs: Accounting large folio subpages before bio submission In f2fs_read_data_large_folio(), read_pages_pending is incremented only after the subpage has been added to the BIO. With a heavily fragmented file, each new subpage can force submission of the previous BIO. If the BIO completes quickly, f2fs_finish_read_bio() may decrement read_pages_pending to zero and call folio_end_read() while the read loop is still processing other subpages of the same large folio. Fix the ordering by incrementing read_pages_pending before any possible BIO submission for the current subpage, matching the iomap ordering and preventing premature folio_end_read(). 
Signed-off-by: Nanzhe Zhao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b5b39a788ee5..f32eb51ccee4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2497,6 +2497,18 @@ got_it: continue; } + /* We must increment read_pages_pending before possible BIOs submitting + * to prevent from premature folio_end_read() call on folio + */ + if (folio_test_large(folio)) { + ffs = ffs_find_or_alloc(folio); + + /* set the bitmap to wait */ + spin_lock_irq(&ffs->state_lock); + ffs->read_pages_pending++; + spin_unlock_irq(&ffs->state_lock); + } + /* * This page will go to BIO. Do we need to send this * BIO off first? @@ -2524,15 +2536,6 @@ submit_and_realloc: offset << PAGE_SHIFT)) goto submit_and_realloc; - if (folio_test_large(folio)) { - ffs = ffs_find_or_alloc(folio); - - /* set the bitmap to wait */ - spin_lock_irq(&ffs->state_lock); - ffs->read_pages_pending++; - spin_unlock_irq(&ffs->state_lock); - } - inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); From 98ea0039dbfdd00e5cc1b9a8afa40434476c0955 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Wed, 7 Jan 2026 10:33:46 +0800 Subject: [PATCH 28/61] f2fs: fix out-of-bounds access in sysfs attribute read/write Some f2fs sysfs attributes suffer from out-of-bounds memory access and incorrect handling of integer values whose size is not 4 bytes. For example: vm:~# echo 65537 > /sys/fs/f2fs/vde/carve_out vm:~# cat /sys/fs/f2fs/vde/carve_out 65537 vm:~# echo 4294967297 > /sys/fs/f2fs/vde/atgc_age_threshold vm:~# cat /sys/fs/f2fs/vde/atgc_age_threshold 1 carve_out maps to {struct f2fs_sb_info}->carve_out, which is a 8-bit integer. However, the sysfs interface allows setting it to a value larger than 255, resulting in an out-of-range update. 
atgc_age_threshold maps to {struct atgc_management}->age_threshold, which is a 64-bit integer, but its sysfs interface cannot correctly set values larger than UINT_MAX. The root causes are: 1. __sbi_store() treats all default values as unsigned int, which prevents updating integers larger than 4 bytes and causes out-of-bounds writes for integers smaller than 4 bytes. 2. f2fs_sbi_show() also assumes all default values are unsigned int, leading to out-of-bounds reads and incorrect access to integers larger than 4 bytes. This patch introduces {struct f2fs_attr}->size to record the actual size of the integer associated with each sysfs attribute. With this information, sysfs read and write operations can correctly access and update values according to their real data size, avoiding memory corruption and truncation. Fixes: b59d0bae6ca3 ("f2fs: add sysfs support for controlling the gc_thread") Cc: stable@kernel.org Signed-off-by: Jinbao Liu Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 60 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c32e4996b335..91bc0544ba1f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -59,6 +59,7 @@ struct f2fs_attr { const char *buf, size_t len); int struct_type; int offset; + int size; int id; }; @@ -345,11 +346,30 @@ static ssize_t main_blkaddr_show(struct f2fs_attr *a, (unsigned long long)MAIN_BLKADDR(sbi)); } +static ssize_t __sbi_show_value(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf, + unsigned char *value) +{ + switch (a->size) { + case 1: + return sysfs_emit(buf, "%u\n", *(u8 *)value); + case 2: + return sysfs_emit(buf, "%u\n", *(u16 *)value); + case 4: + return sysfs_emit(buf, "%u\n", *(u32 *)value); + case 8: + return sysfs_emit(buf, "%llu\n", *(u64 *)value); + default: + f2fs_bug_on(sbi, 1); + return sysfs_emit(buf, + "show sysfs node value with wrong type\n"); 
+ } +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { unsigned char *ptr = NULL; - unsigned int *ui; ptr = __struct_ptr(sbi, a->struct_type); if (!ptr) @@ -429,9 +449,30 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, atomic_read(&sbi->cp_call_count[BACKGROUND])); #endif - ui = (unsigned int *)(ptr + a->offset); + return __sbi_show_value(a, sbi, buf, ptr + a->offset); +} - return sysfs_emit(buf, "%u\n", *ui); +static void __sbi_store_value(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + unsigned char *ui, unsigned long value) +{ + switch (a->size) { + case 1: + *(u8 *)ui = value; + break; + case 2: + *(u16 *)ui = value; + break; + case 4: + *(u32 *)ui = value; + break; + case 8: + *(u64 *)ui = value; + break; + default: + f2fs_bug_on(sbi, 1); + f2fs_err(sbi, "store sysfs node value with wrong type"); + } } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -913,7 +954,7 @@ out: return count; } - *ui = (unsigned int)t; + __sbi_store_value(a, sbi, ptr + a->offset, t); return count; } @@ -1060,24 +1101,27 @@ static struct f2fs_attr f2fs_attr_sb_##_name = { \ .id = F2FS_FEATURE_##_feat, \ } -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset, _size) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ .struct_type = _struct_type, \ - .offset = _offset \ + .offset = _offset, \ + .size = _size \ } #define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0444, \ f2fs_sbi_show, NULL, \ - offsetof(struct struct_name, elname)) + offsetof(struct struct_name, elname), \ + sizeof_field(struct struct_name, elname)) #define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0644, \ f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) + 
offsetof(struct struct_name, elname), \ + sizeof_field(struct struct_name, elname)) #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) From 071e50d61cf2474bec724c10bb1ae8082ef6c237 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Wed, 7 Jan 2026 10:33:47 +0800 Subject: [PATCH 29/61] f2fs: change seq_file_ra_mul and max_io_bytes to unsigned int {struct file_ra_state}->ra_pages and {struct bio}->bi_iter.bi_size is defined as unsigned int, so values of seq_file_ra_mul and max_io_bytes exceeding UINT_MAX are meaningless. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 399514187280..ded41b416ed7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1808,7 +1808,7 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ bool readdir_ra; /* readahead inode in readdir */ - u64 max_io_bytes; /* max io bytes to merge IOs */ + unsigned int max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -1956,7 +1956,7 @@ struct f2fs_sb_info { unsigned int gc_segment_mode; /* GC state for reclaimed segments */ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ - unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + unsigned int seq_file_ra_mul; /* multiplier for ra_pages of seq. 
files in fadvise */ int max_fragment_chunk; /* max chunk size for block fragmentation mode */ int max_fragment_hole; /* max hole size for block fragmentation mode */ From 7633a7387eb4d0259d6bea945e1d3469cd135bbc Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 6 Jan 2026 20:12:11 +0800 Subject: [PATCH 30/61] f2fs: fix IS_CHECKPOINTED flag inconsistency issue caused by concurrent atomic commit and checkpoint writes During SPO tests, when mounting F2FS, an -EINVAL error was returned from f2fs_recover_inode_page. The issue occurred under the following scenario Thread A Thread B f2fs_ioc_commit_atomic_write - f2fs_do_sync_file // atomic = true - f2fs_fsync_node_pages : last_folio = inode folio : schedule before folio_lock(last_folio) f2fs_write_checkpoint - block_operations// writeback last_folio - schedule before f2fs_flush_nat_entries : set_fsync_mark(last_folio, 1) : set_dentry_mark(last_folio, 1) : folio_mark_dirty(last_folio) - __write_node_folio(last_folio) : f2fs_down_read(&sbi->node_write)//block - f2fs_flush_nat_entries : {struct nat_entry}->flag |= BIT(IS_CHECKPOINTED) - unblock_operations : f2fs_up_write(&sbi->node_write) f2fs_write_checkpoint//return : f2fs_do_write_node_page() f2fs_ioc_commit_atomic_write//return SPO Thread A calls f2fs_need_dentry_mark(sbi, ino), and the last_folio has already been written once. However, the {struct nat_entry}->flag did not have the IS_CHECKPOINTED set, causing set_dentry_mark(last_folio, 1) and write last_folio again after Thread B finishes f2fs_write_checkpoint. After SPO and reboot, it was detected that {struct node_info}->blk_addr was not NULL_ADDR because Thread B successfully write the checkpoint. This issue only occurs in atomic write scenarios. For regular file fsync operations, the folio must be dirty. If block_operations->f2fs_sync_node_pages successfully submit the folio write, this path will not be executed. 
Otherwise, the f2fs_write_checkpoint will need to wait for the folio write submission to complete, as sbi->nr_pages[F2FS_DIRTY_NODES] > 0. Therefore, the situation where f2fs_need_dentry_mark checks that the {struct nat_entry}->flag w/o the IS_CHECKPOINTED flag, but the folio write has already been submitted, will not occur. Therefore, for atomic file fsync, sbi->node_write should be acquired through __write_node_folio to ensure that the IS_CHECKPOINTED flag correctly indicates that the checkpoint write has been completed. Fixes: 608514deba38 ("f2fs: set fsync mark only for the last dnode") Cc: stable@kernel.org Signed-off-by: Sheng Yong Signed-off-by: Jinbao Liu Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d378549010e6..99e425e8c00a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1786,8 +1786,13 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) - fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + if (atomic) { + if (!test_opt(sbi, NOBARRIER)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + if (IS_INODE(folio)) + set_dentry_mark(folio, + f2fs_need_dentry_mark(sbi, ino_of_node(folio))); + } /* should add to global list before clearing PAGECACHE status */ if (f2fs_in_warm_node_list(sbi, folio)) { @@ -1928,8 +1933,9 @@ continue_unlock: if (is_inode_flag_set(inode, FI_DIRTY_INODE)) f2fs_update_inode(inode, folio); - set_dentry_mark(folio, - f2fs_need_dentry_mark(sbi, ino)); + if (!atomic) + set_dentry_mark(folio, + f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ if (!folio_test_dirty(folio)) From 0eda086de85e140f53c6123a4c00662f4e614ee4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 6 Jan 2026 14:31:17 +0800 Subject: [PATCH 31/61] f2fs: fix to check sysfs filename w/ gc_pin_file_thresh
correctly The sysfs entry name is gc_pin_file_thresh, not gc_pin_file_threshold; fix the name check. Cc: stable@kernel.org Fixes: c521a6ab4ad7 ("f2fs: fix to limit gc_pin_file_threshold") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 91bc0544ba1f..cd22bfe75c45 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -797,7 +797,7 @@ out: return count; } - if (!strcmp(a->attr.name, "gc_pin_file_threshold")) { + if (!strcmp(a->attr.name, "gc_pin_file_thresh")) { if (t > MAX_GC_FAILED_PINNED_FILES) return -EINVAL; sbi->gc_pin_file_threshold = t; From 3996b70209f145bfcf2afc7d05dd92c27b233b48 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 16 Jan 2026 11:38:16 +0800 Subject: [PATCH 32/61] Revert "f2fs: block cache/dio write during f2fs_enable_checkpoint()" This reverts commit 196c81fdd438f7ac429d5639090a9816abb9760a. The original patch may cause the deadlock below, so revert it. write remount - write_begin - lock_page --- lock A - prepare_write_begin - f2fs_map_lock - f2fs_enable_checkpoint - down_write(cp_enable_rwsem) --- lock B - sync_inodes_sb - writepages - lock_page --- lock A - down_read(cp_enable_rwsem) --- lock B Cc: stable@kernel.org Fixes: 196c81fdd438 ("f2fs: block cache/dio write during f2fs_enable_checkpoint()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 -- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/super.c | 38 ++++++++------------------------------ 3 files changed, 9 insertions(+), 34 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f32eb51ccee4..2e133a723b99 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1474,7 +1474,6 @@ static void f2fs_map_lock(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc, int flag) { - f2fs_down_read(&sbi->cp_enable_rwsem); if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_down_read_trace(&sbi->node_change, lc); else @@ -1489,7 +1488,6 @@ static void f2fs_map_unlock(struct f2fs_sb_info
*sbi, f2fs_up_read_trace(&sbi->node_change, lc); else f2fs_unlock_op(sbi, lc); - f2fs_up_read(&sbi->cp_enable_rwsem); } int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ded41b416ed7..90aa1d53722a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -311,7 +311,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ -#define DEF_ENABLE_INTERVAL 5 /* 5 secs */ +#define DEF_ENABLE_INTERVAL 16 /* 16 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ @@ -1762,7 +1762,6 @@ struct f2fs_sb_info { long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct cp_stats cp_stats; /* for time stat of checkpoint */ - struct f2fs_rwsem cp_enable_rwsem; /* block cache/dio write */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4573bac94793..25f796232ad9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2687,12 +2687,11 @@ restore_flag: static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; - long long start, writeback, lock, sync_inode, end; + long long start, writeback, end; int ret; struct f2fs_lock_context lc; - f2fs_info(sbi, "%s start, meta: %lld, node: %lld, data: %lld", - __func__, + f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", get_pages(sbi, F2FS_DIRTY_META), get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DATA)); @@ -2711,18 +2710,11 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) } writeback = ktime_get(); - f2fs_down_write(&sbi->cp_enable_rwsem); - - lock = ktime_get(); - - if (get_pages(sbi, F2FS_DIRTY_DATA)) - sync_inodes_sb(sbi->sb); + sync_inodes_sb(sbi->sb); if 
(unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) - f2fs_warn(sbi, "%s: has some unwritten data: %lld", - __func__, get_pages(sbi, F2FS_DIRTY_DATA)); - - sync_inode = ktime_get(); + f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", + get_pages(sbi, F2FS_DIRTY_DATA)); f2fs_down_write_trace(&sbi->gc_lock, &lc); f2fs_dirty_to_prefree(sbi); @@ -2731,13 +2723,6 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) set_sbi_flag(sbi, SBI_IS_DIRTY); f2fs_up_write_trace(&sbi->gc_lock, &lc); - f2fs_info(sbi, "%s sync_fs, meta: %lld, imeta: %lld, node: %lld, dents: %lld, qdata: %lld", - __func__, - get_pages(sbi, F2FS_DIRTY_META), - get_pages(sbi, F2FS_DIRTY_IMETA), - get_pages(sbi, F2FS_DIRTY_NODES), - get_pages(sbi, F2FS_DIRTY_DENTS), - get_pages(sbi, F2FS_DIRTY_QDATA)); ret = f2fs_sync_fs(sbi->sb, 1); if (ret) f2fs_err(sbi, "%s sync_fs failed, ret: %d", __func__, ret); @@ -2745,17 +2730,11 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); - f2fs_up_write(&sbi->cp_enable_rwsem); - end = ktime_get(); - f2fs_info(sbi, "%s end, writeback:%llu, " - "lock:%llu, sync_inode:%llu, sync_fs:%llu", - __func__, - ktime_ms_delta(writeback, start), - ktime_ms_delta(lock, writeback), - ktime_ms_delta(sync_inode, lock), - ktime_ms_delta(end, sync_inode)); + f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", + ktime_ms_delta(writeback, start), + ktime_ms_delta(end, writeback)); return ret; } @@ -4952,7 +4931,6 @@ try_onemore: init_f2fs_rwsem_trace(&sbi->node_change, sbi, LOCK_NAME_NODE_CHANGE); spin_lock_init(&sbi->stat_lock); init_f2fs_rwsem_trace(&sbi->cp_rwsem, sbi, LOCK_NAME_CP_RWSEM); - init_f2fs_rwsem(&sbi->cp_enable_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); spin_lock_init(&sbi->error_lock); From ce2739e482bce8d2c014d76c4531c877f382aa54 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 7 Jan 2026 19:22:18 
+0800 Subject: [PATCH 33/61] f2fs: fix to avoid UAF in f2fs_write_end_io() syzbot reported a use-after-free issue in f2fs_write_end_io(). It is caused by the race condition below: loop device umount - worker_thread - loop_process_work - do_req_filebacked - lo_rw_aio - lo_rw_aio_complete - blk_mq_end_request - blk_update_request - f2fs_write_end_io - dec_page_count - folio_end_writeback - kill_f2fs_super - kill_block_super - f2fs_put_super : free(sbi) : get_pages(, F2FS_WB_CP_DATA) accessed sbi which is freed In kill_f2fs_super(), we drop all page caches of f2fs inodes before calling free(sbi), which guarantees that all folios have ended their writeback, so it is safe to access sbi before the last folio_end_writeback(). Let's relocate the ckpt thread wakeup flow before folio_end_writeback() to resolve this issue. Cc: stable@kernel.org Fixes: e234088758fc ("f2fs: avoid wait if IO end up when do_checkpoint for better performance") Reported-by: syzbot+b4444e3c972a7a124187@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b4444e3c972a7a124187 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2e133a723b99..f461f1318b4c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -378,14 +378,20 @@ static void f2fs_write_end_io(struct bio *bio) folio->index != nid_of_node(folio)); dec_page_count(sbi, type); + + /* + * we should access sbi before folio_end_writeback() to + * avoid racing w/ kill_f2fs_super() + */ + if (type == F2FS_WB_CP_DATA && !get_pages(sbi, type) && + wq_has_sleeper(&sbi->cp_wait)) + wake_up(&sbi->cp_wait); + if (f2fs_in_warm_node_list(sbi, folio)) f2fs_del_fsync_node_entry(sbi, folio); folio_clear_f2fs_gcing(folio); folio_end_writeback(folio); } - if (!get_pages(sbi, F2FS_WB_CP_DATA) && - wq_has_sleeper(&sbi->cp_wait)) - wake_up(&sbi->cp_wait); bio_put(bio); } From 1dd3b437d49ce09f0bd72acc1d694e212f26d1fe
Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 15 Dec 2025 20:28:03 +0800 Subject: [PATCH 34/61] f2fs: make FAULT_DISCARD obsolete __blkdev_issue_discard() in __submit_discard_cmd() will never fail, so let's make FAULT_DISCARD fault injection obsolete. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- Documentation/filesystems/f2fs.rst | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 18 +++--------------- 4 files changed, 6 insertions(+), 18 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ca9ed3b44b31..7398b369784c 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -732,7 +732,7 @@ Description: Support configuring fault injection type, should be FAULT_TRUNCATE 0x00000400 FAULT_READ_IO 0x00000800 FAULT_CHECKPOINT 0x00001000 - FAULT_DISCARD 0x00002000 + FAULT_DISCARD 0x00002000 (obsolete) FAULT_WRITE_IO 0x00004000 FAULT_SLAB_ALLOC 0x00008000 FAULT_DQUOT_INIT 0x00010000 diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 33d2166ac6b7..fc005f2eaf86 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -206,7 +206,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_TRUNCATE 0x00000400 FAULT_READ_IO 0x00000800 FAULT_CHECKPOINT 0x00001000 - FAULT_DISCARD 0x00002000 + FAULT_DISCARD 0x00002000 (obsolete) FAULT_WRITE_IO 0x00004000 FAULT_SLAB_ALLOC 0x00008000 FAULT_DQUOT_INIT 0x00010000 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 90aa1d53722a..579aafb0055f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -54,7 +54,7 @@ enum { FAULT_TRUNCATE, FAULT_READ_IO, FAULT_CHECKPOINT, - FAULT_DISCARD, + FAULT_DISCARD, /* it's obsolete due to __blkdev_issue_discard() will never fail */ FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 
587ae3b4bfd8..74fca1d4b8ed 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1287,7 +1287,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, &(dcc->fstrim_list) : &(dcc->wait_list); blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; - int err = 0; if (dc->state != D_PREP) return 0; @@ -1328,7 +1327,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->di.len = 0; - while (total_len && *issued < dpolicy->max_requests && !err) { + while (total_len && *issued < dpolicy->max_requests) { struct bio *bio = NULL; unsigned long flags; bool last = true; @@ -1344,17 +1343,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->di.len += len; - err = 0; - if (time_to_inject(sbi, FAULT_DISCARD)) { - err = -EIO; - spin_lock_irqsave(&dc->lock, flags); - if (dc->state == D_PARTIAL) - dc->state = D_SUBMIT; - spin_unlock_irqrestore(&dc->lock, flags); - - break; - } - __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); f2fs_bug_on(sbi, !bio); @@ -1393,11 +1381,11 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, len = total_len; } - if (!err && len) { + if (len) { dcc->undiscard_blks -= len; __update_discard_tree_range(sbi, bdev, lstart, start, len); } - return err; + return 0; } static void __insert_discard_cmd(struct f2fs_sb_info *sbi, From e48e16f3e37fac76e2f0c14c58df2b0398a323b0 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sat, 10 Jan 2026 15:54:05 -0800 Subject: [PATCH 35/61] f2fs: support non-4KB block size without packed_ssa feature Currently, F2FS requires the packed_ssa feature to be enabled when utilizing non-4KB block sizes (e.g., 16KB). This restriction limits the flexibility of filesystem formatting options. This patch allows F2FS to support non-4KB block sizes even when the packed_ssa feature is disabled. It adjusts the SSA calculation logic to correctly handle summary entries in larger blocks without the packed layout. 
Cc: stable@kernel.org Fixes: 7ee8bc3942f2 ("f2fs: revert summary entry count from 2048 to 512 in 16kb block support") Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 52 ++++++++++++++++++------- fs/f2fs/gc.c | 23 +++++------ fs/f2fs/node.c | 12 +++--- fs/f2fs/recovery.c | 6 +-- fs/f2fs/segment.c | 86 ++++++++++++++++++++++------------------- fs/f2fs/segment.h | 9 ++--- fs/f2fs/super.c | 26 ++++++------- include/linux/f2fs_fs.h | 73 ++++++++++++++++++++-------------- 8 files changed, 165 insertions(+), 122 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 579aafb0055f..f82c9c424748 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -545,13 +545,25 @@ struct fsync_inode_entry { #define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) #define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) \ + (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].ne) +#define nid_in_journal(jnl, i) \ + (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].nid) +#define sit_in_journal(jnl, i) \ + (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].se) +#define segno_in_journal(jnl, i) \ + (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].segno) -#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) -#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) +#define sum_entries(sum) ((struct f2fs_summary *)(sum)) +#define sum_journal(sbi, sum) \ + ((struct f2fs_journal *)((char *)(sum) + \ + ((sbi)->entries_in_sum * sizeof(struct f2fs_summary)))) +#define sum_footer(sbi, sum) \ + ((struct summary_footer *)((char *)(sum) + (sbi)->sum_blocksize - \ + sizeof(struct summary_footer))) + +#define 
MAX_NAT_JENTRIES(sbi, jnl) ((sbi)->nat_journal_entries - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(sbi, jnl) ((sbi)->sit_journal_entries - sits_in_cursum(jnl)) static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { @@ -569,14 +581,6 @@ static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) return before; } -static inline bool __has_cursum_space(struct f2fs_journal *journal, - int size, int type) -{ - if (type == NAT_JOURNAL) - return size <= MAX_NAT_JENTRIES(journal); - return size <= MAX_SIT_JENTRIES(journal); -} - /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int get_extra_isize(struct inode *inode); @@ -1809,6 +1813,15 @@ struct f2fs_sb_info { bool readdir_ra; /* readahead inode in readdir */ unsigned int max_io_bytes; /* max io bytes to merge IOs */ + /* variable summary block units */ + unsigned int sum_blocksize; /* sum block size */ + unsigned int sums_per_block; /* sum block count per block */ + unsigned int entries_in_sum; /* entry count in sum block */ + unsigned int sum_entry_size; /* total entry size in sum block */ + unsigned int sum_journal_size; /* journal size in sum block */ + unsigned int nat_journal_entries; /* nat journal entry count in the journal */ + unsigned int sit_journal_entries; /* sit journal entry count in the journal */ + block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ @@ -2850,6 +2863,14 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +static inline bool __has_cursum_space(struct f2fs_sb_info *sbi, + struct f2fs_journal *journal, int size, int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(sbi, journal); + return size <= MAX_SIT_JENTRIES(sbi, journal); +} + extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int 
inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) @@ -3993,7 +4014,8 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +int f2fs_lookup_journal_in_cursum(struct f2fs_sb_info *sbi, + struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1538f5b0a644..60378614bc54 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1782,8 +1782,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); - segno = rounddown(segno, SUMS_PER_BLOCK); - sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK); + segno = rounddown(segno, sbi->sums_per_block); + sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, sbi->sums_per_block); /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), @@ -1793,17 +1793,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, while (segno < end_segno) { struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno); - segno += SUMS_PER_BLOCK; + segno += sbi->sums_per_block; if (IS_ERR(sum_folio)) { int err = PTR_ERR(sum_folio); - end_segno = segno - SUMS_PER_BLOCK; - segno = rounddown(start_segno, SUMS_PER_BLOCK); + end_segno = segno - sbi->sums_per_block; + segno = rounddown(start_segno, sbi->sums_per_block); while (segno < end_segno) { sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); folio_put_refs(sum_folio, 2); - segno += SUMS_PER_BLOCK; + segno += sbi->sums_per_block; } return 
err; } @@ -1819,8 +1819,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* find segment summary of victim */ struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK) - + SUMS_PER_BLOCK; + unsigned int block_end_segno = rounddown(segno, sbi->sums_per_block) + + sbi->sums_per_block; if (block_end_segno > end_segno) block_end_segno = end_segno; @@ -1846,12 +1846,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, migrated >= sbi->migration_granularity) continue; - sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno); - if (type != GET_SUM_TYPE((&sum->footer))) { + sum = SUM_BLK_PAGE_ADDR(sbi, sum_folio, cur_segno); + if (type != GET_SUM_TYPE(sum_footer(sbi, sum))) { f2fs_err(sbi, "Inconsistent segment (%u) type " "[%d, %d] in SSA and SIT", cur_segno, type, - GET_SUM_TYPE((&sum->footer))); + GET_SUM_TYPE( + sum_footer(sbi, sum))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); continue; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 99e425e8c00a..00587e783b44 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -606,7 +606,7 @@ retry: goto retry; } - i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); + i = f2fs_lookup_journal_in_cursum(sbi, journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); @@ -2955,7 +2955,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, /* scan the node segment */ last_offset = BLKS_PER_SEG(sbi); addr = START_BLOCK(sbi, segno); - sum_entry = &sum->entries[0]; + sum_entry = sum_entries(sum); for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { nrpages = bio_max_segs(last_offset - i); @@ -3096,7 +3096,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #2, flush nat entries to nat page. 
*/ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + !__has_cursum_space(sbi, journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -3119,7 +3119,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { - offset = f2fs_lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(sbi, journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); raw_ne = &nat_in_journal(journal, offset); @@ -3190,7 +3190,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * into nat entry set. */ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, + !__has_cursum_space(sbi, journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3201,7 +3201,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, - MAX_NAT_JENTRIES(journal)); + MAX_NAT_JENTRIES(sbi, journal)); } /* flush dirty nats in nat entry set */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 39f6e9830a9c..a26071f2b0bc 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -514,7 +514,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { - sum = curseg->sum_blk->entries[blkoff]; + sum = sum_entries(curseg->sum_blk)[blkoff]; goto got_it; } } @@ -522,8 +522,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, sum_folio = f2fs_get_sum_folio(sbi, segno); if (IS_ERR(sum_folio)) return PTR_ERR(sum_folio); - sum_node = SUM_BLK_PAGE_ADDR(sum_folio, segno); - sum = sum_node->entries[blkoff]; + sum_node = SUM_BLK_PAGE_ADDR(sbi, sum_folio, segno); + sum = sum_entries(sum_node)[blkoff]; f2fs_folio_put(sum_folio, true); got_it: /* Use the locked dnode 
page and inode */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 74fca1d4b8ed..00870a8fe387 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2674,12 +2674,12 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } - sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (sbi->sum_blocksize - 2 * sbi->sum_journal_size - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (sbi->sum_blocksize - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -2699,7 +2699,7 @@ void f2fs_update_meta_page(struct f2fs_sb_info *sbi, { struct folio *folio; - if (SUMS_PER_BLOCK == 1) + if (!f2fs_sb_has_packed_ssa(sbi)) folio = f2fs_grab_meta_folio(sbi, blk_addr); else folio = f2fs_get_meta_folio_retry(sbi, blk_addr); @@ -2717,7 +2717,7 @@ static void write_sum_page(struct f2fs_sb_info *sbi, { struct folio *folio; - if (SUMS_PER_BLOCK == 1) + if (!f2fs_sb_has_packed_ssa(sbi)) return f2fs_update_meta_page(sbi, (void *)sum_blk, GET_SUM_BLOCK(sbi, segno)); @@ -2725,7 +2725,8 @@ static void write_sum_page(struct f2fs_sb_info *sbi, if (IS_ERR(folio)) return; - memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk)); + memcpy(SUM_BLK_PAGE_ADDR(sbi, folio, segno), sum_blk, + sbi->sum_blocksize); folio_mark_dirty(folio); f2fs_folio_put(folio, true); } @@ -2744,11 +2745,11 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, mutex_lock(&curseg->curseg_mutex); down_read(&curseg->journal_rwsem); - memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + memcpy(sum_journal(sbi, dst), curseg->journal, sbi->sum_journal_size); up_read(&curseg->journal_rwsem); - memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); - memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + memcpy(sum_entries(dst), sum_entries(src), sbi->sum_entry_size); + 
memcpy(sum_footer(sbi, dst), sum_footer(sbi, src), SUM_FOOTER_SIZE); mutex_unlock(&curseg->curseg_mutex); @@ -2921,7 +2922,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; - sum_footer = &(curseg->sum_blk->footer); + sum_footer = sum_footer(sbi, curseg->sum_blk); memset(sum_footer, 0, sizeof(struct summary_footer)); sanity_check_seg_type(sbi, seg_type); @@ -3067,11 +3068,11 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) sum_folio = f2fs_get_sum_folio(sbi, new_segno); if (IS_ERR(sum_folio)) { /* GC won't be able to use stale summary pages by cp_error */ - memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + memset(curseg->sum_blk, 0, sbi->sum_entry_size); return PTR_ERR(sum_folio); } - sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + sum_node = SUM_BLK_PAGE_ADDR(sbi, sum_folio, new_segno); + memcpy(curseg->sum_blk, sum_node, sbi->sum_entry_size); f2fs_folio_put(sum_folio, true); return 0; } @@ -3805,7 +3806,7 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, f2fs_wait_discard_bio(sbi, *new_blkaddr); - curseg->sum_blk->entries[curseg->next_blkoff] = *sum; + sum_entries(curseg->sum_blk)[curseg->next_blkoff] = *sum; if (curseg->alloc_type == SSR) { curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); } else { @@ -4174,7 +4175,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); - curseg->sum_blk->entries[curseg->next_blkoff] = *sum; + sum_entries(curseg->sum_blk)[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) @@ -4294,12 +4295,12 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, 
sbi->sum_journal_size); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); - offset = 2 * SUM_JOURNAL_SIZE; + memcpy(seg_i->journal, kaddr + sbi->sum_journal_size, sbi->sum_journal_size); + offset = 2 * sbi->sum_journal_size; /* Step 3: restore summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { @@ -4321,9 +4322,9 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) struct f2fs_summary *s; s = (struct f2fs_summary *)(kaddr + offset); - seg_i->sum_blk->entries[j] = *s; + sum_entries(seg_i->sum_blk)[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_SIZE - + if (offset + SUMMARY_SIZE <= sbi->sum_blocksize - SUM_FOOTER_SIZE) continue; @@ -4379,7 +4380,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) if (IS_NODESEG(type)) { if (__exist_node_summaries(sbi)) { - struct f2fs_summary *ns = &sum->entries[0]; + struct f2fs_summary *ns = sum_entries(sum); int i; for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { @@ -4399,11 +4400,13 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) /* update journal info */ down_write(&curseg->journal_rwsem); - memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + memcpy(curseg->journal, sum_journal(sbi, sum), sbi->sum_journal_size); up_write(&curseg->journal_rwsem); - memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); - memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); + memcpy(sum_entries(curseg->sum_blk), sum_entries(sum), + sbi->sum_entry_size); + memcpy(sum_footer(sbi, curseg->sum_blk), sum_footer(sbi, sum), + SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; @@ -4447,8 +4450,8 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } /* sanity check for summary blocks */ - if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || - 
sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { + if (nats_in_cursum(nat_j) > sbi->nat_journal_entries || + sits_in_cursum(sit_j) > sbi->sit_journal_entries) { f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; @@ -4472,13 +4475,13 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); - written_size += SUM_JOURNAL_SIZE; + memcpy(kaddr, seg_i->journal, sbi->sum_journal_size); + written_size += sbi->sum_journal_size; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); - written_size += SUM_JOURNAL_SIZE; + memcpy(kaddr + written_size, seg_i->journal, sbi->sum_journal_size); + written_size += sbi->sum_journal_size; /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { @@ -4491,7 +4494,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); - *summary = seg_i->sum_blk->entries[j]; + *summary = sum_entries(seg_i->sum_blk)[j]; written_size += SUMMARY_SIZE; if (written_size + SUMMARY_SIZE <= PAGE_SIZE - @@ -4536,8 +4539,9 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, - unsigned int val, int alloc) +int f2fs_lookup_journal_in_cursum(struct f2fs_sb_info *sbi, + struct f2fs_journal *journal, int type, + unsigned int val, int alloc) { int i; @@ -4546,13 +4550,13 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + if (alloc 
&& __has_cursum_space(sbi, journal, 1, NAT_JOURNAL)) return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(journal); i++) if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + if (alloc && __has_cursum_space(sbi, journal, 1, SIT_JOURNAL)) return update_sits_in_cursum(journal, 1); } return -1; @@ -4700,8 +4704,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and add and account * them in sit entry set. */ - if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || - !to_journal) + if (!__has_cursum_space(sbi, journal, + sit_i->dirty_sentries, SIT_JOURNAL) || !to_journal) remove_sits_in_journal(sbi); /* @@ -4718,7 +4722,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int segno = start_segno; if (to_journal && - !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) + !__has_cursum_space(sbi, journal, ses->entry_cnt, + SIT_JOURNAL)) to_journal = false; if (to_journal) { @@ -4746,7 +4751,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = f2fs_lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(sbi, journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = @@ -4953,12 +4958,13 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); + array[i].sum_blk = f2fs_kzalloc(sbi, sbi->sum_blocksize, + GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); array[i].journal = f2fs_kzalloc(sbi, - sizeof(struct f2fs_journal), GFP_KERNEL); + sbi->sum_journal_size, GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].seg_type = 
log_type_to_seg_type(i); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 20daaccb34a5..068845660b0f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -90,12 +90,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) -#define SUMS_PER_BLOCK (F2FS_BLKSIZE / F2FS_SUM_BLKSIZE) #define GET_SUM_BLOCK(sbi, segno) \ - (SM_I(sbi)->ssa_blkaddr + (segno / SUMS_PER_BLOCK)) -#define GET_SUM_BLKOFF(segno) (segno % SUMS_PER_BLOCK) -#define SUM_BLK_PAGE_ADDR(folio, segno) \ - (folio_address(folio) + GET_SUM_BLKOFF(segno) * F2FS_SUM_BLKSIZE) + (SM_I(sbi)->ssa_blkaddr + (segno / (sbi)->sums_per_block)) +#define GET_SUM_BLKOFF(sbi, segno) (segno % (sbi)->sums_per_block) +#define SUM_BLK_PAGE_ADDR(sbi, folio, segno) \ + (folio_address(folio) + GET_SUM_BLKOFF(sbi, segno) * (sbi)->sum_blocksize) #define GET_SUM_TYPE(footer) ((footer)->entry_type) #define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 25f796232ad9..1660d663a8c5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4104,20 +4104,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, if (sanity_check_area_boundary(sbi, folio, index)) return -EFSCORRUPTED; - /* - * Check for legacy summary layout on 16KB+ block devices. - * Modern f2fs-tools packs multiple 4KB summary areas into one block, - * whereas legacy versions used one block per summary, leading - * to a much larger SSA. - */ - if (SUMS_PER_BLOCK > 1 && - !(__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_PACKED_SSA))) { - f2fs_info(sbi, "Error: Device formatted with a legacy version. 
" - "Please reformat with a tool supporting the packed ssa " - "feature for block sizes larger than 4kb."); - return -EOPNOTSUPP; - } - return 0; } @@ -4329,6 +4315,18 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic64_set(&sbi->current_atomic_write, 0); sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME; + sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ? + 4096 : sbi->blocksize; + sbi->sums_per_block = sbi->blocksize / sbi->sum_blocksize; + sbi->entries_in_sum = sbi->sum_blocksize / 8; + sbi->sum_entry_size = SUMMARY_SIZE * sbi->entries_in_sum; + sbi->sum_journal_size = sbi->sum_blocksize - SUM_FOOTER_SIZE - + sbi->sum_entry_size; + sbi->nat_journal_entries = (sbi->sum_journal_size - 2) / + sizeof(struct nat_journal_entry); + sbi->sit_journal_entries = (sbi->sum_journal_size - 2) / + sizeof(struct sit_journal_entry); + sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index a7880787cad3..dc41722fcc9d 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -17,7 +17,6 @@ #define F2FS_LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) /* log number for sector/blk */ #define F2FS_BLKSIZE PAGE_SIZE /* support only block == page */ #define F2FS_BLKSIZE_BITS PAGE_SHIFT /* bits for F2FS_BLKSIZE */ -#define F2FS_SUM_BLKSIZE 4096 /* only support 4096 byte sum block */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ #define F2FS_EXTENSION_LEN 8 /* max size of extension */ @@ -442,10 +441,8 @@ struct f2fs_sit_block { * from node's page's beginning to get a data block address. 
* ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node) */ -#define ENTRIES_IN_SUM (F2FS_SUM_BLKSIZE / 8) #define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */ #define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */ -#define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM) /* a summary entry for a block in a segment */ struct f2fs_summary { @@ -468,22 +465,6 @@ struct summary_footer { __le32 check_sum; /* summary checksum */ } __packed; -#define SUM_JOURNAL_SIZE (F2FS_SUM_BLKSIZE - SUM_FOOTER_SIZE -\ - SUM_ENTRY_SIZE) -#define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\ - sizeof(struct nat_journal_entry)) -#define NAT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ - sizeof(struct nat_journal_entry)) -#define SIT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\ - sizeof(struct sit_journal_entry)) -#define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ - sizeof(struct sit_journal_entry)) - -/* Reserved area should make size of f2fs_extra_info equals to - * that of nat_journal and sit_journal. - */ -#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8) - /* * frequently updated NAT/SIT entries can be stored in the spare area in * summary blocks @@ -498,9 +479,16 @@ struct nat_journal_entry { struct f2fs_nat_entry ne; } __packed; +/* + * The nat_journal structure is a placeholder whose actual size varies depending + * on the use of packed_ssa. Therefore, it must always be accessed only through + * specific sets of macros and fields, and size calculations should use + * size-related macros instead of sizeof(). + * Relevant macros: sbi->nat_journal_entries, nat_in_journal(), + * nid_in_journal(), MAX_NAT_JENTRIES(). 
+ */ struct nat_journal { - struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES]; - __u8 reserved[NAT_JOURNAL_RESERVED]; + struct nat_journal_entry entries[0]; } __packed; struct sit_journal_entry { @@ -508,14 +496,21 @@ struct sit_journal_entry { struct f2fs_sit_entry se; } __packed; +/* + * The sit_journal structure is a placeholder whose actual size varies depending + * on the use of packed_ssa. Therefore, it must always be accessed only through + * specific sets of macros and fields, and size calculations should use + * size-related macros instead of sizeof(). + * Relevant macros: sbi->sit_journal_entries, sit_in_journal(), + * segno_in_journal(), MAX_SIT_JENTRIES(). + */ struct sit_journal { - struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES]; - __u8 reserved[SIT_JOURNAL_RESERVED]; + struct sit_journal_entry entries[0]; } __packed; struct f2fs_extra_info { __le64 kbytes_written; - __u8 reserved[EXTRA_INFO_RESERVED]; + __u8 reserved[]; } __packed; struct f2fs_journal { @@ -531,11 +526,33 @@ struct f2fs_journal { }; } __packed; -/* Block-sized summary block structure */ +/* + * Block-sized summary block structure + * + * The f2fs_summary_block structure is a placeholder whose actual size varies + * depending on the use of packed_ssa. Therefore, it must always be accessed + * only through specific sets of macros and fields, and size calculations should + * use size-related macros instead of sizeof(). + * Relevant macros: sbi->sum_blocksize, sbi->entries_in_sum, + * sbi->sum_entry_size, sum_entries(), sum_journal(), sum_footer(). + * + * Summary Block Layout + * + * +-----------------------+ <--- Block Start + * | struct f2fs_summary | + * | entries[0] | + * | ... 
| + * | entries[N-1] | + * +-----------------------+ + * | struct f2fs_journal | + * +-----------------------+ + * | struct summary_footer | + * +-----------------------+ <--- Block End + */ struct f2fs_summary_block { - struct f2fs_summary entries[ENTRIES_IN_SUM]; - struct f2fs_journal journal; - struct summary_footer footer; + struct f2fs_summary entries[0]; + // struct f2fs_journal journal; + // struct summary_footer footer; } __packed; /* From f7b929eda1f1c28ec80ab613cb23410d84755591 Mon Sep 17 00:00:00 2001 From: Yangyang Zang Date: Mon, 12 Jan 2026 15:46:35 +0800 Subject: [PATCH 36/61] f2fs: clean up the type parameter in f2fs_sync_meta_pages() Clean up code to improve readability, no logic changes. Signed-off-by: Yangyang Zang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 15 +++++++-------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/file.c | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3dfc83a0813e..8bb38cfcce6e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -534,7 +534,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); + written = f2fs_sync_meta_pages(sbi, wbc->nr_to_write, FS_META_IO); f2fs_up_write_trace(&sbi->cp_global_sem, &lc); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -545,8 +545,8 @@ skip_write: return 0; } -long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write, enum iostat_type io_type) +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write, + enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, prev = ULONG_MAX; @@ -607,7 +607,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_write(sbi, type); + 
f2fs_submit_merged_write(sbi, META); blk_finish_plug(&plug); @@ -1450,8 +1450,7 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) break; if (type == F2FS_DIRTY_META) - f2fs_sync_meta_pages(sbi, META, LONG_MAX, - FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO); else if (type == F2FS_WB_CP_DATA) f2fs_submit_merged_write(sbi, DATA); @@ -1623,7 +1622,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) int err; /* Flush all the NAT/SIT pages */ - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO); stat_cp_time(cpc, CP_TIME_SYNC_META); @@ -1722,7 +1721,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* Here, we have one bio having CP pack except cp pack 2 page */ - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO); stat_cp_time(cpc, CP_TIME_SYNC_CP_META); /* Wait for all dirty meta pages to be submitted for IO */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f82c9c424748..ae78b8e1ca0c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4068,8 +4068,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, unsigned int ra_blocks); -long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write, enum iostat_type io_type); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write, + enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce291f152bc3..abcf6f486dd7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2463,7 +2463,7 @@ int f2fs_do_shutdown(struct 
f2fs_sb_info *sbi, unsigned int flag, f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: From 0a736109c9d29de0c26567e42cb99b27861aa8ba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jan 2026 15:49:15 +0800 Subject: [PATCH 37/61] f2fs: fix to do sanity check on node footer in __write_node_folio() Add node footer sanity check during node folio's writeback, if sanity check fails, let's shutdown filesystem to avoid looping to redirty and writeback in .writepages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 00587e783b44..30e26b878af0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1763,7 +1763,11 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted /* get old block addr of this node page */ nid = nid_of_node(folio); - f2fs_bug_on(sbi, folio->index != nid); + + if (sanity_check_node_footer(sbi, folio, nid, NODE_TYPE_REGULAR)) { + f2fs_handle_critical_error(sbi, STOP_CP_REASON_CORRUPTED_NID); + goto redirty_out; + } if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; From 50ac3ecd8e05b6bcc350c71a4307d40c030ec7e4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jan 2026 15:49:16 +0800 Subject: [PATCH 38/61] f2fs: fix to do sanity check on node footer in {read,write}_end_io -----------[ cut here ]------------ kernel BUG at fs/f2fs/data.c:358! 
Call Trace: blk_update_request+0x5eb/0xe70 block/blk-mq.c:987 blk_mq_end_request+0x3e/0x70 block/blk-mq.c:1149 blk_complete_reqs block/blk-mq.c:1224 [inline] blk_done_softirq+0x107/0x160 block/blk-mq.c:1229 handle_softirqs+0x283/0x870 kernel/softirq.c:579 __do_softirq kernel/softirq.c:613 [inline] invoke_softirq kernel/softirq.c:453 [inline] __irq_exit_rcu+0xca/0x1f0 kernel/softirq.c:680 irq_exit_rcu+0x9/0x30 kernel/softirq.c:696 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1050 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1050 In f2fs_write_end_io(), it detects there is inconsistency in between node page index (nid) and footer.nid of node page. If footer of node page is corrupted in fuzzed image, then we load corrupted node page w/ async method, e.g. f2fs_ra_node_pages() or f2fs_ra_node_page(), in where we won't do sanity check on node footer, once node page becomes dirty, we will encounter this bug after node page writeback. Cc: stable@kernel.org Reported-by: syzbot+803dd716c4310d16ff3a@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=803dd716c4310d16ff3a Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 ++++++++++-- fs/f2fs/f2fs.h | 11 +++++++++++ fs/f2fs/node.c | 20 +++++++++++--------- fs/f2fs/node.h | 8 -------- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f461f1318b4c..9b70b6d33703 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -172,6 +172,11 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) while (nr_pages--) dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); + if (F2FS_F_SB(folio)->node_inode && is_node_folio(folio) && + f2fs_sanity_check_node_footer(F2FS_F_SB(folio), + folio, folio->index, NODE_TYPE_REGULAR, true)) + bio->bi_status = BLK_STS_IOERR; + if (finished) folio_end_read(folio, bio->bi_status == BLK_STS_OK); } @@ -374,8 +379,11 @@ static void f2fs_write_end_io(struct 
bio *bio) STOP_CP_REASON_WRITE_FAIL); } - f2fs_bug_on(sbi, is_node_folio(folio) && - folio->index != nid_of_node(folio)); + if (is_node_folio(folio)) { + f2fs_sanity_check_node_footer(sbi, folio, + folio->index, NODE_TYPE_REGULAR, true); + f2fs_bug_on(sbi, folio->index != nid_of_node(folio)); + } dec_page_count(sbi, type); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ae78b8e1ca0c..d41210a381cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1572,6 +1572,14 @@ enum f2fs_lookup_mode { LOOKUP_AUTO, }; +/* For node type in __get_node_folio() */ +enum node_type { + NODE_TYPE_REGULAR, + NODE_TYPE_INODE, + NODE_TYPE_XATTR, + NODE_TYPE_NON_INODE, +}; + /* a threshold of maximum elapsed time in critical region to print tracepoint */ #define MAX_LOCK_ELAPSED_TIME 500 @@ -3915,6 +3923,9 @@ struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, enum node_type node_type); +int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi, + struct folio *folio, pgoff_t nid, + enum node_type ntype, bool in_irq); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 30e26b878af0..efd4f176a1f4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1511,9 +1511,9 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_folio_put(afolio, err ? 
true : false); } -static int sanity_check_node_footer(struct f2fs_sb_info *sbi, +int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, - enum node_type ntype) + enum node_type ntype, bool in_irq) { if (unlikely(nid != nid_of_node(folio))) goto out_err; @@ -1538,12 +1538,13 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, goto out_err; return 0; out_err: - f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " - "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(folio), ino_of_node(folio), - ofs_of_node(folio), cpver_of_node(folio), - next_blkaddr_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn_ratelimited(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), + next_blkaddr_of_node(folio)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); return -EFSCORRUPTED; } @@ -1589,7 +1590,7 @@ repeat: goto out_err; } page_hit: - err = sanity_check_node_footer(sbi, folio, nid, ntype); + err = f2fs_sanity_check_node_footer(sbi, folio, nid, ntype, false); if (!err) return folio; out_err: @@ -1764,7 +1765,8 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted /* get old block addr of this node page */ nid = nid_of_node(folio); - if (sanity_check_node_footer(sbi, folio, nid, NODE_TYPE_REGULAR)) { + if (f2fs_sanity_check_node_footer(sbi, folio, nid, + NODE_TYPE_REGULAR, false)) { f2fs_handle_critical_error(sbi, STOP_CP_REASON_CORRUPTED_NID); goto redirty_out; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 9cb8dcf8d417..824ac9f0e6e4 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -52,14 +52,6 @@ enum { IS_PREALLOC, /* nat entry is preallocated */ }; -/* For node type in __get_node_folio() */ -enum node_type { - NODE_TYPE_REGULAR, - NODE_TYPE_INODE, - NODE_TYPE_XATTR, 
- NODE_TYPE_NON_INODE, -}; - /* * For node information */ From 93ffb6c28ff180560d2d7313ac106efcd9e012b8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jan 2026 15:49:17 +0800 Subject: [PATCH 39/61] f2fs: detect more inconsistent cases in sanity_check_node_footer() Let's enhance sanity_check_node_footer() to detect more inconsistent cases as below: Node Type Node Footer Info =================== ============================= NODE_TYPE_REGULAR inode = true and xnode = true NODE_TYPE_INODE inode = false or xnode = true NODE_TYPE_XATTR inode = true or xnode = false NODE_TYPE_NON_INODE inode = false Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index efd4f176a1f4..63252ff1e5c3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1515,20 +1515,29 @@ int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype, bool in_irq) { + bool is_inode, is_xnode; + if (unlikely(nid != nid_of_node(folio))) goto out_err; + is_inode = IS_INODE(folio); + is_xnode = f2fs_has_xattr_block(ofs_of_node(folio)); + switch (ntype) { + case NODE_TYPE_REGULAR: + if (is_inode && is_xnode) + goto out_err; + break; case NODE_TYPE_INODE: - if (!IS_INODE(folio)) + if (!is_inode || is_xnode) goto out_err; break; case NODE_TYPE_XATTR: - if (!f2fs_has_xattr_block(ofs_of_node(folio))) + if (is_inode || !is_xnode) goto out_err; break; case NODE_TYPE_NON_INODE: - if (IS_INODE(folio)) + if (is_inode) goto out_err; break; default: From 540d34c18272d124ef3113b7dbe499304ce0023c Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Fri, 9 Jan 2026 22:46:18 +0800 Subject: [PATCH 40/61] f2fs: avoid unnecessary block mapping lookups in f2fs_read_data_large_folio In the second call to f2fs_map_blocks within f2fs_read_data_large_folio, map.m_len exceeds the logical address space to be read. 
This patch ensures map.m_len does not exceed the required address space. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9b70b6d33703..7aa1cd43898f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2468,7 +2468,7 @@ next_folio: ffs = NULL; nrpages = folio_nr_pages(folio); - for (; nrpages; nrpages--) { + for (; nrpages; nrpages--, max_nr_pages--) { sector_t block_nr; /* * Map blocks using the previous result first. From 6afd05ca6d45b834af36c8e1257e7203b2604583 Mon Sep 17 00:00:00 2001 From: Nanzhe Zhao Date: Sun, 11 Jan 2026 18:09:40 +0800 Subject: [PATCH 41/61] f2fs: add 'folio_in_bio' to handle readahead folios with no BIO submission f2fs_read_data_large_folio() can build a single read BIO across multiple folios during readahead. If a folio ends up having none of its subpages added to the BIO (e.g. all subpages are zeroed / treated as holes), it will never be seen by f2fs_finish_read_bio(), so folio_end_read() is never called. This leaves the folio locked and not marked uptodate. Track whether the current folio has been added to a BIO via a local 'folio_in_bio' bool flag, and when iterating readahead folios, explicitly mark the folio uptodate (on success) and unlock it when nothing was added. 
Signed-off-by: Nanzhe Zhao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7aa1cd43898f..8ca24206fc68 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2448,6 +2448,7 @@ static int f2fs_read_data_large_folio(struct inode *inode, unsigned nrpages; struct f2fs_folio_state *ffs; int ret = 0; + bool folio_in_bio; if (!IS_IMMUTABLE(inode)) return -EOPNOTSUPP; @@ -2463,6 +2464,7 @@ next_folio: if (!folio) goto out; + folio_in_bio = false; index = folio->index; offset = 0; ffs = NULL; @@ -2548,6 +2550,7 @@ submit_and_realloc: offset << PAGE_SHIFT)) goto submit_and_realloc; + folio_in_bio = true; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -2557,6 +2560,10 @@ submit_and_realloc: } trace_f2fs_read_folio(folio, DATA); if (rac) { + if (!folio_in_bio) { + folio_mark_uptodate(folio); + folio_unlock(folio); + } folio = readahead_folio(rac); goto next_folio; } From d194f112a9e6504ea23bd4a7b350c089fae9defd Mon Sep 17 00:00:00 2001 From: Nanzhe Zhao Date: Sun, 11 Jan 2026 18:09:41 +0800 Subject: [PATCH 42/61] f2fs: advance index and offset after zeroing in large folio read In f2fs_read_data_large_folio(), the block zeroing path calls folio_zero_range() and then continues the loop. However, it fails to advance index and offset before continuing. This can cause the loop to repeatedly process the same subpage of the folio, leading to stalls/hangs and incorrect progress when reading large folios with holes/zeroed blocks. Fix it by advancing index and offset unconditionally in the loop iteration, so they are updated even when the zeroing path continues. 
Signed-off-by: Nanzhe Zhao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8ca24206fc68..ef44b8421cd7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2470,7 +2470,7 @@ next_folio: ffs = NULL; nrpages = folio_nr_pages(folio); - for (; nrpages; nrpages--, max_nr_pages--) { + for (; nrpages; nrpages--, max_nr_pages--, index++, offset++) { sector_t block_nr; /* * Map blocks using the previous result first. @@ -2555,8 +2555,6 @@ submit_and_realloc: f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); last_block_in_bio = block_nr; - index++; - offset++; } trace_f2fs_read_folio(folio, DATA); if (rac) { From fe2961fb77e4784261976ca887135b1aecd8a9f1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jan 2026 09:33:20 +0800 Subject: [PATCH 43/61] f2fs: avoid f2fs_map_blocks() for consecutive holes in readpages For consecutive large hole mapping across {d,id,did}nodes, we don't need to call f2fs_map_blocks() to check one hole block at a time; instead, we can use map.m_next_pgofs as a hint of the next potentially valid block, so that we can skip calling f2fs_map_blocks() for the range of [cur_pgofs + 1, .m_next_pgofs).
1) regular case touch /mnt/f2fs/file truncate -s $((1024*1024*1024)) /mnt/f2fs/file time dd if=/mnt/f2fs/file of=/dev/null bs=1M count=1024 Before: real 0m0.706s user 0m0.000s sys 0m0.706s After: real 0m0.620s user 0m0.008s sys 0m0.611s 2) large folio case touch /mnt/f2fs/file truncate -s $((1024*1024*1024)) /mnt/f2fs/file f2fs_io setflags immutable /mnt/f2fs/file sync echo 3 > /proc/sys/vm/drop_caches time dd if=/mnt/f2fs/file of=/dev/null bs=1M count=1024 Before: real 0m0.438s user 0m0.004s sys 0m0.433s After: real 0m0.368s user 0m0.004s sys 0m0.364s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ef44b8421cd7..3c2af0ef62bb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2164,10 +2164,13 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, /* * Map blocks using the previous result first. */ - if ((map->m_flags & F2FS_MAP_MAPPED) && - block_in_file > map->m_lblk && + if (map->m_flags & F2FS_MAP_MAPPED) { + if (block_in_file > map->m_lblk && block_in_file < (map->m_lblk + map->m_len)) + goto got_it; + } else if (block_in_file < *map->m_next_pgofs) { goto got_it; + } /* * Then do more f2fs_map_blocks() calls until we are @@ -2442,7 +2445,7 @@ static int f2fs_read_data_large_folio(struct inode *inode, struct bio *bio = NULL; sector_t last_block_in_bio = 0; struct f2fs_map_blocks map = {0, }; - pgoff_t index, offset; + pgoff_t index, offset, next_pgofs = 0; unsigned max_nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio); unsigned nrpages; @@ -2475,16 +2478,21 @@ next_folio: /* * Map blocks using the previous result first. 
*/ - if ((map.m_flags & F2FS_MAP_MAPPED) && - index > map.m_lblk && + if (map.m_flags & F2FS_MAP_MAPPED) { + if (index > map.m_lblk && index < (map.m_lblk + map.m_len)) + goto got_it; + } else if (index < next_pgofs) { + /* hole case */ goto got_it; + } /* * Then do more f2fs_map_blocks() calls until we are * done with this page. */ memset(&map, 0, sizeof(map)); + map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; map.m_lblk = index; map.m_len = max_nr_pages; @@ -2611,6 +2619,7 @@ static int f2fs_mpage_readpages(struct inode *inode, pgoff_t nc_cluster_idx = NULL_CLUSTER; pgoff_t index; #endif + pgoff_t next_pgofs = 0; unsigned nr_pages = rac ? readahead_count(rac) : 1; struct address_space *mapping = rac ? rac->mapping : folio->mapping; unsigned max_nr_pages = nr_pages; @@ -2631,7 +2640,7 @@ static int f2fs_mpage_readpages(struct inode *inode, map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - map.m_next_pgofs = NULL; + map.m_next_pgofs = &next_pgofs; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; From 5c145c03188bc9ba1c29e0bc4d527a5978fc47f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Jan 2026 14:22:29 +0800 Subject: [PATCH 44/61] f2fs: fix to avoid mapping wrong physical block for swapfile Xiaolong Guo reported a f2fs bug in bugzilla [1] [1] https://bugzilla.kernel.org/show_bug.cgi?id=220951 Quoted: "When using stress-ng's swap stress test on F2FS filesystem with kernel 6.6+, the system experiences data corruption leading to either: 1 dm-verity corruption errors and device reboot 2 F2FS node corruption errors and boot hangs The issue occurs specifically when: 1 Using F2FS filesystem (ext4 is unaffected) 2 Swapfile size is less than F2FS section size (2MB) 3 Swapfile has fragmented physical layout (multiple non-contiguous extents) 4 Kernel version is 6.6+ (6.1 is unaffected) The root cause is in check_swap_activate() function in fs/f2fs/data.c. 
When the first extent of a small swapfile (< 2MB) is not aligned to section boundaries, the function incorrectly treats it as the last extent, failing to map subsequent extents. This results in incorrect swap_extent creation where only the first extent is mapped, causing subsequent swap writes to overwrite wrong physical locations (other files' data). Steps to Reproduce 1 Setup a device with F2FS-formatted userdata partition 2 Compile stress-ng from https://github.com/ColinIanKing/stress-ng 3 Run swap stress test: (Android devices) adb shell "cd /data/stressng; ./stress-ng-64 --metrics-brief --timeout 60 --swap 0" Log: 1 Ftrace shows in kernel 6.6, only first extent is mapped during second f2fs_map_blocks call in check_swap_activate(): stress-ng-swap-8990: f2fs_map_blocks: ino=11002, file offset=0, start blkaddr=0x43143, len=0x1 (Only 4KB mapped, not the full swapfile) 2 in kernel 6.1, both extents are correctly mapped: stress-ng-swap-5966: f2fs_map_blocks: ino=28011, file offset=0, start blkaddr=0x13cd4, len=0x1 stress-ng-swap-5966: f2fs_map_blocks: ino=28011, file offset=1, start blkaddr=0x60c84b, len=0xff The problematic code is in check_swap_activate(): if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || nr_pblocks % blks_per_sec || !f2fs_valid_pinned_area(sbi, pblock)) { bool last_extent = false; not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); if (cur_lblock + nr_pblocks > sis->max) nr_pblocks -= blks_per_sec; /* this extent is last one */ if (!nr_pblocks) { nr_pblocks = last_lblock - cur_lblock; last_extent = true; } ret = f2fs_migrate_blocks(inode, cur_lblock, nr_pblocks); if (ret) { if (ret == -ENOENT) ret = -EINVAL; goto out; } if (!last_extent) goto retry; } When the first extent is unaligned and roundup(nr_pblocks, blks_per_sec) exceeds sis->max, we subtract blks_per_sec resulting in nr_pblocks = 0. 
The code then incorrectly assumes this is the last extent, sets nr_pblocks = last_lblock - cur_lblock (entire swapfile), and performs migration. After migration, it doesn't retry mapping, so subsequent extents are never processed. " In order to fix this issue, we need to lookup block mapping info after we migrate all blocks in the tail of swapfile. Cc: stable@kernel.org Fixes: 9703d69d9d15 ("f2fs: support file pinning for zoned devices") Cc: Daeho Jeong Reported-and-tested-by: Xiaolong Guo Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220951 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3c2af0ef62bb..4e2f10bd2b07 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4206,6 +4206,7 @@ static int check_swap_activate(struct swap_info_struct *sis, while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; + bool last_extent = false; retry: cond_resched(); @@ -4231,11 +4232,10 @@ retry: pblock = map.m_pblk; nr_pblocks = map.m_len; - if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || - nr_pblocks % blks_per_sec || - f2fs_is_sequential_zone_area(sbi, pblock)) { - bool last_extent = false; - + if (!last_extent && + ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || + nr_pblocks % blks_per_sec || + f2fs_is_sequential_zone_area(sbi, pblock))) { not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); @@ -4256,8 +4256,8 @@ retry: goto out; } - if (!last_extent) - goto retry; + /* lookup block mapping info after block migration */ + goto retry; } if (cur_lblock + nr_pblocks >= sis->max) From ec8bb999dc0c5d64a3366ce8765a479305a82029 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Jan 2026 04:25:43 +0000 Subject: [PATCH 45/61] f2fs: use folio_end_read No logic change. 
Suggested-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4e2f10bd2b07..5b4832956196 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2566,19 +2566,15 @@ submit_and_realloc: } trace_f2fs_read_folio(folio, DATA); if (rac) { - if (!folio_in_bio) { - folio_mark_uptodate(folio); - folio_unlock(folio); - } + if (!folio_in_bio) + folio_end_read(folio, true); folio = readahead_folio(rac); goto next_folio; } err_out: /* Nothing was submitted. */ if (!bio) { - if (!ret) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, !ret); return ret; } From fe15bc3d447c5ee61dbea41c9e9a11fa2968d32d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 16 Jan 2026 14:29:27 +0800 Subject: [PATCH 46/61] f2fs: fix error path handling in f2fs_read_data_large_folio() In error path of f2fs_read_data_large_folio(), if bio is valid, it may submit bio twice, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5b4832956196..d685c889f7b6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2577,17 +2577,14 @@ err_out: folio_end_read(folio, !ret); return ret; } - +out: + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); if (ret) { - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); - /* Wait bios and clear uptodate. */ folio_lock(folio); folio_clear_uptodate(folio); folio_unlock(folio); } -out: - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } From a5d8b9d94e1863f3ebb7182c238b2c713f6f4efd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 16 Jan 2026 14:31:15 +0800 Subject: [PATCH 47/61] f2fs: fix to unlock folio in f2fs_read_data_large_folio() We missed to unlock folio in error path of f2fs_read_data_large_folio(), fix it. 
With below testcase, it can reproduce the bug. touch /mnt/f2fs/file truncate -s $((1024*1024*1024)) /mnt/f2fs/file f2fs_io setflags immutable /mnt/f2fs/file sync echo 3 > /proc/sys/vm/drop_caches time dd if=/mnt/f2fs/file of=/dev/null bs=1M count=1024 f2fs_io clearflags immutable /mnt/f2fs/file echo 1 > /proc/sys/vm/drop_caches time dd if=/mnt/f2fs/file of=/dev/null bs=1M count=1024 time dd if=/mnt/f2fs/file of=/dev/null bs=1M count=1024 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d685c889f7b6..d509172b51df 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2453,11 +2453,11 @@ static int f2fs_read_data_large_folio(struct inode *inode, int ret = 0; bool folio_in_bio; - if (!IS_IMMUTABLE(inode)) - return -EOPNOTSUPP; - - if (f2fs_compressed_file(inode)) + if (!IS_IMMUTABLE(inode) || f2fs_compressed_file(inode)) { + if (folio) + folio_unlock(folio); return -EOPNOTSUPP; + } map.m_seg_type = NO_CHECK_TYPE; @@ -2565,18 +2565,16 @@ submit_and_realloc: last_block_in_bio = block_nr; } trace_f2fs_read_folio(folio, DATA); +err_out: + if (!folio_in_bio) { + folio_end_read(folio, !ret); + if (ret) + return ret; + } if (rac) { - if (!folio_in_bio) - folio_end_read(folio, true); folio = readahead_folio(rac); goto next_folio; } -err_out: - /* Nothing was submitted. */ - if (!bio) { - folio_end_read(folio, !ret); - return ret; - } out: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); if (ret) { From 993663874be557a80d4cdc9700b760bb6d27c098 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Jan 2026 21:50:44 +0000 Subject: [PATCH 48/61] Revert "f2fs: add timeout in f2fs_enable_checkpoint()" This reverts commit 4bc347779698b5e67e1514bab105c2c083e55502. Let's apply a better approach to flush only the dirty pages committed by user to avoid the delay caused by unnecessary incoming ones.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- fs/f2fs/super.c | 15 ++++----------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d41210a381cd..035239758e33 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -311,7 +311,6 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ -#define DEF_ENABLE_INTERVAL 16 /* 16 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ @@ -1486,7 +1485,6 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, - ENABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1660d663a8c5..97c2264ec7fe 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2686,7 +2686,7 @@ restore_flag: static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; + int retry = DEFAULT_RETRY_IO_COUNT; long long start, writeback, end; int ret; struct f2fs_lock_context lc; @@ -2696,22 +2696,16 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DATA)); - f2fs_update_time(sbi, ENABLE_TIME); - start = ktime_get(); /* we should flush all the data to keep data consistency */ - while (get_pages(sbi, F2FS_DIRTY_DATA)) { - writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); + do { + sync_inodes_sb(sbi->sb); f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); - if (f2fs_time_over(sbi, ENABLE_TIME)) - break; - } writeback = ktime_get(); - sync_inodes_sb(sbi->sb); - if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", get_pages(sbi, F2FS_DIRTY_DATA)); @@ -4333,7 +4327,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = 
DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; - sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL; sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); From ab59919c8a041bf0fc6c8fe65dd10729e19de88c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 26 Jan 2026 06:32:03 +0000 Subject: [PATCH 49/61] f2fs: check skipped write in f2fs_enable_checkpoint() This patch introduces sbi->nr_pages[F2FS_SKIPPED_WRITE] to record any skipped write during data flush in f2fs_enable_checkpoint(). So in the loop of data flush, if there is any skipped write in previous flush, let's retry sync_inode_sb(), otherwise, all dirty data written before f2fs_enable_checkpoint() should have been persisted, then break the retry loop. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 19 +++++++++++++++++++ fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 37 +++++++++++++++++++++++++++++++++---- 4 files changed, 55 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d509172b51df..25b4d3f21fa7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3495,6 +3495,16 @@ static inline void account_writeback(struct inode *inode, bool inc) f2fs_up_read(&F2FS_I(inode)->i_sem); } +static inline void update_skipped_write(struct f2fs_sb_info *sbi, + struct writeback_control *wbc) +{ + long skipped = wbc->pages_skipped; + + if (is_sbi_flag_set(sbi, SBI_ENABLE_CHECKPOINT) && skipped && + wbc->sync_mode == WB_SYNC_ALL) + atomic_add(skipped, &sbi->nr_pages[F2FS_SKIPPED_WRITE]); +} + static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) @@ -3559,10 +3569,19 @@ static int __f2fs_write_data_pages(struct address_space *mapping, */ f2fs_remove_dirty_inode(inode); + + /* + * f2fs_write_cache_pages() has retry logic for EAGAIN case which is + * common when 
racing w/ checkpoint, so only update skipped write + * when ret is non-zero. + */ + if (ret) + update_skipped_write(sbi, wbc); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); + update_skipped_write(sbi, wbc); trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 032683835569..8e1040e375a7 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -423,6 +423,7 @@ static const char *s_flag[MAX_SBI_FLAG] = { [SBI_IS_RESIZEFS] = "resizefs", [SBI_IS_FREEZING] = "freezefs", [SBI_IS_WRITABLE] = "writable", + [SBI_ENABLE_CHECKPOINT] = "enable_checkpoint", }; static const char *ipu_mode_names[F2FS_IPU_MAX] = { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 035239758e33..52cec6b3ecf0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1238,6 +1238,7 @@ enum count_type { F2FS_RD_META, F2FS_DIO_WRITE, F2FS_DIO_READ, + F2FS_SKIPPED_WRITE, /* skip or fail during f2fs_enable_checkpoint() */ NR_COUNT_TYPE, }; @@ -1476,6 +1477,7 @@ enum { SBI_IS_RESIZEFS, /* resizefs is in process */ SBI_IS_FREEZING, /* freezefs is in process */ SBI_IS_WRITABLE, /* remove ro mountoption transiently */ + SBI_ENABLE_CHECKPOINT, /* indicate it's during f2fs_enable_checkpoint() */ MAX_SBI_FLAG, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 97c2264ec7fe..0afe9f829058 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2690,6 +2690,7 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) long long start, writeback, end; int ret; struct f2fs_lock_context lc; + long long skipped_write, dirty_data; f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", get_pages(sbi, F2FS_DIRTY_META), @@ -2698,17 +2699,45 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) start = ktime_get(); + set_sbi_flag(sbi, SBI_ENABLE_CHECKPOINT); + /* we should flush all the data to keep data consistency */ do { + skipped_write = get_pages(sbi, F2FS_SKIPPED_WRITE); + dirty_data = 
get_pages(sbi, F2FS_DIRTY_DATA); + sync_inodes_sb(sbi->sb); f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); - } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); + + f2fs_info(sbi, "sync_inode_sb done, dirty_data: %lld, %lld, " + "skipped write: %lld, %lld, retry: %d", + get_pages(sbi, F2FS_DIRTY_DATA), + dirty_data, + get_pages(sbi, F2FS_SKIPPED_WRITE), + skipped_write, retry); + + /* + * sync_inodes_sb() has retry logic, so let's check dirty_data + * in prior to skipped_write in case there is no dirty data. + */ + if (!get_pages(sbi, F2FS_DIRTY_DATA)) + break; + if (get_pages(sbi, F2FS_SKIPPED_WRITE) == skipped_write) + break; + } while (retry--); + + clear_sbi_flag(sbi, SBI_ENABLE_CHECKPOINT); writeback = ktime_get(); - if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) - f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", - get_pages(sbi, F2FS_DIRTY_DATA)); + if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA) || + get_pages(sbi, F2FS_SKIPPED_WRITE))) + f2fs_warn(sbi, "checkpoint=enable unwritten data: %lld, skipped data: %lld, retry: %d", + get_pages(sbi, F2FS_DIRTY_DATA), + get_pages(sbi, F2FS_SKIPPED_WRITE), retry); + + if (get_pages(sbi, F2FS_SKIPPED_WRITE)) + atomic_set(&sbi->nr_pages[F2FS_SKIPPED_WRITE], 0); f2fs_down_write_trace(&sbi->gc_lock, &lc); f2fs_dirty_to_prefree(sbi); From 1120764691736cb803cd763c82aa151b1fee2b8e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Jan 2026 17:12:15 +0800 Subject: [PATCH 50/61] f2fs: introduce FAULT_SKIP_WRITE In order to simulate skipped write during enable_checkpoint(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 1 + Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/data.c | 4 ++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 5 files changed, 8 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 7398b369784c..9a8ec2290f68 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -744,6 +744,7 @@ Description: Support configuring fault injection type, should be FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 FAULT_LOCK_TIMEOUT 0x01000000 (1000ms) + FAULT_SKIP_WRITE 0x02000000 =========================== ========== What: /sys/fs/f2fs//discard_io_aware_gran diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index fc005f2eaf86..7e4031631286 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -218,6 +218,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 FAULT_LOCK_TIMEOUT 0x01000000 (1000ms) + FAULT_SKIP_WRITE 0x02000000 =========================== ========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". 
In "lfs" mode, there should be no random diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 25b4d3f21fa7..9ef875e7b34c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2921,6 +2921,10 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) goto got_it; } + if (is_sbi_flag_set(fio->sbi, SBI_ENABLE_CHECKPOINT) && + time_to_inject(fio->sbi, FAULT_SKIP_WRITE)) + return -EINVAL; + /* Deadlock due to between page->lock and f2fs_lock_op */ if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi, &lc)) return -EAGAIN; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 52cec6b3ecf0..3a8e1dcdcd69 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -66,6 +66,7 @@ enum { FAULT_ATOMIC_TIMEOUT, FAULT_VMALLOC, FAULT_LOCK_TIMEOUT, + FAULT_SKIP_WRITE, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0afe9f829058..5d8b2e812340 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -70,6 +70,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_ATOMIC_TIMEOUT] = "atomic timeout", [FAULT_VMALLOC] = "vmalloc", [FAULT_LOCK_TIMEOUT] = "lock timeout", + [FAULT_SKIP_WRITE] = "skip write", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, From 252cf8c4d679fc40cdb934da6c5128e5943fec3f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Jan 2026 17:18:26 +0800 Subject: [PATCH 51/61] f2fs: fix to show simulate_lock_timeout correctly Commit d36de29f4bb5 ("f2fs: sysfs: introduce inject_lock_timeout") introduces a bug as below, fix it. 
cat /sys/fs/f2fs/vdx/inject_lock_timeout cat: /sys/fs/f2fs/vdx/inject_lock_timeout: Invalid argument Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index cd22bfe75c45..d01a2664a250 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -86,7 +86,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi; #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) + struct_type == FAULT_INFO_TYPE || + struct_type == FAULT_INFO_TIMEOUT) return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif #ifdef CONFIG_F2FS_STAT_FS From be38b5717a2953648dd294418b7c2dfdb8e81d7a Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Fri, 16 Jan 2026 00:47:50 +0800 Subject: [PATCH 52/61] f2fs: pin files do not require sbi->writepages lock for ordering For pinned files, the file mapping is already established before writing, and since the writes are in IPU, there is no need to acquire the sbi->writepages lock to guarantee write ordering.
Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ef875e7b34c..84746a06cd58 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3477,6 +3477,8 @@ static inline bool __should_serialize_io(struct inode *inode, if (IS_NOQUOTA(inode)) return false; + if (f2fs_is_pinned_file(inode)) + return false; if (f2fs_need_compress_data(inode)) return true; if (wbc->sync_mode != WB_SYNC_ALL) From 401a3034d3b9f33e0fd085f6964512fe999ba135 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 13 Jan 2026 23:21:37 +0800 Subject: [PATCH 53/61] f2fs: add write latency stats for NAT and SIT blocks in f2fs_write_checkpoint This patch adds separate write latency accounting for NAT and SIT blocks in f2fs_write_checkpoint(). Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/f2fs.h | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8bb38cfcce6e..5172396c0b01 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1839,6 +1839,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; } } + stat_cp_time(cpc, CP_TIME_MERGE_WRITE); /* * update checkpoint pack index @@ -1855,10 +1856,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); goto stop; } + stat_cp_time(cpc, CP_TIME_FLUSH_NAT); f2fs_flush_sit_entries(sbi, cpc); - stat_cp_time(cpc, CP_TIME_FLUSH_META); + stat_cp_time(cpc, CP_TIME_FLUSH_SIT); /* save inmem log status */ f2fs_save_inmem_curseg(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3a8e1dcdcd69..064164ffa489 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -319,7 +319,9 @@ enum cp_time { CP_TIME_START, /* begin */ CP_TIME_LOCK, /* after cp_global_sem */ CP_TIME_OP_LOCK, /* after 
block_operation */ - CP_TIME_FLUSH_META, /* after flush sit/nat */ + CP_TIME_MERGE_WRITE, /* after flush DATA/NODE/META */ + CP_TIME_FLUSH_NAT, /* after flush nat */ + CP_TIME_FLUSH_SIT, /* after flush sit */ CP_TIME_SYNC_META, /* after sync_meta_pages */ CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ From 7c9ee0ed2bd4e30192d83de529c9094e18ab6f41 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 13 Jan 2026 23:21:38 +0800 Subject: [PATCH 54/61] f2fs: change size parameter of __has_cursum_space() to unsigned int All callers of __has_cursum_space() pass an unsigned int value as the size parameter. Change the parameter type to unsigned int accordingly. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 064164ffa489..47f316ac05a3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2875,7 +2875,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool __has_cursum_space(struct f2fs_sb_info *sbi, - struct f2fs_journal *journal, int size, int type) + struct f2fs_journal *journal, unsigned int size, int type) { if (type == NAT_JOURNAL) return size <= MAX_NAT_JENTRIES(sbi, journal); From 1db4b3609aa13efceddeae2e58749acb62d42d71 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 13 Jan 2026 23:23:15 +0800 Subject: [PATCH 55/61] f2fs: optimize NAT block loading during checkpoint write Under stress tests with frequent metadata operations, checkpoint write time can become excessively long. Analysis shows that the slowdown is caused by synchronous, one-by-one reads of NAT blocks during checkpoint processing. The issue can be reproduced with the following workload: 1. seq 1 650000 | xargs -P 16 -n 1 touch 2. sync # avoid checkpoint write during deleting 3. delete 1 file every 455 files 4. echo 3 > /proc/sys/vm/drop_caches 5. 
sync # trigger checkpoint write This patch submits read I/O for all NAT blocks required in the __flush_nat_entry_set() phase in advance, reducing the overhead of synchronous waiting for individual NAT block reads. The NAT block flush latency before and after the change is as below: | |NAT blocks accessed|NAT blocks read|Flush time (ms)| |-------------|-------------------|---------------|---------------| |Before change|1205 |1191 |158 | |After change |1264 |1242 |11 | With a similar number of NAT blocks accessed and read from disk, adding NAT block readahead reduces the total NAT block flush time by more than 90%. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 63252ff1e5c3..74992fd9c9b6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -3179,7 +3179,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_journal *journal = curseg->journal; struct nat_entry_set *setvec[NAT_VEC_SIZE]; struct nat_entry_set *set, *tmp; - unsigned int found; + unsigned int found, entry_count = 0; nid_t set_idx = 0; LIST_HEAD(sets); int err = 0; @@ -3219,6 +3219,18 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) MAX_NAT_JENTRIES(sbi, journal)); } + /* + * Readahead the current NAT block to prevent read requests from + * being issued and waited on one by one. 
+ */ + list_for_each_entry(set, &sets, set_list) { + entry_count += set->entry_cnt; + if (!enabled_nat_bits(sbi, cpc) && + __has_cursum_space(sbi, journal, + entry_count, NAT_JOURNAL)) + continue; + f2fs_ra_meta_pages(sbi, set->set, 1, META_NAT, true); + } /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) { err = __flush_nat_entry_set(sbi, set, cpc); From 6bb9010f78d7f0ff0e4a17b1be951e76d96757a5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Jan 2026 05:14:04 +0000 Subject: [PATCH 56/61] f2fs: decrease maximum flush retry count in f2fs_enable_checkpoint() It is a rare case that sync_inodes_sb() keeps skipping the flush of some dirty data, so it is enough to give three extra chances to flush the data. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 47f316ac05a3..29f81a496b72 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -699,6 +699,8 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ +#define MAX_FLUSH_RETRY_COUNT 3 /* maximum flush retry count in f2fs_enable_checkpoint() */ + /* IO/non-IO congestion wait timeout value, default: 1 jiffies */ #define DEFAULT_SCHEDULE_TIMEOUT 1 diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5d8b2e812340..9d421a07d2d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2687,7 +2687,7 @@ restore_flag: static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - int retry = DEFAULT_RETRY_IO_COUNT; + int retry = MAX_FLUSH_RETRY_COUNT; long long start, writeback, end; int ret; struct f2fs_lock_context lc; From 91b76f1059b60f453b51877f29f0e35693737383 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 26 Jan 2026 14:28:01 -0800 Subject: [PATCH 57/61] f2fs: fix incomplete block usage in compact SSA summaries In a previous commit, a bug was introduced where compact SSA summaries failed to utilize the
entire block space in non-4KB block size configurations, leading to inefficient space management. This patch fixes the calculation logic to ensure that compact SSA summaries can fully occupy the block regardless of the block size. Reported-by: Chris Mason Fixes: e48e16f3e37f ("f2fs: support non-4KB block size without packed_ssa feature") Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 00870a8fe387..6a97fe76712b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2674,12 +2674,12 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } - sum_in_page = (sbi->sum_blocksize - 2 * sbi->sum_journal_size - + sum_in_page = (sbi->blocksize - 2 * sbi->sum_journal_size - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (sbi->sum_blocksize - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (sbi->blocksize - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -4324,7 +4324,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) s = (struct f2fs_summary *)(kaddr + offset); sum_entries(seg_i->sum_blk)[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= sbi->sum_blocksize - + if (offset + SUMMARY_SIZE <= sbi->blocksize - SUM_FOOTER_SIZE) continue; @@ -4497,7 +4497,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) *summary = sum_entries(seg_i->sum_blk)[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_SIZE - + if (written_size + SUMMARY_SIZE <= sbi->blocksize - SUM_FOOTER_SIZE) continue; From d860974a7e38d35e9e2c4dc8a9f4223b38b6ad99 Mon Sep 17 00:00:00 2001 From: Yeongjin Gil Date: Thu, 22 Jan 2026 19:45:27 +0900 Subject: [PATCH 58/61] f2fs: optimize f2fs_overwrite_io() for 
f2fs_iomap_begin When overwriting already allocated blocks, f2fs_iomap_begin() calls f2fs_overwrite_io() to check block mappings. However, f2fs_overwrite_io() iterates through all mapped blocks in the range, which can be inefficient for fragmented files with large I/O requests. This patch optimizes f2fs_overwrite_io() by adding a 'check_first' parameter and introducing __f2fs_overwrite_io() helper. When called from f2fs_iomap_begin(), we only check the first mapping to determine if the range is already allocated, which is sufficient for setting map.m_may_create. This optimization significantly reduces the number of f2fs_map_blocks() calls in f2fs_overwrite_io() when called from f2fs_iomap_begin(), especially for fragmented files with large I/O requests. Cc: stable@kernel.org Fixes: 351bc761338d ("f2fs: optimize f2fs DIO overwrites") Reviewed-by: Sungjong Seo Reviewed-by: Sunmin Jeong Signed-off-by: Yeongjin Gil Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 84746a06cd58..f70efb040c73 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1865,7 +1865,8 @@ out: return err; } -bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +static bool __f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len, + bool check_first) { struct f2fs_map_blocks map; block_t last_lblk; @@ -1887,10 +1888,17 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) if (err || map.m_len == 0) return false; map.m_lblk += map.m_len; + if (check_first) + break; } return true; } +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + return __f2fs_overwrite_io(inode, pos, len, false); +} + static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -4487,7 +4495,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * f2fs_map_lock and 
f2fs_balance_fs are not necessary. */ if ((flags & IOMAP_WRITE) && - !f2fs_overwrite_io(inode, offset, length)) + !__f2fs_overwrite_io(inode, offset, length, true)) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); From 07de55cbf5762cb4a7e9e0db7aba5c10c8cfe079 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 30 Jan 2026 21:28:08 +0800 Subject: [PATCH 59/61] f2fs: fix lock priority inversion issue If userspace thread has held f2fs rw semaphore, due to its low priority, it could be runnable or preempted state for long time, during the time, it will block high priority thread which is trying to grab the same rw semaphore, e.g. cp_rwsem, io_rwsem... To fix such issue, let's detect thread's priority when it tries to grab f2fs_rwsem lock, if the priority is lower than a priority threshold, let's uplift the priority before it enters into critical region of lock, and restore the priority after it leaves from critical region. Meanwhile, introducing two new sysfs nodes: - /sys/fs/f2fs//adjust_lock_priority, it is used to control whether the functionality is enable or not. ========== ================== Flag_Value Flag_Description ========== ================== 0x00000000 Disabled (default) 0x00000001 cp_rwsem 0x00000002 node_change 0x00000004 node_write 0x00000008 gc_lock 0x00000010 cp_global 0x00000020 io_rwsem ========== ================== - /sys/fs/f2fs//lock_duration_priority, it is used to control priority threshold. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 +++++++++ fs/f2fs/checkpoint.c | 66 ++++++++++++++++++++++++- fs/f2fs/f2fs.h | 12 +++++ fs/f2fs/super.c | 2 + fs/f2fs/sysfs.c | 18 +++++++ 5 files changed, 120 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9a8ec2290f68..ea6474db8a31 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -963,3 +963,27 @@ Description: This sysfs entry can be used to change type of injected timeout: 0x00000003 Simulate Non-IO type sleep time 0x00000004 Simulate runnable time ========== =============================== + +What: /sys/fs/f2fs//adjust_lock_priority +Date: January 2026 +Contact: "Chao Yu" +Description: This sysfs entry can be used to enable/disable to adjust priority for task + which is in critical region covered by lock. + ========== ================== + Flag_Value Flag_Description + ========== ================== + 0x00000000 Disabled (default) + 0x00000001 cp_rwsem + 0x00000002 node_change + 0x00000004 node_write + 0x00000008 gc_lock + 0x00000010 cp_global + 0x00000020 io_rwsem + ========== ================== + +What: /sys/fs/f2fs//lock_duration_priority +Date: January 2026 +Contact: "Chao Yu" +Description: f2fs can tune priority of thread which has entered into critical region covered by + f2fs rwsemphore lock. This sysfs entry can be used to control priority value, the + range is [100,139], by default the value is 120. 
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5172396c0b01..2f5a03e29d0b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -90,16 +90,72 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem, runnable_time, io_sleep_time, other_time); } +static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write) +{ + if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1))) + return false; + + switch (sem->name) { + /* + * writer is checkpoint which has high priority, let's just uplift + * priority for reader + */ + case LOCK_NAME_CP_RWSEM: + case LOCK_NAME_NODE_CHANGE: + case LOCK_NAME_NODE_WRITE: + return !is_write; + case LOCK_NAME_GC_LOCK: + case LOCK_NAME_CP_GLOBAL: + case LOCK_NAME_IO_RWSEM: + return true; + default: + f2fs_bug_on(sem->sbi, 1); + } + return false; +} + +static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc, + bool is_write) +{ + lc->need_restore = false; + if (!sem->sbi->adjust_lock_priority) + return; + if (rt_task(current)) + return; + if (!need_uplift_priority(sem, is_write)) + return; + lc->orig_nice = task_nice(current); + lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority); + if (lc->orig_nice <= lc->new_nice) + return; + set_user_nice(current, lc->new_nice); + lc->need_restore = true; +} + +static void restore_priority(struct f2fs_lock_context *lc) +{ + if (!lc->need_restore) + return; + /* someone has updated the priority */ + if (task_nice(current) != lc->new_nice) + return; + set_user_nice(current, lc->orig_nice); +} + void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { + uplift_priority(sem, lc, false); f2fs_down_read(sem); trace_lock_elapsed_time_start(sem, lc); } int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { - if (!f2fs_down_read_trylock(sem)) + uplift_priority(sem, lc, false); + if (!f2fs_down_read_trylock(sem)) { + restore_priority(lc); return 0; + } 
trace_lock_elapsed_time_start(sem, lc); return 1; } @@ -107,19 +163,24 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { f2fs_up_read(sem); + restore_priority(lc); trace_lock_elapsed_time_end(sem, lc, false); } void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { + uplift_priority(sem, lc, true); f2fs_down_write(sem); trace_lock_elapsed_time_start(sem, lc); } int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { - if (!f2fs_down_write_trylock(sem)) + uplift_priority(sem, lc, true); + if (!f2fs_down_write_trylock(sem)) { + restore_priority(lc); return 0; + } trace_lock_elapsed_time_start(sem, lc); return 1; } @@ -127,6 +188,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { f2fs_up_write(sem); + restore_priority(lc); trace_lock_elapsed_time_end(sem, lc, true); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 29f81a496b72..a6e7368fc40a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -185,6 +185,7 @@ enum f2fs_lock_name { LOCK_NAME_GC_LOCK, LOCK_NAME_CP_GLOBAL, LOCK_NAME_IO_RWSEM, + LOCK_NAME_MAX, }; enum f2fs_timeout_type { @@ -1447,7 +1448,10 @@ struct f2fs_time_stat { struct f2fs_lock_context { struct f2fs_time_stat ts; + int orig_nice; + int new_nice; bool lock_trace; + bool need_restore; }; struct f2fs_gc_control { @@ -1588,6 +1592,8 @@ enum node_type { /* a threshold of maximum elapsed time in critical region to print tracepoint */ #define MAX_LOCK_ELAPSED_TIME 500 +#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO) + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1998,6 +2004,12 @@ struct f2fs_sb_info { /* max elapsed 
time threshold in critical region that lock covered */ unsigned long long max_lock_elapsed_time; + /* enable/disable to adjust task priority in critical region covered by lock */ + unsigned int adjust_lock_priority; + + /* adjust priority for task which is in critical region covered by lock */ + unsigned int lock_duration_priority; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9d421a07d2d5..d5cf7265e5d3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4338,6 +4338,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME; + sbi->adjust_lock_priority = 0; + sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY; sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ? 4096 : sbi->blocksize; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d01a2664a250..3a272e7edf23 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -955,6 +955,20 @@ out: return count; } + if (!strcmp(a->attr.name, "adjust_lock_priority")) { + if (t >= BIT(LOCK_NAME_MAX - 1)) + return -EINVAL; + sbi->adjust_lock_priority = t; + return count; + } + + if (!strcmp(a->attr.name, "lock_duration_priority")) { + if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE)) + return -EINVAL; + sbi->lock_duration_priority = t; + return count; + } + __sbi_store_value(a, sbi, ptr + a->offset, t); return count; @@ -1272,6 +1286,8 @@ F2FS_SBI_GENERAL_RW_ATTR(carve_out); F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time); +F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority); +F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1478,6 +1494,8 @@ static struct attribute 
*f2fs_attrs[] = { ATTR_LIST(allocate_section_hint), ATTR_LIST(allocate_section_policy), ATTR_LIST(max_lock_elapsed_time), + ATTR_LIST(lock_duration_priority), + ATTR_LIST(adjust_lock_priority), NULL, }; ATTRIBUTE_GROUPS(f2fs); From bc367775f60214312befa33f101b31fe74bba48a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 30 Jan 2026 21:28:09 +0800 Subject: [PATCH 60/61] f2fs: introduce trace_f2fs_priority_update This patch introduces two new tracepoints for debug purpose. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 17 +++++++---- include/trace/events/f2fs.h | 57 +++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2f5a03e29d0b..4afa5d9a19fc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -130,9 +130,13 @@ static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc return; set_user_nice(current, lc->new_nice); lc->need_restore = true; + + trace_f2fs_priority_uplift(sem->sbi, sem->name, is_write, current, + NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice)); } -static void restore_priority(struct f2fs_lock_context *lc) +static void restore_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc, + bool is_write) { if (!lc->need_restore) return; @@ -140,6 +144,9 @@ static void restore_priority(struct f2fs_lock_context *lc) if (task_nice(current) != lc->new_nice) return; set_user_nice(current, lc->orig_nice); + + trace_f2fs_priority_restore(sem->sbi, sem->name, is_write, current, + NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice)); } void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) @@ -153,7 +160,7 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex { uplift_priority(sem, lc, false); if (!f2fs_down_read_trylock(sem)) { - restore_priority(lc); + restore_priority(sem, lc, false); return 0; } trace_lock_elapsed_time_start(sem, 
lc); @@ -163,7 +170,7 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { f2fs_up_read(sem); - restore_priority(lc); + restore_priority(sem, lc, false); trace_lock_elapsed_time_end(sem, lc, false); } @@ -178,7 +185,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte { uplift_priority(sem, lc, true); if (!f2fs_down_write_trylock(sem)) { - restore_priority(lc); + restore_priority(sem, lc, true); return 0; } trace_lock_elapsed_time_start(sem, lc); @@ -188,7 +195,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) { f2fs_up_write(sem); - restore_priority(lc); + restore_priority(sem, lc, true); trace_lock_elapsed_time_end(sem, lc, true); } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c3b6b509472f..9364e6775562 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -2525,6 +2525,63 @@ TRACE_EVENT(f2fs_lock_elapsed_time, __entry->other_time) ); +DECLARE_EVENT_CLASS(f2fs_priority_update, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int orig_prio, + int new_prio), + + TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio), + + TP_STRUCT__entry( + __field(dev_t, dev) + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned int, lock_name) + __field(bool, is_write) + __field(int, orig_prio) + __field(int, new_prio) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->lock_name = lock_name; + __entry->is_write = is_write; + __entry->orig_prio = orig_prio; + __entry->new_prio = new_prio; + ), + + TP_printk("dev = (%d,%d), comm: %s, pid: %d, lock_name: %s, " + "lock_type: %s, orig_prio: %d, 
new_prio: %d", + show_dev(__entry->dev), + __entry->comm, + __entry->pid, + show_lock_name(__entry->lock_name), + __entry->is_write ? "wlock" : "rlock", + __entry->orig_prio, + __entry->new_prio) +); + +DEFINE_EVENT(f2fs_priority_update, f2fs_priority_uplift, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int orig_prio, + int new_prio), + + TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio) +); + +DEFINE_EVENT(f2fs_priority_update, f2fs_priority_restore, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int orig_prio, + int new_prio), + + TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From 52190933c37a96164b271f3f30c16099d9eb8c09 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Feb 2026 11:05:01 +0800 Subject: [PATCH 61/61] f2fs: sysfs: introduce critical_task_priority This patch introduces /sys/fs/f2fs/<disk>/critical_task_priority. With this new sysfs interface, we can tune the priority of the f2fs_ckpt and f2fs_gc threads. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/gc.c | 2 ++ fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 17 +++++++++++++++++ 6 files changed, 34 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ea6474db8a31..c1d2b3fd9c65 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -987,3 +987,11 @@ Contact: "Chao Yu" Description: f2fs can tune priority of thread which has entered into critical region covered by f2fs rwsemphore lock. This sysfs entry can be used to control priority value, the range is [100,139], by default the value is 120.
+ +What: /sys/fs/f2fs/<disk>/critical_task_priority +Date: February 2026 +Contact: "Chao Yu" +Description: It can be used to tune priority of f2fs critical task, e.g. f2fs_ckpt, f2fs_gc + threads, limitation as below: + - it requires user has CAP_SYS_NICE capability. + - the range is [100, 139], by default the value is 120. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4afa5d9a19fc..6dd39b7de11a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -2158,6 +2158,8 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + set_user_nice(cprc->f2fs_issue_ckpt, + PRIO_TO_NICE(sbi->critical_task_priority)); return 0; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a6e7368fc40a..aa0bca9f851d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1593,6 +1593,7 @@ enum node_type { #define MAX_LOCK_ELAPSED_TIME 500 #define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO) +#define F2FS_CRITICAL_TASK_PRIORITY NICE_TO_PRIO(0) static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); @@ -2010,6 +2011,9 @@ struct f2fs_sb_info { /* adjust priority for task which is in critical region covered by lock */ unsigned int lock_duration_priority; + /* priority for critical task, e.g.
f2fs_ckpt, f2fs_gc threads */ + long critical_task_priority; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 60378614bc54..f46b2673d31f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -234,6 +234,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) return err; } + set_user_nice(gc_th->f2fs_gc_task, + PRIO_TO_NICE(sbi->critical_task_priority)); return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d5cf7265e5d3..1a755997aff5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4340,6 +4340,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME; sbi->adjust_lock_priority = 0; sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY; + sbi->critical_task_priority = F2FS_CRITICAL_TASK_PRIORITY; sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ? 4096 : sbi->blocksize; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3a272e7edf23..5fbfdc96e502 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -969,6 +969,21 @@ out: return count; } + if (!strcmp(a->attr.name, "critical_task_priority")) { + if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE)) + return -EINVAL; + if (!capable(CAP_SYS_NICE)) + return -EPERM; + sbi->critical_task_priority = t; + if (sbi->cprc_info.f2fs_issue_ckpt) + set_user_nice(sbi->cprc_info.f2fs_issue_ckpt, + PRIO_TO_NICE(sbi->critical_task_priority)); + if (sbi->gc_thread && sbi->gc_thread->f2fs_gc_task) + set_user_nice(sbi->gc_thread->f2fs_gc_task, + PRIO_TO_NICE(sbi->critical_task_priority)); + return count; + } + __sbi_store_value(a, sbi, ptr + a->offset, t); return count; @@ -1288,6 +1303,7 @@ F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time); F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority); F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority); 
+F2FS_SBI_GENERAL_RW_ATTR(critical_task_priority); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1496,6 +1512,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_lock_elapsed_time), ATTR_LIST(lock_duration_priority), ATTR_LIST(adjust_lock_priority), + ATTR_LIST(critical_task_priority), NULL, }; ATTRIBUTE_GROUPS(f2fs);