diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 770470e0598b..c1d2b3fd9c65 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -520,7 +520,7 @@ What: /sys/fs/f2fs//ckpt_thread_ioprio Date: January 2021 Contact: "Daeho Jeong" Description: Give a way to change checkpoint merge daemon's io priority. - Its default value is "be,3", which means "BE" I/O class and + Its default value is "rt,3", which means "RT" I/O class and I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. @@ -732,7 +732,7 @@ Description: Support configuring fault injection type, should be FAULT_TRUNCATE 0x00000400 FAULT_READ_IO 0x00000800 FAULT_CHECKPOINT 0x00001000 - FAULT_DISCARD 0x00002000 + FAULT_DISCARD 0x00002000 (obsolete) FAULT_WRITE_IO 0x00004000 FAULT_SLAB_ALLOC 0x00008000 FAULT_DQUOT_INIT 0x00010000 @@ -741,8 +741,10 @@ Description: Support configuring fault injection type, should be FAULT_BLKADDR_CONSISTENCE 0x00080000 FAULT_NO_SEGMENT 0x00100000 FAULT_INCONSISTENT_FOOTER 0x00200000 - FAULT_TIMEOUT 0x00400000 (1000ms) + FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 + FAULT_LOCK_TIMEOUT 0x01000000 (1000ms) + FAULT_SKIP_WRITE 0x02000000 =========================== ========== What: /sys/fs/f2fs//discard_io_aware_gran @@ -939,3 +941,57 @@ Description: Controls write priority in multi-devices setups. 
A value of 0 means allocate_section_policy = 1 Prioritize writing to section before allocate_section_hint allocate_section_policy = 2 Prioritize writing to section after allocate_section_hint =========================== ========================================================== + +What: /sys/fs/f2fs//max_lock_elapsed_time +Date: December 2025 +Contact: "Chao Yu" +Description: This is a threshold: once a thread has entered the critical region that a lock covers and its total + elapsed time exceeds this threshold, f2fs will emit a tracepoint to dump information + about the related context. This sysfs entry can be used to control the value of the threshold; + by default, the value is 500 ms. + +What: /sys/fs/f2fs//inject_timeout_type +Date: December 2025 +Contact: "Chao Yu" +Description: This sysfs entry can be used to change the type of injected timeout: + ========== =============================== + Flag_Value Flag_Description + ========== =============================== + 0x00000000 No timeout (default) + 0x00000001 Simulate running time + 0x00000002 Simulate IO type sleep time + 0x00000003 Simulate Non-IO type sleep time + 0x00000004 Simulate runnable time + ========== =============================== + +What: /sys/fs/f2fs//adjust_lock_priority +Date: January 2026 +Contact: "Chao Yu" +Description: This sysfs entry can be used to enable/disable priority adjustment for a task + which is in a critical region covered by a lock. + ========== ================== + Flag_Value Flag_Description + ========== ================== + 0x00000000 Disabled (default) + 0x00000001 cp_rwsem + 0x00000002 node_change + 0x00000004 node_write + 0x00000008 gc_lock + 0x00000010 cp_global + 0x00000020 io_rwsem + ========== ================== + +What: /sys/fs/f2fs//lock_duration_priority +Date: January 2026 +Contact: "Chao Yu" +Description: f2fs can tune the priority of a thread which has entered a critical region covered by + an f2fs rw_semaphore lock.
This sysfs entry can be used to control priority value, the + range is [100,139], by default the value is 120. + +What: /sys/fs/f2fs//critical_task_priority +Date: February 2026 +Contact: "Chao Yu" +Description: It can be used to tune priority of f2fs critical task, e.g. f2fs_ckpt, f2fs_gc + threads, limitation as below: + - it requires user has CAP_SYS_NICE capability. + - the range is [100, 139], by default the value is 100. diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index cb90d1ae82d0..7e4031631286 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -206,7 +206,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_TRUNCATE 0x00000400 FAULT_READ_IO 0x00000800 FAULT_CHECKPOINT 0x00001000 - FAULT_DISCARD 0x00002000 + FAULT_DISCARD 0x00002000 (obsolete) FAULT_WRITE_IO 0x00004000 FAULT_SLAB_ALLOC 0x00008000 FAULT_DQUOT_INIT 0x00010000 @@ -215,8 +215,10 @@ fault_type=%d Support configuring fault injection type, should be FAULT_BLKADDR_CONSISTENCE 0x00080000 FAULT_NO_SEGMENT 0x00100000 FAULT_INCONSISTENT_FOOTER 0x00200000 - FAULT_TIMEOUT 0x00400000 (1000ms) + FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms) FAULT_VMALLOC 0x00800000 + FAULT_LOCK_TIMEOUT 0x01000000 (1000ms) + FAULT_SKIP_WRITE 0x02000000 =========================== ========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random @@ -1033,3 +1035,46 @@ the reserved space back to F2FS for its own use. So, the key idea is, user can do any file operations on /dev/vdc, and reclaim the space after the use, while the space is counted as /data. That doesn't require modifying partition size and filesystem format. + +Per-file Read-Only Large Folio Support +-------------------------------------- + +F2FS implements large folio support on the read path to leverage high-order +page allocation for significant performance gains. 
To minimize code complexity, +this support is currently excluded from the write path, which requires handling +complex optimizations such as compression and block allocation modes. + +This optional feature is triggered only when a file's immutable bit is set. +Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached +file with write permissions, even immediately after clearing the bit. Write +access is only restored once the cached inode is dropped. The usage flow is +demonstrated below: + +.. code-block:: + + # f2fs_io setflags immutable /data/testfile_read_seq + + /* flush and reload the inode to enable the large folio */ + # sync && echo 3 > /proc/sys/vm/drop_caches + + /* mmap(MAP_POPULATE) + mlock() */ + # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq + + /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */ + # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq + + /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */ + # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq + + # f2fs_io clearflags immutable /data/testfile_read_seq + + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq + Failed to open /mnt/test/test: Operation not supported + + /* flush and reload the inode to disable the large folio */ + # sync && echo 3 > /proc/sys/vm/drop_caches + + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq + Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us + + # rm /data/testfile_read_seq diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 300664269eb6..6dd39b7de11a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include "f2fs.h" #include "node.h" @@ -21,6 +24,209 @@ #include "iostat.h" #include +static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts) +{ + ts->total_time = ktime_get(); +#ifdef CONFIG_64BIT + ts->running_time = 
current->se.sum_exec_runtime; +#endif +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS) + ts->runnable_time = current->sched_info.run_delay; +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + if (current->delays) + ts->io_sleep_time = current->delays->blkio_delay; +#endif +} + +static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc) +{ + lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled(); + if (!lc->lock_trace) + return; + + get_lock_elapsed_time(&lc->ts); +} + +static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc, bool is_write) +{ + struct f2fs_time_stat tts; + unsigned long long total_time; + unsigned long long running_time = 0; + unsigned long long runnable_time = 0; + unsigned long long io_sleep_time = 0; + unsigned long long other_time = 0; + unsigned npm = NSEC_PER_MSEC; + + if (!lc->lock_trace) + return; + + if (time_to_inject(sem->sbi, FAULT_LOCK_TIMEOUT)) + f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true); + + get_lock_elapsed_time(&tts); + + total_time = div_u64(tts.total_time - lc->ts.total_time, npm); + if (total_time <= sem->sbi->max_lock_elapsed_time) + return; + +#ifdef CONFIG_64BIT + running_time = div_u64(tts.running_time - lc->ts.running_time, npm); +#endif +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS) + runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm); +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm); +#endif + if (total_time > running_time + io_sleep_time + runnable_time) + other_time = total_time - running_time - + io_sleep_time - runnable_time; + + trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current, + get_current_ioprio(), total_time, running_time, + runnable_time, io_sleep_time, other_time); +} + +static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write) +{ + if (!(sem->sbi->adjust_lock_priority & 
BIT(sem->name - 1))) + return false; + + switch (sem->name) { + /* + * writer is checkpoint which has high priority, let's just uplift + * priority for reader + */ + case LOCK_NAME_CP_RWSEM: + case LOCK_NAME_NODE_CHANGE: + case LOCK_NAME_NODE_WRITE: + return !is_write; + case LOCK_NAME_GC_LOCK: + case LOCK_NAME_CP_GLOBAL: + case LOCK_NAME_IO_RWSEM: + return true; + default: + f2fs_bug_on(sem->sbi, 1); + } + return false; +} + +static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc, + bool is_write) +{ + lc->need_restore = false; + if (!sem->sbi->adjust_lock_priority) + return; + if (rt_task(current)) + return; + if (!need_uplift_priority(sem, is_write)) + return; + lc->orig_nice = task_nice(current); + lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority); + if (lc->orig_nice <= lc->new_nice) + return; + set_user_nice(current, lc->new_nice); + lc->need_restore = true; + + trace_f2fs_priority_uplift(sem->sbi, sem->name, is_write, current, + NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice)); +} + +static void restore_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc, + bool is_write) +{ + if (!lc->need_restore) + return; + /* someone has updated the priority */ + if (task_nice(current) != lc->new_nice) + return; + set_user_nice(current, lc->orig_nice); + + trace_f2fs_priority_restore(sem->sbi, sem->name, is_write, current, + NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice)); +} + +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + uplift_priority(sem, lc, false); + f2fs_down_read(sem); + trace_lock_elapsed_time_start(sem, lc); +} + +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + uplift_priority(sem, lc, false); + if (!f2fs_down_read_trylock(sem)) { + restore_priority(sem, lc, false); + return 0; + } + trace_lock_elapsed_time_start(sem, lc); + return 1; +} + +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
*lc) +{ + f2fs_up_read(sem); + restore_priority(sem, lc, false); + trace_lock_elapsed_time_end(sem, lc, false); +} + +void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + uplift_priority(sem, lc, true); + f2fs_down_write(sem); + trace_lock_elapsed_time_start(sem, lc); +} + +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + uplift_priority(sem, lc, true); + if (!f2fs_down_write_trylock(sem)) { + restore_priority(sem, lc, true); + return 0; + } + trace_lock_elapsed_time_start(sem, lc); + return 1; +} + +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc) +{ + f2fs_up_write(sem); + restore_priority(sem, lc, true); + trace_lock_elapsed_time_end(sem, lc, true); +} + +void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc) +{ + f2fs_down_read_trace(&sbi->cp_rwsem, lc); +} + +int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc) +{ + if (time_to_inject(sbi, FAULT_LOCK_OP)) + return 0; + + return f2fs_down_read_trylock_trace(&sbi->cp_rwsem, lc); +} + +void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc) +{ + f2fs_up_read_trace(&sbi->cp_rwsem, lc); +} + +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) +{ + f2fs_down_write(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) +{ + f2fs_up_write(&sbi->cp_rwsem); +} + #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) static struct kmem_cache *ino_entry_slab; @@ -379,6 +585,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct f2fs_lock_context lc; long diff, written; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -391,13 +598,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if 
(!f2fs_down_write_trylock(&sbi->cp_global_sem)) + if (!f2fs_down_write_trylock_trace(&sbi->cp_global_sem, &lc)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - f2fs_up_write(&sbi->cp_global_sem); + written = f2fs_sync_meta_pages(sbi, wbc->nr_to_write, FS_META_IO); + f2fs_up_write_trace(&sbi->cp_global_sem, &lc); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -407,8 +614,8 @@ skip_write: return 0; } -long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write, enum iostat_type io_type) +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write, + enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, prev = ULONG_MAX; @@ -469,7 +676,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_write(sbi, type); + f2fs_submit_merged_write(sbi, META); blk_finish_plug(&plug); @@ -1312,8 +1519,7 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) break; if (type == F2FS_DIRTY_META) - f2fs_sync_meta_pages(sbi, META, LONG_MAX, - FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO); else if (type == F2FS_WB_CP_DATA) f2fs_submit_merged_write(sbi, DATA); @@ -1485,7 +1691,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) int err; /* Flush all the NAT/SIT pages */ - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO); stat_cp_time(cpc, CP_TIME_SYNC_META); @@ -1584,7 +1790,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* Here, we have one bio having CP pack except cp pack 2 page */ - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO); stat_cp_time(cpc, CP_TIME_SYNC_CP_META); /* Wait for all dirty meta pages to 
be submitted for IO */ @@ -1646,6 +1852,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_lock_context lc; unsigned long long ckpt_ver; int err = 0; @@ -1660,7 +1867,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) - f2fs_down_write(&sbi->cp_global_sem); + f2fs_down_write_trace(&sbi->cp_global_sem, &lc); stat_cp_time(cpc, CP_TIME_LOCK); @@ -1701,6 +1908,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; } } + stat_cp_time(cpc, CP_TIME_MERGE_WRITE); /* * update checkpoint pack index @@ -1717,10 +1925,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); goto stop; } + stat_cp_time(cpc, CP_TIME_FLUSH_NAT); f2fs_flush_sit_entries(sbi, cpc); - stat_cp_time(cpc, CP_TIME_FLUSH_META); + stat_cp_time(cpc, CP_TIME_FLUSH_SIT); /* save inmem log status */ f2fs_save_inmem_curseg(sbi); @@ -1750,7 +1959,7 @@ stop: trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT); out: if (cpc->reason != CP_RESIZE) - f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write_trace(&sbi->cp_global_sem, &lc); return err; } @@ -1796,11 +2005,12 @@ void f2fs_destroy_checkpoint_caches(void) static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) { struct cp_control cpc = { .reason = CP_SYNC, }; + struct f2fs_lock_context lc; int err; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); err = f2fs_write_checkpoint(sbi, &cpc); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); return err; } @@ -1888,11 +2098,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) cpc.reason = __get_cp_reason(sbi); if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason 
!= CP_SYNC || sbi->umount_lock_holder == current) { + struct f2fs_lock_context lc; int ret; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); ret = f2fs_write_checkpoint(sbi, &cpc); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); return ret; } @@ -1947,6 +2158,8 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + set_user_nice(cprc->f2fs_issue_ckpt, + PRIO_TO_NICE(sbi->critical_task_priority)); return 0; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index ef1225af2acf..006a80acd1de 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1291,6 +1291,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, struct dnode_of_data dn; struct node_info ni; struct compress_io_ctx *cic; + struct f2fs_lock_context lc; pgoff_t start_idx = start_idx_of_cluster(cc); unsigned int last_index = cc->cluster_size - 1; loff_t psize; @@ -1309,8 +1310,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. 
*/ - f2fs_down_read(&sbi->node_write); - } else if (!f2fs_trylock_op(sbi)) { + f2fs_down_read_trace(&sbi->node_write, &lc); + } else if (!f2fs_trylock_op(sbi, &lc)) { goto out_free; } @@ -1434,9 +1435,9 @@ unlock_continue: f2fs_put_dnode(&dn); if (quota_inode) - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); else - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); spin_lock(&fi->i_size_lock); if (fi->last_disk_size < psize) @@ -1463,9 +1464,9 @@ out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: if (quota_inode) - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); else - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); out_free: for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); @@ -1512,6 +1513,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, { struct address_space *mapping = cc->inode->i_mapping; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct f2fs_lock_context lc; int submitted, compr_blocks, i; int ret = 0; @@ -1530,7 +1532,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, /* overwrite compressed cluster w/ normal cluster */ if (compr_blocks > 0) - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); for (i = 0; i < cc->cluster_size; i++) { struct folio *folio; @@ -1586,7 +1588,7 @@ continue_unlock: out: if (compr_blocks > 0) - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_balance_fs(sbi, true); return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 491f66511201..338df7a2aea6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -31,9 +31,15 @@ static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *ffs_entry_slab; static mempool_t *bio_post_read_ctx_pool; static struct bio_set f2fs_bioset; +struct f2fs_folio_state { + spinlock_t state_lock; + unsigned int read_pages_pending; +}; + #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE int __init f2fs_init_bioset(void) @@ -139,11 
+145,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct folio_iter fi; struct bio_post_read_ctx *ctx = bio->bi_private; + unsigned long flags; bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; + unsigned nr_pages = fi.length >> PAGE_SHIFT; + bool finished = true; - if (f2fs_is_compressed_page(folio)) { + if (!folio_test_large(folio) && + f2fs_is_compressed_page(folio)) { if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(folio, true, 0, in_task); @@ -151,8 +161,25 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) continue; } - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); - folio_end_read(folio, bio->bi_status == BLK_STS_OK); + if (folio_test_large(folio)) { + struct f2fs_folio_state *ffs = folio->private; + + spin_lock_irqsave(&ffs->state_lock, flags); + ffs->read_pages_pending -= nr_pages; + finished = !ffs->read_pages_pending; + spin_unlock_irqrestore(&ffs->state_lock, flags); + } + + while (nr_pages--) + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); + + if (F2FS_F_SB(folio)->node_inode && is_node_folio(folio) && + f2fs_sanity_check_node_footer(F2FS_F_SB(folio), + folio, folio->index, NODE_TYPE_REGULAR, true)) + bio->bi_status = BLK_STS_IOERR; + + if (finished) + folio_end_read(folio, bio->bi_status == BLK_STS_OK); } if (ctx) @@ -189,7 +216,7 @@ static void f2fs_verify_bio(struct work_struct *work) struct folio *folio = fi.folio; if (!f2fs_is_compressed_page(folio) && - !fsverity_verify_page(vi, &folio->page)) { + !fsverity_verify_folio(vi, folio)) { bio->bi_status = BLK_STS_IOERR; break; } @@ -354,18 +381,27 @@ static void f2fs_write_end_io(struct bio *bio) STOP_CP_REASON_WRITE_FAIL); } - f2fs_bug_on(sbi, is_node_folio(folio) && - folio->index != nid_of_node(folio)); + if (is_node_folio(folio)) { + f2fs_sanity_check_node_footer(sbi, folio, + folio->index, NODE_TYPE_REGULAR, true); + f2fs_bug_on(sbi, folio->index != nid_of_node(folio)); + } dec_page_count(sbi, 
type); + + /* + * we should access sbi before folio_end_writeback() to + * avoid racing w/ kill_f2fs_super() + */ + if (type == F2FS_WB_CP_DATA && !get_pages(sbi, type) && + wq_has_sleeper(&sbi->cp_wait)) + wake_up(&sbi->cp_wait); + if (f2fs_in_warm_node_list(sbi, folio)) f2fs_del_fsync_node_entry(sbi, folio); folio_clear_f2fs_gcing(folio); folio_end_writeback(folio); } - if (!get_pages(sbi, F2FS_WB_CP_DATA) && - wq_has_sleeper(&sbi->cp_wait)) - wake_up(&sbi->cp_wait); bio_put(bio); } @@ -511,6 +547,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { + if (!bio) + return; + WARN_ON_ONCE(!is_read_io(bio_op(bio))); trace_f2fs_submit_read_bio(sbi->sb, type, bio); @@ -597,7 +636,8 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) for (j = HOT; j < n; j++) { struct f2fs_bio_info *io = &sbi->write_io[i][j]; - init_f2fs_rwsem(&io->io_rwsem); + init_f2fs_rwsem_trace(&io->io_rwsem, sbi, + LOCK_NAME_IO_RWSEM); io->sbi = sbi; io->bio = NULL; io->last_block_in_bio = 0; @@ -621,8 +661,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; + struct f2fs_lock_context lc; - f2fs_down_write(&io->io_rwsem); + f2fs_down_write_trace(&io->io_rwsem, &lc); if (!io->bio) goto unlock_out; @@ -636,27 +677,37 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, } __submit_merged_bio(io); unlock_out: - f2fs_up_write(&io->io_rwsem); + f2fs_up_write_trace(&io->io_rwsem, &lc); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, - nid_t ino, enum page_type type, bool force) + nid_t ino, enum page_type type, bool writeback) { enum temp_type temp; bool ret = true; + bool force = !inode && !folio && !ino; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { if (!force) { enum 
page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; + struct f2fs_lock_context lc; - f2fs_down_read(&io->io_rwsem); + f2fs_down_read_trace(&io->io_rwsem, &lc); ret = __has_merged_page(io->bio, inode, folio, ino); - f2fs_up_read(&io->io_rwsem); + f2fs_up_read_trace(&io->io_rwsem, &lc); } - if (ret) + if (ret) { __f2fs_submit_merged_write(sbi, type, temp); + /* + * For the waiting-writeback case, if the bio owned by the + * folio is already submitted, we do not need to submit + * other types of bios. + */ + if (writeback) + break; + } /* TODO: use HOT temp only for meta pages now. */ if (type >= META) @@ -666,7 +717,7 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true); + __submit_merged_write_cond(sbi, NULL, NULL, 0, type, false); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -676,6 +727,12 @@ void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, __submit_merged_write_cond(sbi, inode, folio, ino, type, false); } +void f2fs_submit_merged_write_folio(struct f2fs_sb_info *sbi, + struct folio *folio, enum page_type type) +{ + __submit_merged_write_cond(sbi, NULL, folio, 0, type, true); +} + void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { f2fs_submit_merged_write(sbi, DATA); @@ -949,11 +1006,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct folio *bio_folio; + struct f2fs_lock_context lc; enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); - f2fs_down_write(&io->io_rwsem); + f2fs_down_write_trace(&io->io_rwsem, &lc); next: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { @@ -1035,7 +1093,7 @@ out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); - f2fs_up_write(&io->io_rwsem); + f2fs_up_write_trace(&io->io_rwsem, &lc); } static struct bio *f2fs_grab_read_bio(struct inode *inode, @@ -1212,11 +1270,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, struct dnode_of_data dn; struct folio *folio; int err; - +retry: folio = f2fs_grab_cache_folio(mapping, index, for_write); if (IS_ERR(folio)) return folio; + if (folio_test_large(folio)) { + pgoff_t folio_index = mapping_align_index(mapping, index); + + f2fs_folio_put(folio, true); + invalidate_inode_pages2_range(mapping, folio_index, + folio_index + folio_nr_pages(folio) - 1); + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + goto retry; + } + if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, @@ -1428,34 +1496,37 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return 0; } -static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) +static void f2fs_map_lock(struct f2fs_sb_info *sbi, + struct f2fs_lock_context *lc, + int flag) { - f2fs_down_read(&sbi->cp_enable_rwsem); if (flag == F2FS_GET_BLOCK_PRE_AIO) - f2fs_down_read(&sbi->node_change); + f2fs_down_read_trace(&sbi->node_change, lc); else - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, lc); } -static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) +static void f2fs_map_unlock(struct f2fs_sb_info *sbi, + struct f2fs_lock_context *lc, + int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) - f2fs_up_read(&sbi->node_change); + f2fs_up_read_trace(&sbi->node_change, lc); else - f2fs_unlock_op(sbi); - f2fs_up_read(&sbi->cp_enable_rwsem); + f2fs_unlock_op(sbi, lc); } int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_lock_context lc; int err = 0; - f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_lock(sbi, &lc, 
F2FS_GET_BLOCK_PRE_AIO); if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, &dn->data_blkaddr)) err = f2fs_reserve_block(dn, index); - f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_unlock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); return err; } @@ -1546,6 +1617,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; @@ -1584,7 +1656,7 @@ next_dnode: if (map->m_may_create) { if (f2fs_lfs_mode(sbi)) f2fs_balance_fs(sbi, true); - f2fs_map_lock(sbi, flag); + f2fs_map_lock(sbi, &lc, flag); } /* When reading holes, we need its node page */ @@ -1750,7 +1822,7 @@ skip: f2fs_put_dnode(&dn); if (map->m_may_create) { - f2fs_map_unlock(sbi, flag); + f2fs_map_unlock(sbi, &lc, flag); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -1797,7 +1869,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { - f2fs_map_unlock(sbi, flag); + f2fs_map_unlock(sbi, &lc, flag); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -1805,7 +1877,8 @@ out: return err; } -bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +static bool __f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len, + bool check_first) { struct f2fs_map_blocks map; block_t last_lblk; @@ -1827,10 +1900,17 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) if (err || map.m_len == 0) return false; map.m_lblk += map.m_len; + if (check_first) + break; } return true; } +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + return __f2fs_overwrite_io(inode, pos, len, false); +} + static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -2104,10 +2184,13 @@ static int f2fs_read_single_page(struct inode *inode, struct 
fsverity_info *vi, /* * Map blocks using the previous result first. */ - if ((map->m_flags & F2FS_MAP_MAPPED) && - block_in_file > map->m_lblk && + if (map->m_flags & F2FS_MAP_MAPPED) { + if (block_in_file > map->m_lblk && block_in_file < (map->m_lblk + map->m_len)) + goto got_it; + } else if (block_in_file < *map->m_next_pgofs) { goto got_it; + } /* * Then do more f2fs_map_blocks() calls until we are @@ -2343,6 +2426,185 @@ out: } #endif +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio) +{ + struct f2fs_folio_state *ffs = folio->private; + + if (ffs) + return ffs; + + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, + GFP_NOIO | __GFP_ZERO, true, NULL); + + spin_lock_init(&ffs->state_lock); + folio_attach_private(folio, ffs); + return ffs; +} + +static void ffs_detach_free(struct folio *folio) +{ + struct f2fs_folio_state *ffs; + + if (!folio_test_large(folio)) { + folio_detach_private(folio); + return; + } + + ffs = folio_detach_private(folio); + if (!ffs) + return; + + WARN_ON_ONCE(ffs->read_pages_pending != 0); + kmem_cache_free(ffs_entry_slab, ffs); +} + +static int f2fs_read_data_large_folio(struct inode *inode, + struct fsverity_info *vi, + struct readahead_control *rac, struct folio *folio) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct f2fs_map_blocks map = {0, }; + pgoff_t index, offset, next_pgofs = 0; + unsigned max_nr_pages = rac ? 
readahead_count(rac) : + folio_nr_pages(folio); + unsigned nrpages; + struct f2fs_folio_state *ffs; + int ret = 0; + bool folio_in_bio; + + if (!IS_IMMUTABLE(inode) || f2fs_compressed_file(inode)) { + if (folio) + folio_unlock(folio); + return -EOPNOTSUPP; + } + + map.m_seg_type = NO_CHECK_TYPE; + + if (rac) + folio = readahead_folio(rac); +next_folio: + if (!folio) + goto out; + + folio_in_bio = false; + index = folio->index; + offset = 0; + ffs = NULL; + nrpages = folio_nr_pages(folio); + + for (; nrpages; nrpages--, max_nr_pages--, index++, offset++) { + sector_t block_nr; + /* + * Map blocks using the previous result first. + */ + if (map.m_flags & F2FS_MAP_MAPPED) { + if (index > map.m_lblk && + index < (map.m_lblk + map.m_len)) + goto got_it; + } else if (index < next_pgofs) { + /* hole case */ + goto got_it; + } + + /* + * Then do more f2fs_map_blocks() calls until we are + * done with this page. + */ + memset(&map, 0, sizeof(map)); + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; + map.m_lblk = index; + map.m_len = max_nr_pages; + + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); + if (ret) + goto err_out; +got_it: + if ((map.m_flags & F2FS_MAP_MAPPED)) { + block_nr = map.m_pblk + index - map.m_lblk; + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC_ENHANCE_READ)) { + ret = -EFSCORRUPTED; + goto err_out; + } + } else { + size_t page_offset = offset << PAGE_SHIFT; + folio_zero_range(folio, page_offset, PAGE_SIZE); + if (vi && !fsverity_verify_blocks(vi, folio, PAGE_SIZE, page_offset)) { + ret = -EIO; + goto err_out; + } + continue; + } + + /* We must increment read_pages_pending before possible BIOs submitting + * to prevent from premature folio_end_read() call on folio + */ + if (folio_test_large(folio)) { + ffs = ffs_find_or_alloc(folio); + + /* set the bitmap to wait */ + spin_lock_irq(&ffs->state_lock); + ffs->read_pages_pending++; + spin_unlock_irq(&ffs->state_lock); + } + + /* + * This page will go 
to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, + last_block_in_bio, block_nr) || + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) { +submit_and_realloc: + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + bio = NULL; + } + if (bio == NULL) + bio = f2fs_grab_read_bio(inode, vi, + block_nr, max_nr_pages, + f2fs_ra_op_flags(rac), + index, false); + + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. + */ + f2fs_wait_on_block_writeback(inode, block_nr); + + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE, + offset << PAGE_SHIFT)) + goto submit_and_realloc; + + folio_in_bio = true; + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); + last_block_in_bio = block_nr; + } + trace_f2fs_read_folio(folio, DATA); +err_out: + if (!folio_in_bio) { + folio_end_read(folio, !ret); + if (ret) + return ret; + } + if (rac) { + folio = readahead_folio(rac); + goto next_folio; + } +out: + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + if (ret) { + /* Wait bios and clear uptodate. */ + folio_lock(folio); + folio_clear_uptodate(folio); + folio_unlock(folio); + } + return ret; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -2367,10 +2629,15 @@ static int f2fs_mpage_readpages(struct inode *inode, struct fsverity_info *vi, pgoff_t nc_cluster_idx = NULL_CLUSTER; pgoff_t index; #endif + pgoff_t next_pgofs = 0; unsigned nr_pages = rac ? readahead_count(rac) : 1; + struct address_space *mapping = rac ? rac->mapping : folio->mapping; unsigned max_nr_pages = nr_pages; int ret = 0; + if (mapping_large_folio_support(mapping)) + return f2fs_read_data_large_folio(inode, vi, rac, folio); + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { index = rac ? 
readahead_index(rac) : folio->index; @@ -2383,7 +2650,7 @@ static int f2fs_mpage_readpages(struct inode *inode, struct fsverity_info *vi, map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - map.m_next_pgofs = NULL; + map.m_next_pgofs = &next_pgofs; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; @@ -2464,8 +2731,7 @@ next_page: } #endif } - if (bio) - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } @@ -2663,6 +2929,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct inode *inode = folio->mapping->host; struct dnode_of_data dn; struct node_info ni; + struct f2fs_lock_context lc; bool ipu_force = false; bool atomic_commit; int err = 0; @@ -2687,8 +2954,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) goto got_it; } + if (is_sbi_flag_set(fio->sbi, SBI_ENABLE_CHECKPOINT) && + time_to_inject(fio->sbi, FAULT_SKIP_WRITE)) + return -EINVAL; + /* Deadlock due to between page->lock and f2fs_lock_op */ - if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi, &lc)) return -EAGAIN; err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE); @@ -2729,7 +3000,7 @@ got_it: folio_start_writeback(folio); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) - f2fs_unlock_op(fio->sbi); + f2fs_unlock_op(fio->sbi, &lc); err = f2fs_inplace_write_data(fio); if (err) { if (fscrypt_inode_uses_fs_layer_crypto(inode)) @@ -2743,7 +3014,7 @@ got_it: } if (fio->need_lock == LOCK_RETRY) { - if (!f2fs_trylock_op(fio->sbi)) { + if (!f2fs_trylock_op(fio->sbi, &lc)) { err = -EAGAIN; goto out_writepage; } @@ -2775,7 +3046,7 @@ out_writepage: f2fs_put_dnode(&dn); out: if (fio->need_lock == LOCK_REQ) - f2fs_unlock_op(fio->sbi); + f2fs_unlock_op(fio->sbi, &lc); return err; } @@ -2855,19 +3126,21 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted, write: /* Dentry/quota blocks are 
controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || quota_inode) { + struct f2fs_lock_context lc; + /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ if (quota_inode) - f2fs_down_read(&sbi->node_write); + f2fs_down_read_trace(&sbi->node_write, &lc); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (quota_inode) - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); goto done; } @@ -3237,6 +3510,8 @@ static inline bool __should_serialize_io(struct inode *inode, if (IS_NOQUOTA(inode)) return false; + if (f2fs_is_pinned_file(inode)) + return false; if (f2fs_need_compress_data(inode)) return true; if (wbc->sync_mode != WB_SYNC_ALL) @@ -3259,6 +3534,16 @@ static inline void account_writeback(struct inode *inode, bool inc) f2fs_up_read(&F2FS_I(inode)->i_sem); } +static inline void update_skipped_write(struct f2fs_sb_info *sbi, + struct writeback_control *wbc) +{ + long skipped = wbc->pages_skipped; + + if (is_sbi_flag_set(sbi, SBI_ENABLE_CHECKPOINT) && skipped && + wbc->sync_mode == WB_SYNC_ALL) + atomic_add(skipped, &sbi->nr_pages[F2FS_SKIPPED_WRITE]); +} + static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) @@ -3323,10 +3608,19 @@ static int __f2fs_write_data_pages(struct address_space *mapping, */ f2fs_remove_dirty_inode(inode); + + /* + * f2fs_write_cache_pages() has retry logic for EAGAIN case which is + * common when racing w/ checkpoint, so only update skipped write + * when ret is non-zero. 
+ */ + if (ret) + update_skipped_write(sbi, wbc); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); + update_skipped_write(sbi, wbc); trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } @@ -3368,6 +3662,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct inode *inode = folio->mapping->host; pgoff_t index = folio->index; struct dnode_of_data dn; + struct f2fs_lock_context lc; struct folio *ifolio; bool locked = false; int flag = F2FS_GET_BLOCK_PRE_AIO; @@ -3384,10 +3679,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode)) { if (pos + len > MAX_INLINE_DATA(inode)) flag = F2FS_GET_BLOCK_DEFAULT; - f2fs_map_lock(sbi, flag); + f2fs_map_lock(sbi, &lc, flag); locked = true; } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_map_lock(sbi, flag); + f2fs_map_lock(sbi, &lc, flag); locked = true; } @@ -3431,7 +3726,7 @@ restart: if (!err && dn.data_blkaddr != NULL_ADDR) goto out; f2fs_put_dnode(&dn); - f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; @@ -3445,7 +3740,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_map_unlock(sbi, flag); + f2fs_map_unlock(sbi, &lc, flag); return err; } @@ -3481,10 +3776,11 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; + struct f2fs_lock_context lc; struct folio *ifolio; int err = 0; - f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); if (IS_ERR(ifolio)) { @@ -3502,7 +3798,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, f2fs_put_dnode(&dn); unlock_out: - f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); + f2fs_map_unlock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO); return err; } @@ -3761,7 +4057,12 @@ void 
f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) f2fs_remove_dirty_inode(inode); } } - folio_detach_private(folio); + + if (offset || length != folio_size(folio)) + return; + + folio_cancel_dirty(folio); + ffs_detach_free(folio); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) @@ -3770,7 +4071,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - folio_detach_private(folio); + ffs_detach_free(folio); return true; } @@ -3955,6 +4256,7 @@ static int check_swap_activate(struct swap_info_struct *sis, while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; + bool last_extent = false; retry: cond_resched(); @@ -3980,11 +4282,10 @@ retry: pblock = map.m_pblk; nr_pblocks = map.m_len; - if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || - nr_pblocks % blks_per_sec || - f2fs_is_sequential_zone_area(sbi, pblock)) { - bool last_extent = false; - + if (!last_extent && + ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || + nr_pblocks % blks_per_sec || + f2fs_is_sequential_zone_area(sbi, pblock))) { not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); @@ -4005,8 +4306,8 @@ retry: goto out; } - if (!last_extent) - goto retry; + /* lookup block mapping info after block migration */ + goto retry; } if (cur_lblock + nr_pblocks >= sis->max) @@ -4176,12 +4477,25 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - return bio_entry_slab ? 
0 : -ENOMEM; + + if (!bio_entry_slab) + return -ENOMEM; + + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab", + sizeof(struct f2fs_folio_state)); + + if (!ffs_entry_slab) { + kmem_cache_destroy(bio_entry_slab); + return -ENOMEM; + } + + return 0; } void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(ffs_entry_slab); } static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -4207,7 +4521,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * f2fs_map_lock and f2fs_balance_fs are not necessary. */ if ((flags & IOMAP_WRITE) && - !f2fs_overwrite_io(inode, offset, length)) + !__f2fs_overwrite_io(inode, offset, length, true)) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 032683835569..8e1040e375a7 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -423,6 +423,7 @@ static const char *s_flag[MAX_SBI_FLAG] = { [SBI_IS_RESIZEFS] = "resizefs", [SBI_IS_FREEZING] = "freezefs", [SBI_IS_WRITABLE] = "writable", + [SBI_ENABLE_CHECKPOINT] = "enable_checkpoint", }; static const char *ipu_mode_names[F2FS_IPU_MAX] = { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a90a62cfe617..bb34e864d0ef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -54,7 +54,7 @@ enum { FAULT_TRUNCATE, FAULT_READ_IO, FAULT_CHECKPOINT, - FAULT_DISCARD, + FAULT_DISCARD, /* it's obsolete due to __blkdev_issue_discard() will never fail */ FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, @@ -63,8 +63,10 @@ enum { FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, FAULT_INCONSISTENT_FOOTER, - FAULT_TIMEOUT, + FAULT_ATOMIC_TIMEOUT, FAULT_VMALLOC, + FAULT_LOCK_TIMEOUT, + FAULT_SKIP_WRITE, FAULT_MAX, }; @@ -72,7 +74,8 @@ enum { enum fault_option { FAULT_RATE = 1, /* only update fault rate */ FAULT_TYPE = 2, /* only update fault type */ - FAULT_ALL = 4, /* reset all fault injection options/stats */ + FAULT_TIMEOUT = 4, 
/* only update fault timeout type */ + FAULT_ALL = 8, /* reset all fault injection options/stats */ }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -82,6 +85,7 @@ struct f2fs_fault_info { unsigned int inject_type; /* Used to account total count of injection for each type */ unsigned int inject_count[FAULT_MAX]; + unsigned int inject_lock_timeout; /* inject lock timeout */ }; extern const char *f2fs_fault_name[FAULT_MAX]; @@ -173,6 +177,26 @@ enum device_allocation_policy { ALLOCATE_FORWARD_FROM_HINT, }; +enum f2fs_lock_name { + LOCK_NAME_NONE, + LOCK_NAME_CP_RWSEM, + LOCK_NAME_NODE_CHANGE, + LOCK_NAME_NODE_WRITE, + LOCK_NAME_GC_LOCK, + LOCK_NAME_CP_GLOBAL, + LOCK_NAME_IO_RWSEM, + LOCK_NAME_MAX, +}; + +enum f2fs_timeout_type { + TIMEOUT_TYPE_NONE, + TIMEOUT_TYPE_RUNNING, + TIMEOUT_TYPE_IO_SLEEP, + TIMEOUT_TYPE_NONIO_SLEEP, + TIMEOUT_TYPE_RUNNABLE, + TIMEOUT_TYPE_MAX, +}; + /* * An implementation of an rwsem that is explicitly unfair to readers. This * prevents priority inversion when a low-priority reader acquires the read lock @@ -181,6 +205,8 @@ enum device_allocation_policy { */ struct f2fs_rwsem { + struct f2fs_sb_info *sbi; + enum f2fs_lock_name name; struct rw_semaphore internal_rwsem; #ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_queue_head_t read_waiters; @@ -287,7 +313,6 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ -#define DEF_ENABLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ @@ -295,7 +320,9 @@ enum cp_time { CP_TIME_START, /* begin */ CP_TIME_LOCK, /* after cp_global_sem */ CP_TIME_OP_LOCK, /* after block_operation */ - CP_TIME_FLUSH_META, /* after flush sit/nat */ + CP_TIME_MERGE_WRITE, /* after flush DATA/NODE/META */ + CP_TIME_FLUSH_NAT, /* after flush nat */ + CP_TIME_FLUSH_SIT, /* after flush sit */ CP_TIME_SYNC_META, /* after sync_meta_pages */ CP_TIME_SYNC_CP_META, /* after sync cp meta pages 
*/ CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ @@ -521,13 +548,25 @@ struct fsync_inode_entry { #define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) #define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) \ + (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].ne) +#define nid_in_journal(jnl, i) \ + (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].nid) +#define sit_in_journal(jnl, i) \ + (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].se) +#define segno_in_journal(jnl, i) \ + (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].segno) -#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) -#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) +#define sum_entries(sum) ((struct f2fs_summary *)(sum)) +#define sum_journal(sbi, sum) \ + ((struct f2fs_journal *)((char *)(sum) + \ + ((sbi)->entries_in_sum * sizeof(struct f2fs_summary)))) +#define sum_footer(sbi, sum) \ + ((struct summary_footer *)((char *)(sum) + (sbi)->sum_blocksize - \ + sizeof(struct summary_footer))) + +#define MAX_NAT_JENTRIES(sbi, jnl) ((sbi)->nat_journal_entries - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(sbi, jnl) ((sbi)->sit_journal_entries - sits_in_cursum(jnl)) static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { @@ -545,14 +584,6 @@ static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) return before; } -static inline bool __has_cursum_space(struct f2fs_journal *journal, - int size, int type) -{ - if (type == NAT_JOURNAL) - return size <= MAX_NAT_JENTRIES(journal); - return size <= MAX_SIT_JENTRIES(journal); -} - /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int 
get_extra_isize(struct inode *inode); @@ -669,8 +700,10 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ -/* IO/non-IO congestion wait timeout value, default: 1ms */ -#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(1)) +#define MAX_FLUSH_RETRY_COUNT 3 /* maximum flush retry count in f2fs_enable_checkpoint() */ + +/* IO/non-IO congestion wait timeout value, default: 1 jiffies */ +#define DEFAULT_SCHEDULE_TIMEOUT 1 /* timeout value injected, default: 1000ms */ #define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) @@ -1208,6 +1241,7 @@ enum count_type { F2FS_RD_META, F2FS_DIO_WRITE, F2FS_DIO_READ, + F2FS_SKIPPED_WRITE, /* skip or fail during f2fs_enable_checkpoint() */ NR_COUNT_TYPE, }; @@ -1396,6 +1430,27 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_time_stat { + unsigned long long total_time; /* total wall clock time */ +#ifdef CONFIG_64BIT + unsigned long long running_time; /* running time */ +#endif +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS) + unsigned long long runnable_time; /* runnable(including preempted) time */ +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + unsigned long long io_sleep_time; /* IO sleep time */ +#endif +}; + +struct f2fs_lock_context { + struct f2fs_time_stat ts; + int orig_nice; + int new_nice; + bool lock_trace; + bool need_restore; +}; + struct f2fs_gc_control { unsigned int victim_segno; /* target victim segment number */ int init_gc_type; /* FG_GC or BG_GC */ @@ -1404,6 +1459,7 @@ struct f2fs_gc_control { bool err_gc_skipped; /* return EAGAIN if GC skipped */ bool one_time; /* require one time GC in one migration unit */ unsigned int nr_free_secs; /* # of free sections to do GC */ + struct f2fs_lock_context lc; /* lock context for gc_lock */ }; /* @@ -1427,6 +1483,7 @@ enum { SBI_IS_RESIZEFS, /* resizefs is in process */ SBI_IS_FREEZING, /* freezefs is in process */ SBI_IS_WRITABLE, /* remove ro mountoption transiently */ + 
SBI_ENABLE_CHECKPOINT, /* indicate it's during f2fs_enable_checkpoint() */ MAX_SBI_FLAG, }; @@ -1436,7 +1493,6 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, - ENABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -1522,6 +1578,20 @@ enum f2fs_lookup_mode { LOOKUP_AUTO, }; +/* For node type in __get_node_folio() */ +enum node_type { + NODE_TYPE_REGULAR, + NODE_TYPE_INODE, + NODE_TYPE_XATTR, + NODE_TYPE_NON_INODE, +}; + +/* a threshold of maximum elapsed time in critical region to print tracepoint */ +#define MAX_LOCK_ELAPSED_TIME 500 + +#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO) +#define F2FS_CRITICAL_TASK_PRIORITY NICE_TO_PRIO(0) + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1714,7 +1784,6 @@ struct f2fs_sb_info { long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct cp_stats cp_stats; /* for time stat of checkpoint */ - struct f2fs_rwsem cp_enable_rwsem; /* block cache/dio write */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -1760,7 +1829,16 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ bool readdir_ra; /* readahead inode in readdir */ - u64 max_io_bytes; /* max io bytes to merge IOs */ + unsigned int max_io_bytes; /* max io bytes to merge IOs */ + + /* variable summary block units */ + unsigned int sum_blocksize; /* sum block size */ + unsigned int sums_per_block; /* sum block count per block */ + unsigned int entries_in_sum; /* entry count in sum block */ + unsigned int sum_entry_size; /* total entry size in sum block */ + unsigned int sum_journal_size; /* journal size in sum block */ + unsigned int nat_journal_entries; /* nat journal entry count in the journal */ + unsigned int sit_journal_entries; /* sit journal entry 
count in the journal */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -1908,7 +1986,7 @@ struct f2fs_sb_info { unsigned int gc_segment_mode; /* GC state for reclaimed segments */ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ - unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + unsigned int seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ int max_fragment_chunk; /* max chunk size for block fragmentation mode */ int max_fragment_hole; /* max hole size for block fragmentation mode */ @@ -1922,6 +2000,18 @@ struct f2fs_sb_info { /* carve out reserved_blocks from total blocks */ bool carve_out; + /* max elapsed time threshold in critical region that lock covered */ + unsigned long long max_lock_elapsed_time; + + /* enable/disable to adjust task priority in critical region covered by lock */ + unsigned int adjust_lock_priority; + + /* adjust priority for task which is in critical region covered by lock */ + unsigned int lock_duration_priority; + + /* priority for critical task, e.g. 
f2fs_ckpt, f2fs_gc threads */ + long critical_task_priority; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -2261,16 +2351,22 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } -#define init_f2fs_rwsem(sem) \ +#define init_f2fs_rwsem(sem) __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE) +#define init_f2fs_rwsem_trace __init_f2fs_rwsem + +#define __init_f2fs_rwsem(sem, sbi, name) \ do { \ static struct lock_class_key __key; \ \ - __init_f2fs_rwsem((sem), #sem, &__key); \ + do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name); \ } while (0) -static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, - const char *sem_name, struct lock_class_key *key) +static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key, + struct f2fs_sb_info *sbi, enum f2fs_lock_name name) { + sem->sbi = sbi; + sem->name = name; __init_rwsem(&sem->internal_rwsem, sem_name, key); #ifdef CONFIG_F2FS_UNFAIR_RWSEM init_waitqueue_head(&sem->read_waiters); @@ -2339,6 +2435,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem) #endif } +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc); +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); +void f2fs_down_write_trace(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc); +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, + struct f2fs_lock_context *lc); +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc); + static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { unsigned long flags; @@ -2369,33 +2475,6 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; } -static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) -{ - f2fs_down_read(&sbi->cp_rwsem); -} - -static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) -{ - if (time_to_inject(sbi, FAULT_LOCK_OP)) - return 0; - return f2fs_down_read_trylock(&sbi->cp_rwsem); -} - -static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) -{ - f2fs_up_read(&sbi->cp_rwsem); -} - -static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) -{ - f2fs_down_write(&sbi->cp_rwsem); -} - -static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) -{ - f2fs_up_write(&sbi->cp_rwsem); -} - static inline int __get_cp_reason(struct f2fs_sb_info *sbi) { int reason = CP_SYNC; @@ -2811,6 +2890,14 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +static inline bool __has_cursum_space(struct f2fs_sb_info *sbi, + struct f2fs_journal *journal, unsigned int size, int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(sbi, journal); + return size <= MAX_SIT_JENTRIES(sbi, journal); +} + extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) @@ -3722,7 +3809,7 @@ void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_remove_donate_inode(struct inode *inode); void f2fs_evict_inode(struct inode *inode); -void f2fs_handle_failed_inode(struct inode *inode); +void f2fs_handle_failed_inode(struct inode *inode, struct f2fs_lock_context *lc); /* * namei.c @@ -3855,6 +3942,9 @@ struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, enum node_type node_type); +int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi, + 
struct folio *folio, pgoff_t nid, + enum node_type ntype, bool in_irq); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); @@ -3954,7 +4044,8 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +int f2fs_lookup_journal_in_cursum(struct f2fs_sb_info *sbi, + struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); @@ -3989,6 +4080,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) /* * checkpoint.c */ +void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); +int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); +void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc); void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); @@ -4004,8 +4098,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, unsigned int ra_blocks); -long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write, enum iostat_type io_type); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write, + enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void 
f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); @@ -4050,6 +4144,8 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct folio *folio, nid_t ino, enum page_type type); +void f2fs_submit_merged_write_folio(struct f2fs_sb_info *sbi, + struct folio *folio, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct folio *folio); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); @@ -4887,6 +4983,7 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, #ifdef CONFIG_F2FS_FAULT_INJECTION extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, enum fault_option fo); +extern void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi); #else static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, unsigned long type, @@ -4894,6 +4991,10 @@ static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, { return 0; } +static inline void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi) +{ + return; +} #endif static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) @@ -4909,6 +5010,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_quota_file(struct f2fs_sb_info *sbi, nid_t ino) +{ +#ifdef CONFIG_QUOTA + int i; + + if (!f2fs_sb_has_quota_ino(sbi)) + return false; + + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i) == ino) + return true; + } +#endif + return false; +} + static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; @@ -4928,16 +5045,14 @@ static inline void __f2fs_schedule_timeout(long timeout, bool io) #define f2fs_schedule_timeout(timeout) \ __f2fs_schedule_timeout(timeout, false) -static inline void 
f2fs_io_schedule_timeout_killable(long timeout) +static inline void f2fs_schedule_timeout_killable(long timeout, bool io) { - while (timeout) { + unsigned long last_time = jiffies + timeout; + + while (time_before(jiffies, last_time)) { /* wrap-safe deadline check: a raw '<' misfires when jiffies overflows */ if (fatal_signal_pending(current)) return; - set_current_state(TASK_UNINTERRUPTIBLE); - io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); - if (timeout <= DEFAULT_SCHEDULE_TIMEOUT) - return; - timeout -= DEFAULT_SCHEDULE_TIMEOUT; + __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT, io); } } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1fdbe18692be..c8a2f17a8f11 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -626,6 +626,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; + if (mapping_large_folio_support(inode->i_mapping) && + filp->f_mode & FMODE_WRITE) + return -EOPNOTSUPP; + err = fsverity_file_open(inode, filp); if (err) return err; @@ -772,6 +776,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; + struct f2fs_lock_context lc; pgoff_t free_from; int count = 0, err = 0; struct folio *ifolio; @@ -790,7 +795,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto free_partial; if (lock) - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); if (IS_ERR(ifolio)) { @@ -841,7 +846,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); free_partial: /* lastly zero out the first data page */ if (!err) @@ -1112,11 +1117,13 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { - f2fs_lock_op(sbi); + struct f2fs_lock_context lc; + + f2fs_lock_op(sbi, &lc); err = dquot_transfer(idmap, inode, attr); if (err) { 
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } /* @@ -1126,7 +1133,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); } if (attr->ia_valid & ATTR_SIZE) { @@ -1210,15 +1217,16 @@ static int fill_zero(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct folio *folio; + struct f2fs_lock_context lc; if (!len) return 0; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); folio = f2fs_get_new_data_folio(inode, NULL, index, false); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -1301,6 +1309,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (pg_start < pg_end) { loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; f2fs_balance_fs(sbi, true); @@ -1312,9 +1321,9 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache_range(inode, blk_start, blk_end - 1); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); ret = f2fs_truncate_hole(inode, pg_start, pg_end); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1546,6 +1555,7 @@ roll_back: static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; pgoff_t nrpages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); pgoff_t start = offset >> PAGE_SHIFT; pgoff_t end = (offset + len) >> PAGE_SHIFT; @@ -1559,11 +1569,11 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_zero_post_eof_page(inode, offset + len, false); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, 
&lc); f2fs_drop_extent_tree(inode); truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1711,6 +1721,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; + struct f2fs_lock_context lc; unsigned int end_offset; pgoff_t end; @@ -1721,12 +1732,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, (loff_t)index << PAGE_SHIFT, ((loff_t)pg_end << PAGE_SHIFT) - 1); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; @@ -1738,7 +1749,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); filemap_invalidate_unlock(mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1821,17 +1832,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { + struct f2fs_lock_context lc; + nr = idx - pg_start; if (nr > delta) nr = delta; idx -= nr; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); f2fs_drop_extent_tree(inode); ret = __exchange_data_block(inode, inode, idx, idx + delta, nr, false); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); } filemap_invalidate_unlock(mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1913,7 +1926,7 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, sbi->reserved_pin_section)) { - f2fs_down_write(&sbi->gc_lock); + 
f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err && err != -ENODATA) { @@ -2448,7 +2461,7 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_sync_meta_pages(sbi, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: @@ -2764,12 +2777,13 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, + &gc_control.lc)) { ret = -EBUSY; goto out; } } else { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); } gc_control.init_gc_type = sync ? FG_GC : BG_GC; @@ -2809,12 +2823,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) do_more: if (!range->sync) { - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &gc_control.lc)) { ret = -EBUSY; goto out; } } else { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); } gc_control.victim_segno = GET_SEGNO(sbi, range->start); @@ -3087,6 +3101,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, struct inode *src = file_inode(file_in); struct inode *dst = file_inode(file_out); struct f2fs_sb_info *sbi = F2FS_I_SB(src); + struct f2fs_lock_context lc; size_t olen = len, dst_max_i_size = 0; size_t dst_osize; int ret; @@ -3182,7 +3197,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out_src; } - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); ret = __exchange_data_block(src, dst, F2FS_BYTES_TO_BLK(pos_in), F2FS_BYTES_TO_BLK(pos_out), F2FS_BYTES_TO_BLK(len), false); @@ -3193,7 
+3208,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, else if (dst_osize != dst->i_size) f2fs_i_size_write(dst, dst_osize); } - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (src != dst) f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); @@ -3304,7 +3319,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &gc_control.lc)) { ret = -EBUSY; goto out; } @@ -3361,6 +3376,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; + struct f2fs_lock_context lc; kprojid_t kprojid; int err; @@ -3391,7 +3407,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) return err; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_transfer_project_quota(inode, kprojid); if (err) goto out_unlock; @@ -3400,7 +3416,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } #else @@ -3833,6 +3849,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; pgoff_t page_idx = 0, last_idx; unsigned int released_blocks = 0; int ret; @@ -3887,12 +3904,12 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) struct dnode_of_data dn; pgoff_t end_offset, count; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); if (ret) { - 
f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret == -ENOENT) { page_idx = f2fs_get_next_page_offset(&dn, page_idx); @@ -3910,7 +3927,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret < 0) break; @@ -4063,14 +4080,15 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) while (page_idx < last_idx) { struct dnode_of_data dn; + struct f2fs_lock_context lc; pgoff_t end_offset, count; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); if (ret) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret == -ENOENT) { page_idx = f2fs_get_next_page_offset(&dn, page_idx); @@ -4088,7 +4106,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (ret < 0) break; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 384fa7e2085b..f46b2673d31f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -102,21 +102,22 @@ static int gc_thread_func(void *data) if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); goto do_gc; } if (foreground) { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); goto do_gc; - } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + } else if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, + &gc_control.lc)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &gc_control.lc); stat_io_skip_bggc_count(sbi); goto next; } @@ -125,7 +126,8 @@ static int gc_thread_func(void *data) if (has_enough_free_blocks(sbi, 
gc_th->no_zoned_gc_percent)) { wait_ms = gc_th->no_gc_sleep_time; - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, + &gc_control.lc); goto next; } if (wait_ms == gc_th->no_gc_sleep_time) @@ -232,6 +234,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) return err; } + set_user_nice(gc_th->f2fs_gc_task, + PRIO_TO_NICE(sbi->critical_task_priority)); return 0; } @@ -1031,7 +1035,8 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * ignore that. */ static int gc_node_segment(struct f2fs_sb_info *sbi, - struct f2fs_summary *sum, unsigned int segno, int gc_type) + struct f2fs_summary *sum, unsigned int segno, int gc_type, + struct blk_plug *plug) { struct f2fs_summary *entry; block_t start_addr; @@ -1100,8 +1105,11 @@ next_step: stat_inc_node_blk_count(sbi, 1, gc_type); } - if (++phase < 3) + if (++phase < 3) { + blk_finish_plug(plug); + blk_start_plug(plug); goto next_step; + } if (fggc) atomic_dec(&sbi->wb_sync_req[NODE]); @@ -1453,7 +1461,11 @@ up_out: put_out: f2fs_put_dnode(&dn); out: - f2fs_folio_put(folio, true); + if (!folio_test_uptodate(folio)) + __folio_set_dropbehind(folio); + folio_unlock(folio); + folio_end_dropbehind(folio); + folio_put(folio); return err; } @@ -1535,7 +1547,7 @@ out: */ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type, - bool force_migrate) + bool force_migrate, struct blk_plug *plug) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -1703,8 +1715,11 @@ next_step: } } - if (++phase < 5) + if (++phase < 5) { + blk_finish_plug(plug); + blk_start_plug(plug); goto next_step; + } return submitted; } @@ -1769,8 +1784,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); - segno = rounddown(segno, SUMS_PER_BLOCK); - sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK); + segno = rounddown(segno, sbi->sums_per_block); + sum_blk_cnt 
= DIV_ROUND_UP(end_segno - segno, sbi->sums_per_block); /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), @@ -1780,17 +1795,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, while (segno < end_segno) { struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno); - segno += SUMS_PER_BLOCK; + segno += sbi->sums_per_block; if (IS_ERR(sum_folio)) { int err = PTR_ERR(sum_folio); - end_segno = segno - SUMS_PER_BLOCK; - segno = rounddown(start_segno, SUMS_PER_BLOCK); + end_segno = segno - sbi->sums_per_block; + segno = rounddown(start_segno, sbi->sums_per_block); while (segno < end_segno) { sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); folio_put_refs(sum_folio, 2); - segno += SUMS_PER_BLOCK; + segno += sbi->sums_per_block; } return err; } @@ -1806,8 +1821,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* find segment summary of victim */ struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK) - + SUMS_PER_BLOCK; + unsigned int block_end_segno = rounddown(segno, sbi->sums_per_block) + + sbi->sums_per_block; if (block_end_segno > end_segno) block_end_segno = end_segno; @@ -1833,12 +1848,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, migrated >= sbi->migration_granularity) continue; - sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno); - if (type != GET_SUM_TYPE((&sum->footer))) { + sum = SUM_BLK_PAGE_ADDR(sbi, sum_folio, cur_segno); + if (type != GET_SUM_TYPE(sum_footer(sbi, sum))) { f2fs_err(sbi, "Inconsistent segment (%u) type " "[%d, %d] in SSA and SIT", cur_segno, type, - GET_SUM_TYPE((&sum->footer))); + GET_SUM_TYPE( + sum_footer(sbi, sum))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); continue; @@ -1853,11 +1869,11 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, */ if (type == 
SUM_TYPE_NODE) submitted += gc_node_segment(sbi, sum->entries, - cur_segno, gc_type); + cur_segno, gc_type, &plug); else submitted += gc_data_segment(sbi, sum->entries, gc_list, cur_segno, - gc_type, force_migrate); + gc_type, force_migrate, &plug); stat_inc_gc_seg_count(sbi, data_type, gc_type); sbi->gc_reclaimed_segs[sbi->gc_mode]++; @@ -2000,7 +2016,7 @@ retry: goto stop; } - __get_secs_required(sbi, NULL, &upper_secs, NULL); + upper_secs = __get_secs_required(sbi); /* * Write checkpoint to reclaim prefree segments. @@ -2035,7 +2051,7 @@ stop: reserved_segments(sbi), prefree_segments(sbi)); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &gc_control->lc); put_gc_inode(&gc_list); @@ -2096,6 +2112,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + stat_inc_gc_call_count(sbi, FOREGROUND); for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) { struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), @@ -2251,6 +2268,9 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); __u64 old_block_count, shrunk_blocks; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + struct f2fs_lock_context lc; + struct f2fs_lock_context glc; + struct f2fs_lock_context clc; unsigned int secs; int err = 0; __u32 rem; @@ -2294,13 +2314,13 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ - if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &glc)) { err = -EAGAIN; goto out_drop_write; } /* stop CP to protect MAIN_SEC in free_segment_range */ - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2315,8 +2335,8 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) err = free_segment_range(sbi, secs, true); out_unlock: - 
f2fs_unlock_op(sbi); - f2fs_up_write(&sbi->gc_lock); + f2fs_unlock_op(sbi, &lc); + f2fs_up_write_trace(&sbi->gc_lock, &glc); out_drop_write: mnt_drop_write_file(filp); if (err) @@ -2333,8 +2353,8 @@ out_drop_write: return -EROFS; } - f2fs_down_write(&sbi->gc_lock); - f2fs_down_write(&sbi->cp_global_sem); + f2fs_down_write_trace(&sbi->gc_lock, &glc); + f2fs_down_write_trace(&sbi->cp_global_sem, &clc); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2382,8 +2402,8 @@ recover_out: spin_unlock(&sbi->stat_lock); } out_err: - f2fs_up_write(&sbi->cp_global_sem); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->cp_global_sem, &clc); + f2fs_up_write_trace(&sbi->gc_lock, &glc); thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); return err; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e5c6a08b7e4f..0a1052d5ee62 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -218,6 +218,7 @@ int f2fs_convert_inline_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; + struct f2fs_lock_context lc; struct folio *ifolio, *folio; int err = 0; @@ -235,7 +236,7 @@ int f2fs_convert_inline_inode(struct inode *inode) if (IS_ERR(folio)) return PTR_ERR(folio); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); if (IS_ERR(ifolio)) { @@ -250,7 +251,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_put_dnode(&dn); out: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_folio_put(folio, true); @@ -597,13 +598,14 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct folio *ifolio; struct f2fs_filename fname; + struct f2fs_lock_context lc; void *inline_dentry = NULL; int err = 0; if (!f2fs_has_inline_dentry(dir)) return 0; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_setup_filename(dir, &dentry->d_name, 0, &fname); if (err) @@ -628,7 +630,7 @@ int 
f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) out_fname: f2fs_free_filename(&fname); out: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ee332b994348..e0f850b3f0c3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (ret) goto bad_inode; make_now: + f2fs_set_inode_flags(inode); + if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); @@ -618,6 +620,9 @@ make_now: inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) && + !f2fs_quota_file(sbi, inode->i_ino)) + mapping_set_folio_min_order(inode->i_mapping, 0); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; @@ -638,7 +643,6 @@ make_now: ret = -EIO; goto bad_inode; } - f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); @@ -906,9 +910,11 @@ retry: err = -EIO; if (!err) { - f2fs_lock_op(sbi); + struct f2fs_lock_context lc; + + f2fs_lock_op(sbi, &lc); err = f2fs_remove_inode_page(inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (err == -ENOENT) { err = 0; @@ -1004,7 +1010,7 @@ out_clear: } /* caller should call f2fs_lock_op() */ -void f2fs_handle_failed_inode(struct inode *inode) +void f2fs_handle_failed_inode(struct inode *inode, struct f2fs_lock_context *lc) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; @@ -1053,7 +1059,7 @@ void f2fs_handle_failed_inode(struct inode *inode) } out: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, lc); /* iput will drop the inode object */ iput(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 043d20516a21..e360f08a9586 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ 
-354,6 +354,7 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; nid_t ino = 0; int err; @@ -376,11 +377,11 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, ino); @@ -392,7 +393,7 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, f2fs_balance_fs(sbi, true); return 0; out: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return err; } @@ -401,6 +402,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, { struct inode *inode = d_inode(old_dentry); struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; int err; if (unlikely(f2fs_cp_error(sbi))) @@ -427,11 +429,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_inode_flag(inode, FI_INC_LINK); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); d_instantiate(dentry, inode); @@ -441,7 +443,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, out: clear_inode_flag(inode, FI_INC_LINK); iput(inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); return err; } @@ -545,6 +547,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; + struct f2fs_lock_context lc; struct folio *folio; int err; @@ -581,15 +584,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, 
&lc); err = f2fs_acquire_orphan_inode(sbi); if (err) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_folio_put(folio, false); goto out; } f2fs_delete_entry(de, folio, dir, inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -632,6 +635,7 @@ static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; size_t len = strlen(symname); struct fscrypt_str disk_link; @@ -662,11 +666,11 @@ static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out_f2fs_handle_failed_inode; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, inode->i_ino); err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); @@ -701,7 +705,7 @@ err_out: goto out_free_encrypted_link; out_f2fs_handle_failed_inode: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); @@ -712,6 +716,7 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; int err; @@ -732,11 +737,11 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); set_inode_flag(inode, FI_INC_LINK); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out_fail; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, inode->i_ino); @@ -750,7 +755,7 @@ static struct 
dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, out_fail: clear_inode_flag(inode, FI_INC_LINK); - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return ERR_PTR(err); } @@ -767,6 +772,7 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; int err = 0; @@ -786,11 +792,11 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(dentry, inode); if (err) goto out; - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_alloc_nid_done(sbi, inode->i_ino); @@ -802,7 +808,7 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, f2fs_balance_fs(sbi, true); return 0; out: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return err; } @@ -811,6 +817,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode, struct f2fs_filename *fname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_lock_context lc; struct inode *inode; int err; @@ -831,7 +838,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_acquire_orphan_inode(sbi); if (err) goto out; @@ -860,7 +867,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. 
*/ - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); unlock_new_inode(inode); if (new_inode) @@ -872,7 +879,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, release_out: f2fs_release_orphan_inode(sbi); out: - f2fs_handle_failed_inode(inode); + f2fs_handle_failed_inode(inode, &lc); return err; } @@ -920,6 +927,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + struct f2fs_lock_context lc; bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; @@ -1008,7 +1016,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_acquire_orphan_inode(sbi); if (err) @@ -1031,11 +1039,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } else { f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_add_link(new_dentry, old_inode); if (err) { - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); goto out_dir; } @@ -1084,7 +1092,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, TRANS_DIR_INO); } - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1093,7 +1101,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, return 0; put_out_dir: - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_folio_put(new_folio, false); out_dir: if (old_dir_entry) @@ -1115,6 +1123,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct folio *old_folio, *new_folio; struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; struct f2fs_dir_entry *old_entry, *new_entry; + struct f2fs_lock_context lc; int old_nlink = 0, new_nlink = 0; int err; @@ -1194,7 +1203,7 @@ static int f2fs_cross_rename(struct inode *old_dir, 
struct dentry *old_dentry, f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); /* update ".." directory entry info of old dentry */ if (old_dir_entry) @@ -1247,7 +1256,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); } - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 482a362f2625..74992fd9c9b6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -606,7 +606,7 @@ retry: goto retry; } - i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); + i = f2fs_lookup_journal_in_cursum(sbi, journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); @@ -643,6 +643,17 @@ sanity_check: return -EFSCORRUPTED; } + if (unlikely(f2fs_quota_file(sbi, ni->nid) && + !__is_valid_data_blkaddr(ni->blk_addr))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "f2fs_get_node_info of %pS: inconsistent nat entry from qf_ino, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + __builtin_return_address(0), + ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + } + /* cache nat entry */ if (need_cache) cache_nat_entry(sbi, nid, &ne); @@ -1500,24 +1511,33 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_folio_put(afolio, err ? 
true : false); } -static int sanity_check_node_footer(struct f2fs_sb_info *sbi, +int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, - enum node_type ntype) + enum node_type ntype, bool in_irq) { + bool is_inode, is_xnode; + if (unlikely(nid != nid_of_node(folio))) goto out_err; + is_inode = IS_INODE(folio); + is_xnode = f2fs_has_xattr_block(ofs_of_node(folio)); + switch (ntype) { + case NODE_TYPE_REGULAR: + if (is_inode && is_xnode) + goto out_err; + break; case NODE_TYPE_INODE: - if (!IS_INODE(folio)) + if (!is_inode || is_xnode) goto out_err; break; case NODE_TYPE_XATTR: - if (!f2fs_has_xattr_block(ofs_of_node(folio))) + if (is_inode || !is_xnode) goto out_err; break; case NODE_TYPE_NON_INODE: - if (IS_INODE(folio)) + if (is_inode) goto out_err; break; default: @@ -1527,12 +1547,13 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, goto out_err; return 0; out_err: - f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " - "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(folio), ino_of_node(folio), - ofs_of_node(folio), cpver_of_node(folio), - next_blkaddr_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn_ratelimited(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), + next_blkaddr_of_node(folio)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); return -EFSCORRUPTED; } @@ -1578,7 +1599,7 @@ repeat: goto out_err; } page_hit: - err = sanity_check_node_footer(sbi, folio, nid, ntype); + err = f2fs_sanity_check_node_footer(sbi, folio, nid, ntype, false); if (!err) return folio; out_err: @@ -1727,6 +1748,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted .io_type = io_type, .io_wbc = wbc, }; + struct f2fs_lock_context lc; unsigned int seq; 
trace_f2fs_writepage(folio, NODE); @@ -1751,18 +1773,23 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted /* get old block addr of this node page */ nid = nid_of_node(folio); - f2fs_bug_on(sbi, folio->index != nid); + + if (f2fs_sanity_check_node_footer(sbi, folio, nid, + NODE_TYPE_REGULAR, false)) { + f2fs_handle_critical_error(sbi, STOP_CP_REASON_CORRUPTED_NID); + goto redirty_out; + } if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; - f2fs_down_read(&sbi->node_write); + f2fs_down_read_trace(&sbi->node_write, &lc); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { folio_clear_uptodate(folio); dec_page_count(sbi, F2FS_DIRTY_NODES); - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); folio_unlock(folio); return true; } @@ -1770,12 +1797,17 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted if (__is_valid_data_blkaddr(ni.blk_addr) && !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) - fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + if (atomic) { + if (!test_opt(sbi, NOBARRIER)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + if (IS_INODE(folio)) + set_dentry_mark(folio, + f2fs_need_dentry_mark(sbi, ino_of_node(folio))); + } /* should add to global list before clearing PAGECACHE status */ if (f2fs_in_warm_node_list(sbi, folio)) { @@ -1790,7 +1822,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio)); dec_page_count(sbi, F2FS_DIRTY_NODES); - f2fs_up_read(&sbi->node_write); + f2fs_up_read_trace(&sbi->node_write, &lc); folio_unlock(folio); @@ -1916,8 +1948,9 @@ continue_unlock: if (is_inode_flag_set(inode, FI_DIRTY_INODE)) f2fs_update_inode(inode, 
folio); - set_dentry_mark(folio, - f2fs_need_dentry_mark(sbi, ino)); + if (!atomic) + set_dentry_mark(folio, + f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ if (!folio_test_dirty(folio)) @@ -2937,7 +2970,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, /* scan the node segment */ last_offset = BLKS_PER_SEG(sbi); addr = START_BLOCK(sbi, segno); - sum_entry = &sum->entries[0]; + sum_entry = sum_entries(sum); for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { nrpages = bio_max_segs(last_offset - i); @@ -3078,7 +3111,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #2, flush nat entries to nat page. */ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + !__has_cursum_space(sbi, journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -3101,7 +3134,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { - offset = f2fs_lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(sbi, journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); raw_ne = &nat_in_journal(journal, offset); @@ -3146,7 +3179,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_journal *journal = curseg->journal; struct nat_entry_set *setvec[NAT_VEC_SIZE]; struct nat_entry_set *set, *tmp; - unsigned int found; + unsigned int found, entry_count = 0; nid_t set_idx = 0; LIST_HEAD(sets); int err = 0; @@ -3172,7 +3205,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * into nat entry set. 
*/ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, + !__has_cursum_space(sbi, journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3183,9 +3216,21 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, - MAX_NAT_JENTRIES(journal)); + MAX_NAT_JENTRIES(sbi, journal)); } + /* + * Readahead the current NAT block to prevent read requests from + * being issued and waited on one by one. + */ + list_for_each_entry(set, &sets, set_list) { + entry_count += set->entry_cnt; + if (!enabled_nat_bits(sbi, cpc) && + __has_cursum_space(sbi, journal, + entry_count, NAT_JOURNAL)) + continue; + f2fs_ra_meta_pages(sbi, set->set, 1, META_NAT, true); + } /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) { err = __flush_nat_entry_set(sbi, set, cpc); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 9cb8dcf8d417..824ac9f0e6e4 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -52,14 +52,6 @@ enum { IS_PREALLOC, /* nat entry is preallocated */ }; -/* For node type in __get_node_folio() */ -enum node_type { - NODE_TYPE_REGULAR, - NODE_TYPE_INODE, - NODE_TYPE_XATTR, - NODE_TYPE_NON_INODE, -}; - /* * For node information */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index c3415ebb9f50..a26071f2b0bc 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -514,7 +514,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { - sum = curseg->sum_blk->entries[blkoff]; + sum = sum_entries(curseg->sum_blk)[blkoff]; goto got_it; } } @@ -522,8 +522,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, sum_folio = f2fs_get_sum_folio(sbi, segno); if (IS_ERR(sum_folio)) return PTR_ERR(sum_folio); - sum_node = SUM_BLK_PAGE_ADDR(sum_folio, segno); - sum = 
sum_node->entries[blkoff]; + sum_node = SUM_BLK_PAGE_ADDR(sbi, sum_folio, segno); + sum = sum_entries(sum_node)[blkoff]; f2fs_folio_put(sum_folio, true); got_it: /* Use the locked dnode page and inode */ @@ -875,6 +875,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) LIST_HEAD(inode_list); LIST_HEAD(tmp_inode_list); LIST_HEAD(dir_list); + struct f2fs_lock_context lc; int err; int ret = 0; unsigned long s_flags = sbi->sb->s_flags; @@ -888,7 +889,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) f2fs_info(sbi, "recover fsync data on readonly fs"); /* prevent checkpoint */ - f2fs_down_write(&sbi->cp_global_sem); + f2fs_down_write_trace(&sbi->cp_global_sem, &lc); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only, &new_inode); @@ -932,7 +933,7 @@ skip: if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); - f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write_trace(&sbi->cp_global_sem, &lc); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c26424f47686..6a97fe76712b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -371,8 +371,8 @@ next: } out: - if (time_to_inject(sbi, FAULT_TIMEOUT)) - f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); + if (time_to_inject(sbi, FAULT_ATOMIC_TIMEOUT)) + f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true); if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; @@ -400,6 +400,7 @@ int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_lock_context lc; int err; err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -407,11 +408,11 @@ int f2fs_commit_atomic_write(struct inode *inode) return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = 
__f2fs_commit_atomic_write(inode); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; @@ -461,7 +462,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) .should_migrate_blocks = false, .err_gc_skipped = false, .nr_free_secs = 1 }; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } @@ -1286,7 +1287,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, &(dcc->fstrim_list) : &(dcc->wait_list); blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; - int err = 0; if (dc->state != D_PREP) return 0; @@ -1327,7 +1327,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->di.len = 0; - while (total_len && *issued < dpolicy->max_requests && !err) { + while (total_len && *issued < dpolicy->max_requests) { struct bio *bio = NULL; unsigned long flags; bool last = true; @@ -1343,17 +1343,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->di.len += len; - err = 0; - if (time_to_inject(sbi, FAULT_DISCARD)) { - err = -EIO; - spin_lock_irqsave(&dc->lock, flags); - if (dc->state == D_PARTIAL) - dc->state = D_SUBMIT; - spin_unlock_irqrestore(&dc->lock, flags); - - break; - } - __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); f2fs_bug_on(sbi, !bio); @@ -1392,11 +1381,11 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, len = total_len; } - if (!err && len) { + if (len) { dcc->undiscard_blks -= len; __update_discard_tree_range(sbi, bdev, lstart, start, len); } - return err; + return 0; } static void __insert_discard_cmd(struct f2fs_sb_info *sbi, @@ -2685,12 +2674,12 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } - sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (sbi->blocksize - 2 * 
sbi->sum_journal_size - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (sbi->blocksize - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -2710,7 +2699,7 @@ void f2fs_update_meta_page(struct f2fs_sb_info *sbi, { struct folio *folio; - if (SUMS_PER_BLOCK == 1) + if (!f2fs_sb_has_packed_ssa(sbi)) folio = f2fs_grab_meta_folio(sbi, blk_addr); else folio = f2fs_get_meta_folio_retry(sbi, blk_addr); @@ -2728,7 +2717,7 @@ static void write_sum_page(struct f2fs_sb_info *sbi, { struct folio *folio; - if (SUMS_PER_BLOCK == 1) + if (!f2fs_sb_has_packed_ssa(sbi)) return f2fs_update_meta_page(sbi, (void *)sum_blk, GET_SUM_BLOCK(sbi, segno)); @@ -2736,7 +2725,8 @@ static void write_sum_page(struct f2fs_sb_info *sbi, if (IS_ERR(folio)) return; - memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk)); + memcpy(SUM_BLK_PAGE_ADDR(sbi, folio, segno), sum_blk, + sbi->sum_blocksize); folio_mark_dirty(folio); f2fs_folio_put(folio, true); } @@ -2755,11 +2745,11 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, mutex_lock(&curseg->curseg_mutex); down_read(&curseg->journal_rwsem); - memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + memcpy(sum_journal(sbi, dst), curseg->journal, sbi->sum_journal_size); up_read(&curseg->journal_rwsem); - memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); - memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + memcpy(sum_entries(dst), sum_entries(src), sbi->sum_entry_size); + memcpy(sum_footer(sbi, dst), sum_footer(sbi, src), SUM_FOOTER_SIZE); mutex_unlock(&curseg->curseg_mutex); @@ -2932,7 +2922,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; - sum_footer = &(curseg->sum_blk->footer); + sum_footer = sum_footer(sbi, curseg->sum_blk); memset(sum_footer, 0, sizeof(struct summary_footer)); 
sanity_check_seg_type(sbi, seg_type); @@ -3078,11 +3068,11 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) sum_folio = f2fs_get_sum_folio(sbi, new_segno); if (IS_ERR(sum_folio)) { /* GC won't be able to use stale summary pages by cp_error */ - memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + memset(curseg->sum_blk, 0, sbi->sum_entry_size); return PTR_ERR(sum_folio); } - sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + sum_node = SUM_BLK_PAGE_ADDR(sbi, sum_folio, new_segno); + memcpy(curseg->sum_blk, sum_node, sbi->sum_entry_size); f2fs_folio_put(sum_folio, true); return 0; } @@ -3362,19 +3352,20 @@ int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) { + struct f2fs_lock_context lc; int err; bool gc_required = true; retry: - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1, true, ZONED_PIN_SEC_REQUIRED_COUNT); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); gc_required = false; if (!err) @@ -3494,6 +3485,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; + struct f2fs_lock_context lc; unsigned long long trimmed = 0; int err = 0; bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); @@ -3526,10 +3518,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = 
f2fs_write_checkpoint(sbi, &cpc); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); if (err) goto out; @@ -3814,7 +3806,7 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, f2fs_wait_discard_bio(sbi, *new_blkaddr); - curseg->sum_blk->entries[curseg->next_blkoff] = *sum; + sum_entries(curseg->sum_blk)[curseg->next_blkoff] = *sum; if (curseg->alloc_type == SSR) { curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); } else { @@ -4183,7 +4175,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); - curseg->sum_blk->entries[curseg->next_blkoff] = *sum; + sum_entries(curseg->sum_blk)[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) @@ -4240,7 +4232,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type); + f2fs_submit_merged_write_folio(sbi, folio, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { @@ -4303,12 +4295,12 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, sbi->sum_journal_size); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); - offset = 2 * SUM_JOURNAL_SIZE; + memcpy(seg_i->journal, kaddr + sbi->sum_journal_size, sbi->sum_journal_size); + offset = 2 * sbi->sum_journal_size; /* Step 3: restore summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { @@ -4330,9 +4322,9 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) struct f2fs_summary *s; s = (struct f2fs_summary *)(kaddr + offset); 
- seg_i->sum_blk->entries[j] = *s; + sum_entries(seg_i->sum_blk)[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_SIZE - + if (offset + SUMMARY_SIZE <= sbi->blocksize - SUM_FOOTER_SIZE) continue; @@ -4388,7 +4380,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) if (IS_NODESEG(type)) { if (__exist_node_summaries(sbi)) { - struct f2fs_summary *ns = &sum->entries[0]; + struct f2fs_summary *ns = sum_entries(sum); int i; for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { @@ -4408,11 +4400,13 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) /* update journal info */ down_write(&curseg->journal_rwsem); - memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + memcpy(curseg->journal, sum_journal(sbi, sum), sbi->sum_journal_size); up_write(&curseg->journal_rwsem); - memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); - memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); + memcpy(sum_entries(curseg->sum_blk), sum_entries(sum), + sbi->sum_entry_size); + memcpy(sum_footer(sbi, curseg->sum_blk), sum_footer(sbi, sum), + SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; @@ -4456,8 +4450,8 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } /* sanity check for summary blocks */ - if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || - sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { + if (nats_in_cursum(nat_j) > sbi->nat_journal_entries || + sits_in_cursum(sit_j) > sbi->sit_journal_entries) { f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; @@ -4481,13 +4475,13 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); - written_size += SUM_JOURNAL_SIZE; + memcpy(kaddr, seg_i->journal, 
sbi->sum_journal_size); + written_size += sbi->sum_journal_size; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); - written_size += SUM_JOURNAL_SIZE; + memcpy(kaddr + written_size, seg_i->journal, sbi->sum_journal_size); + written_size += sbi->sum_journal_size; /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { @@ -4500,10 +4494,10 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); - *summary = seg_i->sum_blk->entries[j]; + *summary = sum_entries(seg_i->sum_blk)[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_SIZE - + if (written_size + SUMMARY_SIZE <= sbi->blocksize - SUM_FOOTER_SIZE) continue; @@ -4545,8 +4539,9 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, - unsigned int val, int alloc) +int f2fs_lookup_journal_in_cursum(struct f2fs_sb_info *sbi, + struct f2fs_journal *journal, int type, + unsigned int val, int alloc) { int i; @@ -4555,13 +4550,13 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + if (alloc && __has_cursum_space(sbi, journal, 1, NAT_JOURNAL)) return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(journal); i++) if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + if (alloc && __has_cursum_space(sbi, journal, 1, SIT_JOURNAL)) return update_sits_in_cursum(journal, 1); } return -1; @@ -4709,8 +4704,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info 
*sbi, struct cp_control *cpc) * entries, remove all entries from journal and add and account * them in sit entry set. */ - if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || - !to_journal) + if (!__has_cursum_space(sbi, journal, + sit_i->dirty_sentries, SIT_JOURNAL) || !to_journal) remove_sits_in_journal(sbi); /* @@ -4727,7 +4722,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int segno = start_segno; if (to_journal && - !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) + !__has_cursum_space(sbi, journal, ses->entry_cnt, + SIT_JOURNAL)) to_journal = false; if (to_journal) { @@ -4755,7 +4751,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = f2fs_lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(sbi, journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = @@ -4962,12 +4958,13 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); + array[i].sum_blk = f2fs_kzalloc(sbi, sbi->sum_blocksize, + GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); array[i].journal = f2fs_kzalloc(sbi, - sizeof(struct f2fs_journal), GFP_KERNEL); + sbi->sum_journal_size, GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].seg_type = log_type_to_seg_type(i); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 07dcbcbeb7c6..068845660b0f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -90,12 +90,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) -#define SUMS_PER_BLOCK (F2FS_BLKSIZE / F2FS_SUM_BLKSIZE) #define GET_SUM_BLOCK(sbi, segno) \ - (SM_I(sbi)->ssa_blkaddr + (segno / 
SUMS_PER_BLOCK)) -#define GET_SUM_BLKOFF(segno) (segno % SUMS_PER_BLOCK) -#define SUM_BLK_PAGE_ADDR(folio, segno) \ - (folio_address(folio) + GET_SUM_BLKOFF(segno) * F2FS_SUM_BLKSIZE) + (SM_I(sbi)->ssa_blkaddr + (segno / (sbi)->sums_per_block)) +#define GET_SUM_BLKOFF(sbi, segno) (segno % (sbi)->sums_per_block) +#define SUM_BLK_PAGE_ADDR(sbi, folio, segno) \ + (folio_address(folio) + GET_SUM_BLKOFF(sbi, segno) * (sbi)->sum_blocksize) #define GET_SUM_TYPE(footer) ((footer)->entry_type) #define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) @@ -621,97 +620,90 @@ static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, - unsigned int node_blocks, unsigned int data_blocks, - unsigned int dent_blocks) +static inline void get_additional_blocks_required(struct f2fs_sb_info *sbi, + unsigned int *total_node_blocks, unsigned int *total_data_blocks, + unsigned int *total_dent_blocks, bool separate_dent) { - unsigned int segno, left_blocks, blocks; + unsigned int segno, left_blocks; int i; + unsigned int min_free_node_blocks = CAP_BLKS_PER_SEC(sbi); + unsigned int min_free_dent_blocks = CAP_BLKS_PER_SEC(sbi); + unsigned int min_free_data_blocks = CAP_BLKS_PER_SEC(sbi); /* check current data/node sections in the worst case. */ for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; if (unlikely(segno == NULL_SEGNO)) - return false; + return; left_blocks = get_left_section_blocks(sbi, i, segno); - blocks = i <= CURSEG_COLD_DATA ? 
data_blocks : node_blocks; - if (blocks > left_blocks) - return false; + if (i > CURSEG_COLD_DATA) + min_free_node_blocks = min(min_free_node_blocks, left_blocks); + else if (i == CURSEG_HOT_DATA && separate_dent) + min_free_dent_blocks = left_blocks; + else + min_free_data_blocks = min(min_free_data_blocks, left_blocks); } - /* check current data section for dentry blocks. */ - segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; - - if (unlikely(segno == NULL_SEGNO)) - return false; - - left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno); - - if (dent_blocks > left_blocks) - return false; - return true; + *total_node_blocks = (*total_node_blocks > min_free_node_blocks) ? + *total_node_blocks - min_free_node_blocks : 0; + *total_dent_blocks = (*total_dent_blocks > min_free_dent_blocks) ? + *total_dent_blocks - min_free_dent_blocks : 0; + *total_data_blocks = (*total_data_blocks > min_free_data_blocks) ? + *total_data_blocks - min_free_data_blocks : 0; } /* - * calculate needed sections for dirty node/dentry and call - * has_curseg_enough_space, please note that, it needs to account - * dirty data as well in lfs mode when checkpoint is disabled. + * call get_additional_blocks_required to calculate dirty blocks + * needing to be placed in free sections, please note that, it + * needs to account dirty data as well in lfs mode when checkpoint + * is disabled. 
*/ -static inline void __get_secs_required(struct f2fs_sb_info *sbi, - unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) +static inline int __get_secs_required(struct f2fs_sb_info *sbi) { unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int total_data_blocks = 0; - unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); - unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); - unsigned int data_secs = 0; - unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); - unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); - unsigned int data_blocks = 0; + bool separate_dent = true; - if (f2fs_lfs_mode(sbi)) { + if (f2fs_lfs_mode(sbi)) total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); - data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); - data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); + + /* + * When active_logs != 4, dentry blocks and data blocks can be + * mixed in the same logs, so check their space together. + */ + if (F2FS_OPTION(sbi).active_logs != 4) { + total_data_blocks += total_dent_blocks; + total_dent_blocks = 0; + separate_dent = false; } - if (lower_p) - *lower_p = node_secs + dent_secs + data_secs; - if (upper_p) - *upper_p = node_secs + dent_secs + data_secs + - (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + - (data_blocks ? 
1 : 0); - if (curseg_p) - *curseg_p = has_curseg_enough_space(sbi, - node_blocks, data_blocks, dent_blocks); + get_additional_blocks_required(sbi, &total_node_blocks, &total_data_blocks, + &total_dent_blocks, separate_dent); + + return DIV_ROUND_UP(total_node_blocks, CAP_BLKS_PER_SEC(sbi)) + + DIV_ROUND_UP(total_dent_blocks, CAP_BLKS_PER_SEC(sbi)) + + DIV_ROUND_UP(total_data_blocks, CAP_BLKS_PER_SEC(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - unsigned int free_secs, lower_secs, upper_secs; - bool curseg_space; + unsigned int free_secs, required_secs; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); - free_secs = free_sections(sbi) + freed; - lower_secs += needed + reserved_sections(sbi); - upper_secs += needed + reserved_sections(sbi); + required_secs = needed + reserved_sections(sbi) + + __get_secs_required(sbi); - if (free_secs > upper_secs) - return false; - if (free_secs <= lower_secs) - return true; - return !curseg_space; + return free_secs < required_secs; } static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cd00d030edda..7c8e6eea60df 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -67,8 +67,10 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", [FAULT_NO_SEGMENT] = "no free segment", [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", - [FAULT_TIMEOUT] = "timeout", + [FAULT_ATOMIC_TIMEOUT] = "atomic timeout", [FAULT_VMALLOC] = "vmalloc", + [FAULT_LOCK_TIMEOUT] = "lock timeout", + [FAULT_SKIP_WRITE] = "skip write", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, @@ -96,8 +98,57 @@ int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, f2fs_info(sbi, "build fault injection type: 0x%lx", type); } + if (fo & FAULT_TIMEOUT) { + if (type >=
TIMEOUT_TYPE_MAX) + return -EINVAL; + ffi->inject_lock_timeout = (unsigned int)type; + f2fs_info(sbi, "build fault timeout injection type: 0x%lx", type); + } + return 0; } + +static void inject_timeout(struct f2fs_sb_info *sbi) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + enum f2fs_timeout_type type = ffi->inject_lock_timeout; + unsigned long start_time = jiffies; + unsigned long timeout = HZ; + + switch (type) { + case TIMEOUT_TYPE_RUNNING: + while (!time_after(jiffies, start_time + timeout)) { + if (fatal_signal_pending(current)) + return; + ; + } + break; + case TIMEOUT_TYPE_IO_SLEEP: + f2fs_schedule_timeout_killable(timeout, true); + break; + case TIMEOUT_TYPE_NONIO_SLEEP: + f2fs_schedule_timeout_killable(timeout, false); + break; + case TIMEOUT_TYPE_RUNNABLE: + while (!time_after(jiffies, start_time + timeout)) { + if (fatal_signal_pending(current)) + return; + schedule(); + } + break; + default: + return; + } +} + +void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi) +{ + struct f2fs_lock_context lc; + + f2fs_lock_op(sbi, &lc); + inject_timeout(sbi); + f2fs_unlock_op(sbi, &lc); +} #endif /* f2fs-wide shrinker description */ @@ -2556,6 +2607,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; + struct f2fs_lock_context lc; unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; @@ -2585,7 +2637,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .no_bg_gc = true, .nr_free_secs = 1 }; - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { @@ -2609,7 +2661,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } skip_gc: - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); stat_inc_cp_call_count(sbi, TOTAL_CALL); @@ 
-2622,7 +2674,7 @@ skip_gc: spin_unlock(&sbi->stat_lock); out_unlock: - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); restore_flag: sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ @@ -2632,57 +2684,66 @@ restore_flag: static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; - long long start, writeback, lock, sync_inode, end; + int retry = MAX_FLUSH_RETRY_COUNT; + long long start, writeback, end; int ret; + struct f2fs_lock_context lc; + long long skipped_write, dirty_data; - f2fs_info(sbi, "%s start, meta: %lld, node: %lld, data: %lld", - __func__, + f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", get_pages(sbi, F2FS_DIRTY_META), get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DATA)); - f2fs_update_time(sbi, ENABLE_TIME); - start = ktime_get(); + set_sbi_flag(sbi, SBI_ENABLE_CHECKPOINT); + /* we should flush all the data to keep data consistency */ - while (get_pages(sbi, F2FS_DIRTY_DATA)) { - writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); + do { + skipped_write = get_pages(sbi, F2FS_SKIPPED_WRITE); + dirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + + sync_inodes_sb(sbi->sb); f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); - if (f2fs_time_over(sbi, ENABLE_TIME)) + f2fs_info(sbi, "sync_inode_sb done, dirty_data: %lld, %lld, " + "skipped write: %lld, %lld, retry: %d", + get_pages(sbi, F2FS_DIRTY_DATA), + dirty_data, + get_pages(sbi, F2FS_SKIPPED_WRITE), + skipped_write, retry); + + /* + * sync_inodes_sb() has retry logic, so let's check dirty_data + * in prior to skipped_write in case there is no dirty data. 
+ */ + if (!get_pages(sbi, F2FS_DIRTY_DATA)) break; - } + if (get_pages(sbi, F2FS_SKIPPED_WRITE) == skipped_write) + break; + } while (retry--); + + clear_sbi_flag(sbi, SBI_ENABLE_CHECKPOINT); + writeback = ktime_get(); - f2fs_down_write(&sbi->cp_enable_rwsem); + if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA) || + get_pages(sbi, F2FS_SKIPPED_WRITE))) + f2fs_warn(sbi, "checkpoint=enable unwritten data: %lld, skipped data: %lld, retry: %d", + get_pages(sbi, F2FS_DIRTY_DATA), + get_pages(sbi, F2FS_SKIPPED_WRITE), retry); - lock = ktime_get(); + if (get_pages(sbi, F2FS_SKIPPED_WRITE)) + atomic_set(&sbi->nr_pages[F2FS_SKIPPED_WRITE], 0); - if (get_pages(sbi, F2FS_DIRTY_DATA)) - sync_inodes_sb(sbi->sb); - - if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) - f2fs_warn(sbi, "%s: has some unwritten data: %lld", - __func__, get_pages(sbi, F2FS_DIRTY_DATA)); - - sync_inode = ktime_get(); - - f2fs_down_write(&sbi->gc_lock); + f2fs_down_write_trace(&sbi->gc_lock, &lc); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - f2fs_up_write(&sbi->gc_lock); + f2fs_up_write_trace(&sbi->gc_lock, &lc); - f2fs_info(sbi, "%s sync_fs, meta: %lld, imeta: %lld, node: %lld, dents: %lld, qdata: %lld", - __func__, - get_pages(sbi, F2FS_DIRTY_META), - get_pages(sbi, F2FS_DIRTY_IMETA), - get_pages(sbi, F2FS_DIRTY_NODES), - get_pages(sbi, F2FS_DIRTY_DENTS), - get_pages(sbi, F2FS_DIRTY_QDATA)); ret = f2fs_sync_fs(sbi->sb, 1); if (ret) f2fs_err(sbi, "%s sync_fs failed, ret: %d", __func__, ret); @@ -2690,17 +2751,11 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); - f2fs_up_write(&sbi->cp_enable_rwsem); - end = ktime_get(); - f2fs_info(sbi, "%s end, writeback:%llu, " - "lock:%llu, sync_inode:%llu, sync_fs:%llu", - __func__, - ktime_ms_delta(writeback, start), - ktime_ms_delta(lock, writeback), - ktime_ms_delta(sync_inode, lock), - ktime_ms_delta(end, 
sync_inode)); + f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", + ktime_ms_delta(writeback, start), + ktime_ms_delta(end, writeback)); return ret; } @@ -3219,19 +3274,12 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) } static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, - unsigned int flags) + unsigned int flags, unsigned long qf_inum) { struct inode *qf_inode; - unsigned long qf_inum; unsigned long qf_flag = F2FS_QUOTA_DEFAULT_FL; int err; - BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); - - qf_inum = f2fs_qf_ino(sb, type); - if (!qf_inum) - return -EPERM; - qf_inode = f2fs_iget(sb, qf_inum); if (IS_ERR(qf_inode)) { f2fs_err(F2FS_SB(sb), "Bad quota inode %u:%lu", type, qf_inum); @@ -3264,7 +3312,7 @@ static int f2fs_enable_quotas(struct super_block *sb) test_opt(sbi, PRJQUOTA), }; - if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) { f2fs_err(sbi, "quota file may be corrupted, skip loading it"); return 0; } @@ -3276,14 +3324,13 @@ static int f2fs_enable_quotas(struct super_block *sb) if (qf_inum) { err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | - (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0), qf_inum); if (err) { f2fs_err(sbi, "Failed to enable quota tracking (type=%d, err=%d). Please run fsck to fix.", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); - set_sbi_flag(F2FS_SB(sb), - SBI_QUOTA_NEED_REPAIR); + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); return err; } } @@ -3330,6 +3377,7 @@ int f2fs_do_quota_sync(struct super_block *sb, int type) * that userspace sees the changes. 
*/ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + struct f2fs_lock_context lc; if (type != -1 && cnt != type) continue; @@ -3349,13 +3397,13 @@ int f2fs_do_quota_sync(struct super_block *sb, int type) * block_operation * f2fs_down_read(quota_sem) */ - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); f2fs_down_read(&sbi->quota_sem); ret = f2fs_quota_sync_file(sbi, cnt); f2fs_up_read(&sbi->quota_sem); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); if (!f2fs_sb_has_quota_ino(sbi)) inode_unlock(dqopt->files[cnt]); @@ -4077,20 +4125,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, if (sanity_check_area_boundary(sbi, folio, index)) return -EFSCORRUPTED; - /* - * Check for legacy summary layout on 16KB+ block devices. - * Modern f2fs-tools packs multiple 4KB summary areas into one block, - * whereas legacy versions used one block per summary, leading - * to a much larger SSA. - */ - if (SUMS_PER_BLOCK > 1 && - !(__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_PACKED_SSA))) { - f2fs_info(sbi, "Error: Device formatted with a legacy version. " - "Please reformat with a tool supporting the packed ssa " - "feature for block sizes larger than 4kb."); - return -EOPNOTSUPP; - } - return 0; } @@ -4300,6 +4334,22 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); + sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME; + sbi->adjust_lock_priority = 0; + sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY; + sbi->critical_task_priority = F2FS_CRITICAL_TASK_PRIORITY; + + sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ? 
+ 4096 : sbi->blocksize; + sbi->sums_per_block = sbi->blocksize / sbi->sum_blocksize; + sbi->entries_in_sum = sbi->sum_blocksize / 8; + sbi->sum_entry_size = SUMMARY_SIZE * sbi->entries_in_sum; + sbi->sum_journal_size = sbi->sum_blocksize - SUM_FOOTER_SIZE - + sbi->sum_entry_size; + sbi->nat_journal_entries = (sbi->sum_journal_size - 2) / + sizeof(struct nat_journal_entry); + sbi->sit_journal_entries = (sbi->sum_journal_size - 2) / + sizeof(struct sit_journal_entry); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -4307,7 +4357,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; - sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL; sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); @@ -4896,14 +4945,13 @@ try_onemore: sbi->sb = sb; /* initialize locks within allocated memory */ - init_f2fs_rwsem(&sbi->gc_lock); + init_f2fs_rwsem_trace(&sbi->gc_lock, sbi, LOCK_NAME_GC_LOCK); mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); + init_f2fs_rwsem_trace(&sbi->cp_global_sem, sbi, LOCK_NAME_CP_GLOBAL); + init_f2fs_rwsem_trace(&sbi->node_write, sbi, LOCK_NAME_NODE_WRITE); + init_f2fs_rwsem_trace(&sbi->node_change, sbi, LOCK_NAME_NODE_CHANGE); spin_lock_init(&sbi->stat_lock); - init_f2fs_rwsem(&sbi->cp_rwsem); - init_f2fs_rwsem(&sbi->cp_enable_rwsem); + init_f2fs_rwsem_trace(&sbi->cp_rwsem, sbi, LOCK_NAME_CP_RWSEM); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); spin_lock_init(&sbi->error_lock); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c42f4f979d13..5fbfdc96e502 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -35,6 +35,7 @@ enum { #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* 
struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ + FAULT_INFO_TIMEOUT, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ CPRC_INFO, /* struct ckpt_req_control */ @@ -58,6 +59,7 @@ struct f2fs_attr { const char *buf, size_t len); int struct_type; int offset; + int size; int id; }; @@ -84,7 +86,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi; #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) + struct_type == FAULT_INFO_TYPE || + struct_type == FAULT_INFO_TIMEOUT) return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif #ifdef CONFIG_F2FS_STAT_FS @@ -344,11 +347,30 @@ static ssize_t main_blkaddr_show(struct f2fs_attr *a, (unsigned long long)MAIN_BLKADDR(sbi)); } +static ssize_t __sbi_show_value(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf, + unsigned char *value) +{ + switch (a->size) { + case 1: + return sysfs_emit(buf, "%u\n", *(u8 *)value); + case 2: + return sysfs_emit(buf, "%u\n", *(u16 *)value); + case 4: + return sysfs_emit(buf, "%u\n", *(u32 *)value); + case 8: + return sysfs_emit(buf, "%llu\n", *(u64 *)value); + default: + f2fs_bug_on(sbi, 1); + return sysfs_emit(buf, + "show sysfs node value with wrong type\n"); + } +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { unsigned char *ptr = NULL; - unsigned int *ui; ptr = __struct_ptr(sbi, a->struct_type); if (!ptr) @@ -428,9 +450,30 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, atomic_read(&sbi->cp_call_count[BACKGROUND])); #endif - ui = (unsigned int *)(ptr + a->offset); + return __sbi_show_value(a, sbi, buf, ptr + a->offset); +} - return sysfs_emit(buf, "%u\n", *ui); +static void __sbi_store_value(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + unsigned char *ui, unsigned long value) +{ + switch (a->size) { + case 1: + *(u8 *)ui = value; + break; + case 2: + *(u16 
*)ui = value; + break; + case 4: + *(u32 *)ui = value; + break; + case 8: + *(u64 *)ui = value; + break; + default: + f2fs_bug_on(sbi, 1); + f2fs_err(sbi, "store sysfs node value with wrong type"); + } } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -529,6 +572,12 @@ out: return -EINVAL; return count; } + if (a->struct_type == FAULT_INFO_TIMEOUT) { + if (f2fs_build_fault_attr(sbi, 0, t, FAULT_TIMEOUT)) + return -EINVAL; + f2fs_simulate_lock_timeout(sbi); + return count; + } #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); @@ -749,7 +798,7 @@ out: return count; } - if (!strcmp(a->attr.name, "gc_pin_file_threshold")) { + if (!strcmp(a->attr.name, "gc_pin_file_thresh")) { if (t > MAX_GC_FAILED_PINNED_FILES) return -EINVAL; sbi->gc_pin_file_threshold = t; @@ -906,7 +955,36 @@ out: return count; } - *ui = (unsigned int)t; + if (!strcmp(a->attr.name, "adjust_lock_priority")) { + if (t >= BIT(LOCK_NAME_MAX - 1)) + return -EINVAL; + sbi->adjust_lock_priority = t; + return count; + } + + if (!strcmp(a->attr.name, "lock_duration_priority")) { + if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE)) + return -EINVAL; + sbi->lock_duration_priority = t; + return count; + } + + if (!strcmp(a->attr.name, "critical_task_priority")) { + if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE)) + return -EINVAL; + if (!capable(CAP_SYS_NICE)) + return -EPERM; + sbi->critical_task_priority = t; + if (sbi->cprc_info.f2fs_issue_ckpt) + set_user_nice(sbi->cprc_info.f2fs_issue_ckpt, + PRIO_TO_NICE(sbi->critical_task_priority)); + if (sbi->gc_thread && sbi->gc_thread->f2fs_gc_task) + set_user_nice(sbi->gc_thread->f2fs_gc_task, + PRIO_TO_NICE(sbi->critical_task_priority)); + return count; + } + + __sbi_store_value(a, sbi, ptr + a->offset, t); return count; } @@ -1053,24 +1131,27 @@ static struct f2fs_attr f2fs_attr_sb_##_name = { \ .id = F2FS_FEATURE_##_feat, \ } -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ 
+#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset, _size) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ .struct_type = _struct_type, \ - .offset = _offset \ + .offset = _offset, \ + .size = _size \ } #define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0444, \ f2fs_sbi_show, NULL, \ - offsetof(struct struct_name, elname)) + offsetof(struct struct_name, elname), \ + sizeof_field(struct struct_name, elname)) #define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0644, \ f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) + offsetof(struct struct_name, elname), \ + sizeof_field(struct struct_name, elname)) #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) @@ -1219,6 +1300,10 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); F2FS_SBI_GENERAL_RW_ATTR(carve_out); F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); +F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time); +F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority); +F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority); +F2FS_SBI_GENERAL_RW_ATTR(critical_task_priority); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1232,6 +1317,7 @@ STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]); #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate); FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type); +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TIMEOUT, inject_lock_timeout); #endif /* RESERVED_BLOCKS ATTR */ @@ -1361,6 +1447,7 @@ static struct attribute *f2fs_attrs[] = { #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), + ATTR_LIST(inject_lock_timeout), #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), @@ 
-1422,6 +1509,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reserved_pin_section), ATTR_LIST(allocate_section_hint), ATTR_LIST(allocate_section_policy), + ATTR_LIST(max_lock_elapsed_time), + ATTR_LIST(lock_duration_priority), + ATTR_LIST(adjust_lock_priority), + ATTR_LIST(critical_task_priority), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index b4e5c406632f..941dc62a6d6f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -804,6 +804,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct folio *ifolio, int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_lock_context lc; int err; if (unlikely(f2fs_cp_error(sbi))) @@ -821,11 +822,11 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, size, ifolio, flags); f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + f2fs_lock_op(sbi, &lc); f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, NULL, flags); f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); - f2fs_unlock_op(sbi); + f2fs_unlock_op(sbi, &lc); f2fs_update_time(sbi, REQ_TIME); return err; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index a7880787cad3..dc41722fcc9d 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -17,7 +17,6 @@ #define F2FS_LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) /* log number for sector/blk */ #define F2FS_BLKSIZE PAGE_SIZE /* support only block == page */ #define F2FS_BLKSIZE_BITS PAGE_SHIFT /* bits for F2FS_BLKSIZE */ -#define F2FS_SUM_BLKSIZE 4096 /* only support 4096 byte sum block */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ #define F2FS_EXTENSION_LEN 8 /* max size of extension */ @@ -442,10 +441,8 @@ struct f2fs_sit_block { * from node's page's beginning to get a data block address. 
* ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node) */ -#define ENTRIES_IN_SUM (F2FS_SUM_BLKSIZE / 8) #define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */ #define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */ -#define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM) /* a summary entry for a block in a segment */ struct f2fs_summary { @@ -468,22 +465,6 @@ struct summary_footer { __le32 check_sum; /* summary checksum */ } __packed; -#define SUM_JOURNAL_SIZE (F2FS_SUM_BLKSIZE - SUM_FOOTER_SIZE -\ - SUM_ENTRY_SIZE) -#define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\ - sizeof(struct nat_journal_entry)) -#define NAT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ - sizeof(struct nat_journal_entry)) -#define SIT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\ - sizeof(struct sit_journal_entry)) -#define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ - sizeof(struct sit_journal_entry)) - -/* Reserved area should make size of f2fs_extra_info equals to - * that of nat_journal and sit_journal. - */ -#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8) - /* * frequently updated NAT/SIT entries can be stored in the spare area in * summary blocks @@ -498,9 +479,16 @@ struct nat_journal_entry { struct f2fs_nat_entry ne; } __packed; +/* + * The nat_journal structure is a placeholder whose actual size varies depending + * on the use of packed_ssa. Therefore, it must always be accessed only through + * specific sets of macros and fields, and size calculations should use + * size-related macros instead of sizeof(). + * Relevant macros: sbi->nat_journal_entries, nat_in_journal(), + * nid_in_journal(), MAX_NAT_JENTRIES(). 
+ */ struct nat_journal { - struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES]; - __u8 reserved[NAT_JOURNAL_RESERVED]; + struct nat_journal_entry entries[0]; } __packed; struct sit_journal_entry { @@ -508,14 +496,21 @@ struct sit_journal_entry { struct f2fs_sit_entry se; } __packed; +/* + * The sit_journal structure is a placeholder whose actual size varies depending + * on the use of packed_ssa. Therefore, it must always be accessed only through + * specific sets of macros and fields, and size calculations should use + * size-related macros instead of sizeof(). + * Relevant macros: sbi->sit_journal_entries, sit_in_journal(), + * segno_in_journal(), MAX_SIT_JENTRIES(). + */ struct sit_journal { - struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES]; - __u8 reserved[SIT_JOURNAL_RESERVED]; + struct sit_journal_entry entries[0]; } __packed; struct f2fs_extra_info { __le64 kbytes_written; - __u8 reserved[EXTRA_INFO_RESERVED]; + __u8 reserved[]; } __packed; struct f2fs_journal { @@ -531,11 +526,33 @@ struct f2fs_journal { }; } __packed; -/* Block-sized summary block structure */ +/* + * Block-sized summary block structure + * + * The f2fs_summary_block structure is a placeholder whose actual size varies + * depending on the use of packed_ssa. Therefore, it must always be accessed + * only through specific sets of macros and fields, and size calculations should + * use size-related macros instead of sizeof(). + * Relevant macros: sbi->sum_blocksize, sbi->entries_in_sum, + * sbi->sum_entry_size, sum_entries(), sum_journal(), sum_footer(). + * + * Summary Block Layout + * + * +-----------------------+ <--- Block Start + * | struct f2fs_summary | + * | entries[0] | + * | ... 
| + * | entries[N-1] | + * +-----------------------+ + * | struct f2fs_journal | + * +-----------------------+ + * | struct summary_footer | + * +-----------------------+ <--- Block End + */ struct f2fs_summary_block { - struct f2fs_summary entries[ENTRIES_IN_SUM]; - struct f2fs_journal journal; - struct summary_footer footer; + struct f2fs_summary entries[0]; + // struct f2fs_journal journal; + // struct summary_footer footer; } __packed; /* diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index df4017dcc701..9364e6775562 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -184,6 +184,15 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); { CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" }, \ { CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" }) +#define show_lock_name(lock) \ + __print_symbolic(lock, \ + { LOCK_NAME_CP_RWSEM, "cp_rwsem" }, \ + { LOCK_NAME_NODE_CHANGE, "node_change" }, \ + { LOCK_NAME_NODE_WRITE, "node_write" }, \ + { LOCK_NAME_GC_LOCK, "gc_lock" }, \ + { LOCK_NAME_CP_GLOBAL, "cp_global" }, \ + { LOCK_NAME_IO_RWSEM, "io_rwsem" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1358,6 +1367,7 @@ DECLARE_EVENT_CLASS(f2fs__folio, __field(int, type) __field(int, dir) __field(pgoff_t, index) + __field(pgoff_t, nrpages) __field(int, dirty) __field(int, uptodate) ), @@ -1368,16 +1378,18 @@ DECLARE_EVENT_CLASS(f2fs__folio, __entry->type = type; __entry->dir = S_ISDIR(folio->mapping->host->i_mode); __entry->index = folio->index; + __entry->nrpages= folio_nr_pages(folio); __entry->dirty = folio_test_dirty(folio); __entry->uptodate = folio_test_uptodate(folio); ), - TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, " + TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, nr_pages = %lu, " "dirty = %d, uptodate = %d", show_dev_ino(__entry), show_block_type(__entry->type), show_file_type(__entry->dir), (unsigned long)__entry->index, + (unsigned long)__entry->nrpages, __entry->dirty, 
__entry->uptodate) ); @@ -1403,6 +1415,13 @@ DEFINE_EVENT(f2fs__folio, f2fs_readpage, TP_ARGS(folio, type) ); +DEFINE_EVENT(f2fs__folio, f2fs_read_folio, + + TP_PROTO(struct folio *folio, int type), + + TP_ARGS(folio, type) +); + DEFINE_EVENT(f2fs__folio, f2fs_set_page_dirty, TP_PROTO(struct folio *folio, int type), @@ -2442,6 +2461,127 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, TP_ARGS(inode, offset, bytes) ); +TRACE_EVENT(f2fs_lock_elapsed_time, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int ioprio, + unsigned long long total_time, + unsigned long long running_time, + unsigned long long runnable_time, + unsigned long long io_sleep_time, + unsigned long long other_time), + + TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time, + runnable_time, io_sleep_time, other_time), + + TP_STRUCT__entry( + __field(dev_t, dev) + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, prio) + __field(int, ioprio_class) + __field(int, ioprio_data) + __field(unsigned int, lock_name) + __field(bool, is_write) + __field(unsigned long long, total_time) + __field(unsigned long long, running_time) + __field(unsigned long long, runnable_time) + __field(unsigned long long, io_sleep_time) + __field(unsigned long long, other_time) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + __entry->ioprio_data = IOPRIO_PRIO_DATA(ioprio); + __entry->lock_name = lock_name; + __entry->is_write = is_write; + __entry->total_time = total_time; + __entry->running_time = running_time; + __entry->runnable_time = runnable_time; + __entry->io_sleep_time = io_sleep_time; + __entry->other_time = other_time; + ), + + TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, " + "ioprio_class: %d, ioprio_data: %d, lock_name: %s, " + "lock_type: %s, 
total: %llu, running: %llu, " + "runnable: %llu, io_sleep: %llu, other: %llu", + show_dev(__entry->dev), + __entry->comm, + __entry->pid, + __entry->prio, + __entry->ioprio_class, + __entry->ioprio_data, + show_lock_name(__entry->lock_name), + __entry->is_write ? "wlock" : "rlock", + __entry->total_time, + __entry->running_time, + __entry->runnable_time, + __entry->io_sleep_time, + __entry->other_time) +); + +DECLARE_EVENT_CLASS(f2fs_priority_update, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int orig_prio, + int new_prio), + + TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio), + + TP_STRUCT__entry( + __field(dev_t, dev) + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned int, lock_name) + __field(bool, is_write) + __field(int, orig_prio) + __field(int, new_prio) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->lock_name = lock_name; + __entry->is_write = is_write; + __entry->orig_prio = orig_prio; + __entry->new_prio = new_prio; + ), + + TP_printk("dev = (%d,%d), comm: %s, pid: %d, lock_name: %s, " + "lock_type: %s, orig_prio: %d, new_prio: %d", + show_dev(__entry->dev), + __entry->comm, + __entry->pid, + show_lock_name(__entry->lock_name), + __entry->is_write ? 
"wlock" : "rlock", + __entry->orig_prio, + __entry->new_prio) +); + +DEFINE_EVENT(f2fs_priority_update, f2fs_priority_uplift, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int orig_prio, + int new_prio), + + TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio) +); + +DEFINE_EVENT(f2fs_priority_update, f2fs_priority_restore, + + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name, + bool is_write, struct task_struct *p, int orig_prio, + int new_prio), + + TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */