Merge tag 'ext4_for_linus_6.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Major ext4 changes for 6.17:

   - Better scalability for ext4 block allocation

   - Fix insufficient credits when writing back large folios

 Miscellaneous bug fixes, especially when handling extended attributes,
  inline data, and fast commit"

* tag 'ext4_for_linus_6.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (39 commits)
  ext4: do not BUG when INLINE_DATA_FL lacks system.data xattr
  ext4: implement linear-like traversal across order xarrays
  ext4: refactor choose group to scan group
  ext4: convert free groups order lists to xarrays
  ext4: factor out ext4_mb_scan_group()
  ext4: factor out ext4_mb_might_prefetch()
  ext4: factor out __ext4_mb_scan_group()
  ext4: fix largest free orders lists corruption on mb_optimize_scan switch
  ext4: fix zombie groups in average fragment size lists
  ext4: merge freed extent with existing extents before insertion
  ext4: convert sbi->s_mb_free_pending to atomic_t
  ext4: fix typo in CR_GOAL_LEN_SLOW comment
  ext4: get rid of some obsolete EXT4_MB_HINT flags
  ext4: utilize multiple global goals to reduce contention
  ext4: remove unnecessary s_md_lock on update s_mb_last_group
  ext4: remove unnecessary s_mb_last_start
  ext4: separate stream goal hits from s_bal_goals for better tracking
  ext4: add ext4_try_lock_group() to skip busy groups
  ext4: initialize superblock fields in the kballoc-test.c kunit tests
  ext4: refactor the inline directory conversion and new directory codepaths
  ...
commit ff7dcfedf9 (Linus Torvalds, 2025-07-31 10:02:44 -07:00)
15 changed files with 902 additions and 694 deletions

fs/ext4/balloc.c

@@ -703,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* possible we just missed a transaction commit that did so
*/
smp_mb();
if (sbi->s_mb_free_pending == 0) {
if (atomic_read(&sbi->s_mb_free_pending) == 0) {
if (test_opt(sb, DISCARD)) {
atomic_inc(&sbi->s_retry_alloc_pending);
flush_work(&sbi->s_discard_work);
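The hunk above is the reader side of the "ext4: convert sbi->s_mb_free_pending to atomic_t" commit: once the counter is an atomic_t, ext4_should_retry_alloc() can sample it without taking sbi->s_md_lock. A minimal sketch of the pattern, editorial and not from the patch, with names chosen only to mirror the ext4 fields:

    #include <linux/atomic.h>

    struct free_pending_demo {
    	atomic_t pending;	/* was: unsigned int guarded by s_md_lock */
    };

    static void demo_note_freed(struct free_pending_demo *d, int nr)
    {
    	atomic_add(nr, &d->pending);	/* writers update locklessly */
    }

    static bool demo_nothing_pending(struct free_pending_demo *d)
    {
    	/* readers, like ext4_should_retry_alloc(), skip the lock */
    	return atomic_read(&d->pending) == 0;
    }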

fs/ext4/ext4.h

@@ -157,7 +157,7 @@ enum criteria {
/*
* Reads each block group sequentially, performing disk IO if
* necessary, to find find_suitable block group. Tries to
* necessary, to find suitable block group. Tries to
* allocate goal length but might trim the request if nothing
* is found after enough tries.
*/
@@ -185,14 +185,8 @@ enum criteria {
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
/* blocks already reserved */
#define EXT4_MB_HINT_RESERVED 0x0002
/* metadata is being allocated */
#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST 0x0008
/* search for the best chunk */
#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
@@ -213,15 +207,6 @@ enum criteria {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
/* Large fragment size list lookup succeeded at least once for
* CR_POWER2_ALIGNED */
#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
/* Avg fragment size rb tree lookup succeeded at least once for
* CR_GOAL_LEN_FAST */
#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
/* Avg fragment size rb tree lookup succeeded at least once for
* CR_BEST_AVAIL_LEN */
#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -1608,16 +1593,14 @@ struct ext4_sb_info {
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
unsigned int s_mb_free_pending;
atomic_t s_mb_free_pending;
struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
atomic_t s_retry_alloc_pending;
struct list_head *s_mb_avg_fragment_size;
rwlock_t *s_mb_avg_fragment_size_locks;
struct list_head *s_mb_largest_free_orders;
rwlock_t *s_mb_largest_free_orders_locks;
struct xarray *s_mb_avg_fragment_size;
struct xarray *s_mb_largest_free_orders;
/* tunables */
unsigned long s_stripe;
@@ -1629,15 +1612,16 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
unsigned int s_mb_best_avail_max_trim_order;
unsigned int s_sb_update_sec;
unsigned int s_sb_update_kb;
/* where last allocation was done - for stream allocation */
ext4_group_t *s_mb_last_groups;
unsigned int s_mb_nr_global_goals;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
@@ -1646,12 +1630,10 @@ struct ext4_sb_info {
atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
atomic_t s_bal_stream_goals; /* stream allocation global goal hits */
atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
atomic_t s_bal_p2_aligned_bad_suggestions;
atomic_t s_bal_goal_fast_bad_suggestions;
atomic_t s_bal_best_avail_bad_suggestions;
atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
@@ -3020,7 +3002,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
bool ext4_should_enable_large_folio(struct inode *inode);
void ext4_set_inode_mapping_order(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3064,9 +3046,9 @@ extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
@@ -3489,8 +3471,6 @@ struct ext4_group_info {
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
struct list_head bb_avg_fragment_size_node;
struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
@@ -3541,23 +3521,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
}
static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group)
{
if (!spin_trylock(ext4_group_lock_ptr(sb, group)))
return false;
/*
* We're able to grab the lock right away, so drop the lock
* contention counter.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
return true;
}
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
spinlock_t *lock = ext4_group_lock_ptr(sb, group);
if (spin_trylock(lock))
/*
* We're able to grab the lock right away, so drop the
* lock contention counter.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
else {
if (!ext4_try_lock_group(sb, group)) {
/*
* The lock is busy, so bump the contention counter,
* and then wait on the spin lock.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
EXT4_MAX_CONTENTION);
spin_lock(lock);
spin_lock(ext4_group_lock_ptr(sb, group));
}
}
@@ -3612,6 +3597,7 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
extern void ext4_update_final_de(void *de_buf, int old_size, int new_size);
int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
@@ -3671,10 +3657,10 @@ static inline int ext4_has_inline_data(struct inode *inode)
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
struct ext4_dir_entry_2 *de,
int blocksize, int csum_size,
unsigned int parent_ino, int dotdot_real_len);
extern int ext4_init_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *dir_block,
unsigned int parent_ino, void *inline_buf,
int inline_size);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
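ext4_try_lock_group(), added in the hunks above, is the building block for the "ext4: add ext4_try_lock_group() to skip busy groups" commit: a scan can pass over a group whose lock is contended instead of spinning on it. A hedged sketch of such a loop follows; the loop shape is illustrative, not the actual mballoc scanning code:

    /* Opportunistic pass: skip groups whose locks are busy. A later
     * blocking pass can fall back to ext4_lock_group(), which bumps
     * s_lock_busy so that future callers back off sooner. */
    static void scan_groups_sketch(struct super_block *sb, ext4_group_t ngroups)
    {
    	ext4_group_t group;

    	for (group = 0; group < ngroups; group++) {
    		if (!ext4_try_lock_group(sb, group))
    			continue;	/* busy: try the next group */
    		/* ... look for a suitable free extent in this group ... */
    		ext4_unlock_group(sb, group);
    	}
    }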

fs/ext4/ext4_extents.h

@@ -30,13 +30,6 @@
*/
#define CHECK_BINSEARCH__
/*
* If EXT_STATS is defined then stats numbers are collected.
* These number will be displayed at umount time.
*/
#define EXT_STATS_
/*
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;

fs/ext4/extents.c

@@ -5215,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
credits = depth + 2;
}
restart_credits = ext4_writepage_trans_blocks(inode);
restart_credits = ext4_chunk_trans_extent(inode, 0);
err = ext4_datasem_ensure_credits(handle, inode, credits,
restart_credits, 0);
if (err) {
@@ -5475,7 +5475,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -5571,7 +5571,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);

fs/ext4/ialloc.c

@@ -1335,8 +1335,7 @@ got:
}
}
if (ext4_should_enable_large_folio(inode))
mapping_set_large_folios(inode->i_mapping);
ext4_set_inode_mapping_order(inode);
ext4_update_inode_fsync_trans(handle, inode, 1);

fs/ext4/inline.c

@@ -303,7 +303,11 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
goto out;
BUG_ON(!is.s.not_found);
if (!is.s.not_found) {
EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
error = -EFSCORRUPTED;
goto out;
}
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
@@ -354,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error)
goto out;
BUG_ON(is.s.not_found);
if (is.s.not_found) {
EXT4_ERROR_INODE(inode, "missing inline data xattr");
error = -EFSCORRUPTED;
goto out;
}
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
@@ -562,7 +570,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
return 0;
}
needed_blocks = ext4_writepage_trans_blocks(inode);
needed_blocks = ext4_chunk_trans_extent(inode, 1);
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -612,6 +620,7 @@ retry:
} else
ret = ext4_block_write_begin(handle, folio, from, to,
ext4_get_block);
clear_buffer_new(folio_buffers(folio));
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -891,6 +900,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
return ret;
}
clear_buffer_new(folio_buffers(folio));
folio_mark_dirty(folio);
folio_mark_uptodate(folio);
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
@@ -995,7 +1005,7 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode,
}
/* Set the final de to cover the whole block. */
static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
void ext4_update_final_de(void *de_buf, int old_size, int new_size)
{
struct ext4_dir_entry_2 *de, *prev_de;
void *limit;
@@ -1059,51 +1069,6 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
static int ext4_finish_convert_inline_dir(handle_t *handle,
struct inode *inode,
struct buffer_head *dir_block,
void *buf,
int inline_size)
{
int err, csum_size = 0, header_size = 0;
struct ext4_dir_entry_2 *de;
void *target = dir_block->b_data;
/*
* First create "." and ".." and then copy the dir information
* back to the block.
*/
de = target;
de = ext4_init_dot_dotdot(inode, de,
inode->i_sb->s_blocksize, csum_size,
le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
header_size = (void *)de - target;
memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
inline_size - EXT4_INLINE_DOTDOT_SIZE);
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
inode->i_size = inode->i_sb->s_blocksize;
i_size_write(inode, inode->i_sb->s_blocksize);
EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
ext4_update_final_de(dir_block->b_data,
inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
inode->i_sb->s_blocksize - csum_size);
if (csum_size)
ext4_initialize_dirent_tail(dir_block,
inode->i_sb->s_blocksize);
set_buffer_uptodate(dir_block);
unlock_buffer(dir_block);
err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
if (err)
return err;
set_buffer_verified(dir_block);
return ext4_mark_inode_dirty(handle, inode);
}
static int ext4_convert_inline_data_nolock(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc)
@@ -1175,8 +1140,17 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
buf, inline_size);
unlock_buffer(data_bh);
inode->i_size = inode->i_sb->s_blocksize;
i_size_write(inode, inode->i_sb->s_blocksize);
EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
error = ext4_init_dirblock(handle, inode, data_bh,
le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode),
buf + EXT4_INLINE_DOTDOT_SIZE,
inline_size - EXT4_INLINE_DOTDOT_SIZE);
if (!error)
error = ext4_mark_inode_dirty(handle, inode);
}
out_restore:
@@ -1315,7 +1289,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
if (pos == 0) {
fake.inode = cpu_to_le32(inode->i_ino);
fake.name_len = 1;
strcpy(fake.name, ".");
memcpy(fake.name, ".", 2);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
@@ -1325,7 +1299,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
fake.inode = cpu_to_le32(parent_ino);
fake.name_len = 2;
strcpy(fake.name, "..");
memcpy(fake.name, "..", 3);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
@@ -1864,7 +1838,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
};
needed_blocks = ext4_writepage_trans_blocks(inode);
needed_blocks = ext4_chunk_trans_extent(inode, 1);
handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1903,7 +1877,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
goto out_error;
BUG_ON(is.s.not_found);
if (is.s.not_found) {
EXT4_ERROR_INODE(inode,
"missing inline data xattr");
err = -EFSCORRUPTED;
goto out_error;
}
value_len = le32_to_cpu(is.s.here->e_value_size);
value = kmalloc(value_len, GFP_NOFS);
@@ -1979,7 +1958,7 @@ int ext4_convert_inline_data(struct inode *inode)
return 0;
}
needed_blocks = ext4_writepage_trans_blocks(inode);
needed_blocks = ext4_chunk_trans_extent(inode, 1);
iloc.bh = NULL;
error = ext4_get_inode_loc(inode, &iloc);

fs/ext4/inode.c

@@ -723,8 +723,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_check_map_extents_env(inode);
/* Lookup extent status tree firstly */
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -757,8 +756,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
orig_mlen == map->m_len)
goto found;
if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
map->m_len = orig_mlen;
map->m_len = orig_mlen;
}
/*
* In the query cache no-wait mode, nothing we can do more if we
@@ -877,6 +875,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
} while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
}
/*
* Make sure that the current journal transaction has enough credits to map
* one extent. Return -EAGAIN if it cannot extend the current running
* transaction.
*/
static inline int ext4_journal_ensure_extent_credits(handle_t *handle,
struct inode *inode)
{
int credits;
int ret;
/* Called from ext4_da_write_begin() which has no handle started? */
if (!handle)
return 0;
credits = ext4_chunk_trans_blocks(inode, 1);
ret = __ext4_journal_ensure_credits(handle, credits, credits, 0);
return ret <= 0 ? ret : -EAGAIN;
}
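/*
 * Editorial note, not in the patch: -EAGAIN here means the running
 * handle could not be extended by an extent's worth of credits.
 * Callers in this series turn that into a retry with a fresh
 * transaction: see the retry_journal loop in ext4_write_begin(), the
 * -EAGAIN check in ext4_do_writepages(), and the retry_alloc loop in
 * ext4_page_mkwrite() later in this file.
 */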
static int _ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int flags)
{
@@ -1171,11 +1189,13 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
}
continue;
}
if (buffer_new(bh))
if (WARN_ON_ONCE(buffer_new(bh)))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
err = ext4_journal_ensure_extent_credits(handle, inode);
if (!err)
err = get_block(inode, block, bh, 1);
if (err)
break;
if (buffer_new(bh)) {
@@ -1274,7 +1294,8 @@ static int ext4_write_begin(const struct kiocb *iocb,
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
*/
needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
needed_blocks = ext4_chunk_trans_extent(inode,
ext4_journal_blocks_per_folio(inode)) + 1;
index = pos >> PAGE_SHIFT;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -1372,8 +1393,9 @@ retry_journal:
ext4_orphan_del(NULL, inode);
}
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
if (ret == -EAGAIN ||
(ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_journal;
folio_put(folio);
return ret;
@@ -1393,6 +1415,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
ret = ext4_dirty_journalled_data(handle, bh);
clear_buffer_meta(bh);
clear_buffer_prio(bh);
clear_buffer_new(bh);
return ret;
}
@@ -1665,11 +1688,12 @@ struct mpage_da_data {
unsigned int can_map:1; /* Can writepages call map blocks? */
/* These are internal state of ext4_do_writepages() */
pgoff_t first_page; /* The first page to write */
pgoff_t next_page; /* Current page to examine */
pgoff_t last_page; /* Last page to examine */
loff_t start_pos; /* The start pos to write */
loff_t next_pos; /* Current pos to examine */
loff_t end_pos; /* Last pos to examine */
/*
* Extent to map - this can be after first_page because that can be
* Extent to map - this can be after start_pos because that can be
* fully mapped. We somewhat abuse m_flags to store whether the extent
* is delalloc or unwritten.
*/
@@ -1689,38 +1713,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
/* This is necessary when next_page == 0. */
if (mpd->first_page >= mpd->next_page)
/* This is necessary when next_pos == 0. */
if (mpd->start_pos >= mpd->next_pos)
return;
mpd->scanned_until_end = 0;
index = mpd->first_page;
end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
start = index << (PAGE_SHIFT - inode->i_blkbits);
last = end << (PAGE_SHIFT - inode->i_blkbits);
start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
last = mpd->next_pos >> inode->i_blkbits;
/*
* avoid racing with extent status tree scans made by
* ext4_insert_delayed_block()
*/
down_write(&EXT4_I(inode)->i_data_sem);
ext4_es_remove_extent(inode, start, last - start + 1);
ext4_es_remove_extent(inode, start, last - start);
up_write(&EXT4_I(inode)->i_data_sem);
}
folio_batch_init(&fbatch);
while (index <= end) {
nr = filemap_get_folios(mapping, &index, end, &fbatch);
index = mpd->start_pos >> PAGE_SHIFT;
end = mpd->next_pos >> PAGE_SHIFT;
while (index < end) {
nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
if (nr == 0)
break;
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
if (folio->index < mpd->first_page)
if (folio_pos(folio) < mpd->start_pos)
continue;
if (folio_next_index(folio) - 1 > end)
if (folio_next_index(folio) > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -2022,7 +2046,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
mpd->first_page += folio_nr_pages(folio);
mpd->start_pos += folio_size(folio);
mpd->wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
@@ -2032,7 +2057,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
loff_t size;
int err;
BUG_ON(folio->index != mpd->first_page);
WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
folio_clear_dirty_for_io(folio);
/*
* We have to be very careful here! Nothing protects writeback path
@@ -2053,8 +2078,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
!ext4_verity_in_progress(mpd->inode))
len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
mpd->wbc->nr_to_write -= folio_nr_pages(folio);
return err;
}
@@ -2321,6 +2344,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
int get_blocks_flags;
int err, dioread_nolock;
/* Make sure transaction has enough credits for this extent */
err = ext4_journal_ensure_extent_credits(handle, inode);
if (err < 0)
return err;
trace_ext4_da_write_pages_extent(inode, map);
/*
* Call ext4_map_blocks() to allocate any delayed allocation blocks, or
@@ -2359,6 +2387,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
return 0;
}
/*
* This is used to submit mapped buffers in a single folio that is not fully
* mapped for various reasons, such as insufficient space or journal credits.
*/
static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
{
struct inode *inode = mpd->inode;
struct folio *folio;
loff_t pos;
int ret;
folio = filemap_get_folio(inode->i_mapping,
mpd->start_pos >> PAGE_SHIFT);
if (IS_ERR(folio))
return PTR_ERR(folio);
/*
* The mapped position should be within the current processing folio
* but must not be the folio start position.
*/
pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits;
if (WARN_ON_ONCE((folio_pos(folio) == pos) ||
!folio_contains(folio, pos >> PAGE_SHIFT)))
return -EINVAL;
ret = mpage_submit_folio(mpd, folio);
if (ret)
goto out;
/*
* Update start_pos to prevent this folio from being released in
* mpage_release_unused_pages(), it will be reset to the aligned folio
* pos when this folio is written again in the next round. Additionally,
* do not update wbc->nr_to_write here, as it will be updated once the
* entire folio has finished processing.
*/
mpd->start_pos = pos;
out:
folio_unlock(folio);
folio_put(folio);
return ret;
}
/*
* mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
* mpd->len and submit pages underlying it for IO
@@ -2407,10 +2476,18 @@ static int mpage_map_and_submit_extent(handle_t *handle,
* In the case of ENOSPC, if ext4_count_free_blocks()
* is non-zero, a commit should free up blocks.
*/
if ((err == -ENOMEM) ||
if ((err == -ENOMEM) || (err == -EAGAIN) ||
(err == -ENOSPC && ext4_count_free_clusters(sb))) {
if (progress)
/*
* We may have already allocated extents for
* some bhs inside the folio, issue the
* corresponding data to prevent stale data.
*/
if (progress) {
if (mpage_submit_partial_folio(mpd))
goto invalidate_dirty_pages;
goto update_disksize;
}
return err;
}
ext4_msg(sb, KERN_CRIT,
@@ -2444,7 +2521,7 @@ update_disksize:
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
*/
disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
disksize = mpd->start_pos;
if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2468,21 +2545,6 @@ update_disksize:
return err;
}
/*
* Calculate the total number of credits to reserve for one writepages
* iteration. This is called from ext4_writepages(). We map an extent of
* up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
* the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
* bpp - 1 blocks in bpp different extents.
*/
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
int bpp = ext4_journal_blocks_per_folio(inode);
return ext4_meta_trans_blocks(inode,
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
size_t len)
{
@@ -2547,8 +2609,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
struct address_space *mapping = mpd->inode->i_mapping;
struct folio_batch fbatch;
unsigned int nr_folios;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
@@ -2563,7 +2625,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
tag = PAGECACHE_TAG_DIRTY;
mpd->map.m_len = 0;
mpd->next_page = index;
mpd->next_pos = mpd->start_pos;
if (ext4_should_journal_data(mpd->inode)) {
handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
bpp);
@@ -2594,7 +2656,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
goto out;
/* If we can't merge this page, we are done. */
if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
if (mpd->map.m_len > 0 &&
mpd->next_pos != folio_pos(folio))
goto out;
if (handle) {
@@ -2640,8 +2703,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
}
if (mpd->map.m_len == 0)
mpd->first_page = folio->index;
mpd->next_page = folio_next_index(folio);
mpd->start_pos = folio_pos(folio);
mpd->next_pos = folio_pos(folio) + folio_size(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -2769,12 +2832,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
mpd->journalled_more_data = 0;
if (ext4_should_dioread_nolock(inode)) {
int bpf = ext4_journal_blocks_per_folio(inode);
/*
* We may need to convert up to one extent per block in
* the page and we may dirty the inode.
* the folio and we may dirty the inode.
*/
rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
PAGE_SIZE >> inode->i_blkbits);
rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf);
}
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2784,18 +2847,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
mpd->first_page = writeback_index;
mpd->last_page = -1;
mpd->start_pos = writeback_index << PAGE_SHIFT;
mpd->end_pos = LLONG_MAX;
} else {
mpd->first_page = wbc->range_start >> PAGE_SHIFT;
mpd->last_page = wbc->range_end >> PAGE_SHIFT;
mpd->start_pos = wbc->range_start;
mpd->end_pos = wbc->range_end;
}
ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, mpd->first_page,
mpd->last_page);
tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
mpd->end_pos >> PAGE_SHIFT);
blk_start_plug(&plug);
/*
@@ -2838,8 +2901,14 @@ retry:
* not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
/*
* Calculate the number of credits needed to reserve for one
* extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will
* attempt to extend the transaction or start a new iteration
* if the reserved credits are insufficient.
*/
needed_blocks = ext4_chunk_trans_blocks(inode,
MAX_WRITEPAGES_EXTENT_LEN);
/* start a new transaction */
handle = ext4_journal_start_with_reserve(inode,
EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
@@ -2855,7 +2924,8 @@ retry:
}
mpd->do_map = 1;
trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
trace_ext4_da_write_folios_start(inode, mpd->start_pos,
mpd->next_pos, wbc);
ret = mpage_prepare_extent_to_map(mpd);
if (!ret && mpd->map.m_len)
ret = mpage_map_and_submit_extent(handle, mpd,
@@ -2893,6 +2963,8 @@ retry:
} else
ext4_put_io_end(mpd->io_submit.io_end);
mpd->io_submit.io_end = NULL;
trace_ext4_da_write_folios_end(inode, mpd->start_pos,
mpd->next_pos, wbc, ret);
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2904,6 +2976,8 @@ retry:
ret = 0;
continue;
}
if (ret == -EAGAIN)
ret = 0;
/* Fatal error - ENOMEM, EIO... */
if (ret)
break;
@@ -2912,8 +2986,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
mpd->last_page = writeback_index - 1;
mpd->first_page = 0;
mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
mpd->start_pos = 0;
goto retry;
}
@@ -2923,7 +2997,7 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
mapping->writeback_index = mpd->first_page;
mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
@@ -4384,7 +4458,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
credits = ext4_chunk_trans_extent(inode, 2);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
@@ -4533,7 +4607,7 @@ int ext4_truncate(struct inode *inode)
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
credits = ext4_chunk_trans_extent(inode, 1);
else
credits = ext4_blocks_for_truncate(inode);
@@ -5101,7 +5175,7 @@ error:
return -EFSCORRUPTED;
}
bool ext4_should_enable_large_folio(struct inode *inode)
static bool ext4_should_enable_large_folio(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -5118,6 +5192,22 @@ bool ext4_should_enable_large_folio(struct inode *inode)
return true;
}
/*
* Limit the maximum folio order to 2048 blocks to prevent overestimation
* of reserve handle credits during the folio writeback in environments
* where the PAGE_SIZE exceeds 4KB.
*/
#define EXT4_MAX_PAGECACHE_ORDER(i) \
umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
void ext4_set_inode_mapping_order(struct inode *inode)
{
if (!ext4_should_enable_large_folio(inode))
return;
mapping_set_folio_order_range(inode->i_mapping, 0,
EXT4_MAX_PAGECACHE_ORDER(inode));
}
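/*
 * Editorial arithmetic check, not in the patch: with the order capped
 * at 11 + i_blkbits - PAGE_SHIFT, the largest folio is
 *   PAGE_SIZE << order = 2^(PAGE_SHIFT + 11 + i_blkbits - PAGE_SHIFT)
 *                      = 2^(11 + i_blkbits) bytes = 2048 blocks,
 * matching the comment above. For example, 64K pages with 4K blocks
 * give order 11 + 12 - 16 = 7, i.e. 8MB folios; on 4K-page systems
 * the MAX_PAGECACHE_ORDER term of the umin() is the effective limit.
 */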
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -5435,8 +5525,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
if (ext4_should_enable_large_folio(inode))
mapping_set_large_folios(inode->i_mapping);
ext4_set_inode_mapping_order(inode);
ret = check_igot_inode(inode, flags, function, line);
/*
@@ -6134,7 +6224,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
int ret;
/*
* How many index and lead blocks need to touch to map @lblocks
* How many index and leaf blocks need to touch to map @lblocks
* logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
@@ -6143,7 +6233,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
groups = idxblocks;
groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -6160,25 +6250,19 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
}
/*
* Calculate the total number of credits to reserve to fit
* the modification of a single pages into a single transaction,
* which may include multiple chunks of block allocations.
*
* This could be called via ext4_write_begin()
*
* We need to consider the worse case, when
* one new block per extent.
* Calculate the journal credits for modifying the number of blocks
* in a single extent within one transaction. 'nrblocks' is used only
* for non-extent inodes. For extent type inodes, 'nrblocks' can be
* zero if the exact number of blocks is unknown.
*/
int ext4_writepage_trans_blocks(struct inode *inode)
int ext4_chunk_trans_extent(struct inode *inode, int nrblocks)
{
int bpp = ext4_journal_blocks_per_folio(inode);
int ret;
ret = ext4_meta_trans_blocks(inode, bpp, bpp);
ret = ext4_meta_trans_blocks(inode, nrblocks, 1);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
ret += bpp;
ret += nrblocks;
return ret;
}
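/*
 * Editorial note, not in the patch: nrblocks at the call sites in this
 * series counts only the data blocks that may be journalled. The
 * extent-shift paths pass 0, the inline-data conversions pass 1,
 * ext4_punch_hole() passes 2 (presumably one partial block at each end
 * of the hole), and ext4_write_begin() passes
 * ext4_journal_blocks_per_folio(inode).
 */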
@@ -6550,6 +6634,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
return !buffer_mapped(bh);
}
static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio,
get_block_t get_block)
{
handle_t *handle;
loff_t size;
unsigned long len;
int credits;
int ret;
credits = ext4_chunk_trans_extent(inode,
ext4_journal_blocks_per_folio(inode));
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
folio_lock(folio);
size = i_size_read(inode);
/* Page got truncated from under us? */
if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) {
ret = -EFAULT;
goto out_error;
}
len = folio_size(folio);
if (folio_pos(folio) + len > size)
len = size - folio_pos(folio);
ret = ext4_block_write_begin(handle, folio, 0, len, get_block);
if (ret)
goto out_error;
if (!ext4_should_journal_data(inode)) {
block_commit_write(folio, 0, len);
folio_mark_dirty(folio);
} else {
ret = ext4_journal_folio_buffers(handle, folio, len);
if (ret)
goto out_error;
}
ext4_journal_stop(handle);
folio_wait_stable(folio);
return ret;
out_error:
folio_unlock(folio);
ext4_journal_stop(handle);
return ret;
}
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6561,8 +6694,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
handle_t *handle;
get_block_t *get_block;
get_block_t *get_block = ext4_get_block;
int retries = 0;
if (unlikely(IS_IMMUTABLE(inode)))
@@ -6630,47 +6762,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
get_block = ext4_get_block_unwritten;
else
get_block = ext4_get_block;
retry_alloc:
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
/*
* Data journalling can't use block_page_mkwrite() because it
* will set_buffer_dirty() before do_journal_get_write_access()
* thus might hit warning messages for dirty metadata buffers.
*/
if (!ext4_should_journal_data(inode)) {
err = block_page_mkwrite(vma, vmf, get_block);
} else {
folio_lock(folio);
size = i_size_read(inode);
/* Page got truncated from under us? */
if (folio->mapping != mapping || folio_pos(folio) > size) {
ret = VM_FAULT_NOPAGE;
goto out_error;
}
len = folio_size(folio);
if (folio_pos(folio) + len > size)
len = size - folio_pos(folio);
err = ext4_block_write_begin(handle, folio, 0, len,
ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
if (ext4_journal_folio_buffers(handle, folio, len))
goto out_error;
} else {
folio_unlock(folio);
}
}
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
/* Start journal and allocate blocks */
err = ext4_block_page_mkwrite(inode, folio, get_block);
if (err == -EAGAIN ||
(err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_alloc;
out_ret:
ret = vmf_fs_error(err);
@@ -6678,8 +6774,4 @@ out:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
out_error:
folio_unlock(folio);
ext4_journal_stop(handle);
goto out;
}

fs/ext4/mballoc-test.c

@@ -155,6 +155,7 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_es = &fsb->es;
sbi->s_sb = sb;
sb->s_fs_info = sbi;
up_write(&sb->s_umount);
@@ -802,6 +803,8 @@ static void test_mb_mark_used(struct kunit *test)
KUNIT_ASSERT_EQ(test, ret, 0);
grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
grp->bb_largest_free_order = -1;
grp->bb_avg_fragment_size_order = -1;
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
for (i = 0; i < TEST_RANGE_COUNT; i++)
test_mb_mark_used_range(test, &e4b, ranges[i].start,
@@ -875,6 +878,8 @@ static void test_mb_free_blocks(struct kunit *test)
ext4_unlock_group(sb, TEST_GOAL_GROUP);
grp->bb_free = 0;
grp->bb_largest_free_order = -1;
grp->bb_avg_fragment_size_order = -1;
memset(bitmap, 0xff, sb->s_blocksize);
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);

fs/ext4/mballoc.c (file diff suppressed because it is too large)
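Since the mballoc.c diff is suppressed, here is a hedged sketch of the data-structure change the commit titles describe ("convert free groups order lists to xarrays", "implement linear-like traversal across order xarrays"): per-order xarrays, assumed here to be indexed by group number, replace the rwlock-protected lists of groups. The names and shapes below are illustrative, not the actual mballoc code:

    #include <linux/xarray.h>

    /* One xarray per free-extent order; an entry at index g means group g
     * currently has that largest free order. */
    static void demo_move_group(struct xarray *orders, int old_order,
    			    int new_order, unsigned long group, void *grp)
    {
    	if (old_order >= 0)
    		xa_erase(&orders[old_order], group);
    	if (new_order >= 0)
    		/* GFP_ATOMIC: callers may hold the group spinlock */
    		xa_store(&orders[new_order], group, grp, GFP_ATOMIC);
    }

    static void *demo_first_group(struct xarray *orders, int order,
    			      unsigned long start)
    {
    	unsigned long g;
    	void *grp;

    	/* linear-like traversal: first candidate at or after start */
    	xa_for_each_start(&orders[order], g, grp, start)
    		return grp;
    	return NULL;
    }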

fs/ext4/mballoc.h

@@ -192,8 +192,13 @@ struct ext4_allocation_context {
*/
ext4_grpblk_t ac_orig_goal_len;
ext4_group_t ac_prefetch_grp;
unsigned int ac_prefetch_ios;
unsigned int ac_prefetch_nr;
int ac_first_err;
__u32 ac_flags; /* allocation hints */
__u32 ac_groups_linear_remaining;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_cX_found[EXT4_MB_NUM_CRS];
@@ -204,6 +209,8 @@ struct ext4_allocation_context {
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
struct ext4_buddy *ac_e4b;
struct folio *ac_bitmap_folio;
struct folio *ac_buddy_folio;
struct ext4_prealloc_space *ac_pa;

fs/ext4/move_extent.c

@@ -280,7 +280,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
*/
again:
*err = 0;
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page,
block_len_in_page) * 2;
handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);

fs/ext4/namei.c

@@ -2915,33 +2915,50 @@ err_unlock_inode:
return err;
}
struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
struct ext4_dir_entry_2 *de,
int blocksize, int csum_size,
unsigned int parent_ino, int dotdot_real_len)
int ext4_init_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh, unsigned int parent_ino,
void *inline_buf, int inline_size)
{
struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data;
size_t blocksize = bh->b_size;
int csum_size = 0, header_size;
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
blocksize);
strcpy(de->name, ".");
memcpy(de->name, ".", 2);
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(parent_ino);
de->name_len = 2;
if (!dotdot_real_len)
de->rec_len = ext4_rec_len_to_disk(blocksize -
(csum_size + ext4_dir_rec_len(1, NULL)),
blocksize);
else
memcpy(de->name, "..", 3);
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
if (inline_buf) {
de->rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(de->name_len, NULL),
blocksize);
strcpy(de->name, "..");
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
de = ext4_next_entry(de, blocksize);
header_size = (char *)de - bh->b_data;
memcpy((void *)de, inline_buf, inline_size);
ext4_update_final_de(bh->b_data, inline_size + header_size,
blocksize - csum_size);
} else {
de->rec_len = ext4_rec_len_to_disk(blocksize -
(csum_size + ext4_dir_rec_len(1, NULL)),
blocksize);
}
return ext4_next_entry(de, blocksize);
if (csum_size)
ext4_initialize_dirent_tail(bh, blocksize);
BUFFER_TRACE(bh, "call ext4_handle_dirty_dirblock");
set_buffer_uptodate(bh);
set_buffer_verified(bh);
return ext4_handle_dirty_dirblock(handle, inode, bh);
}
int ext4_init_new_dir(handle_t *handle, struct inode *dir,
@@ -2950,13 +2967,8 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
ext4_lblk_t block = 0;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
int err;
if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
err = ext4_try_create_inline_dir(handle, dir, inode);
if (err < 0 && err != -ENOSPC)
@@ -2965,21 +2977,15 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
goto out;
}
set_nlink(inode, 2);
inode->i_size = 0;
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
de = (struct ext4_dir_entry_2 *)dir_block->b_data;
ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
set_nlink(inode, 2);
if (csum_size)
ext4_initialize_dirent_tail(dir_block, blocksize);
BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0);
if (err)
goto out;
set_buffer_verified(dir_block);
out:
brelse(dir_block);
return err;
@@ -3082,7 +3088,8 @@ bool ext4_empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
0) ||
le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 ||
de->name[0] != '.') {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
return false;
@@ -3091,7 +3098,8 @@ bool ext4_empty_dir(struct inode *inode)
de = ext4_next_entry(de, sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
offset) ||
le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
de->name[0] != '.' || de->name[1] != '.') {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
return false;
@@ -3532,7 +3540,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, 0) ||
le32_to_cpu(de->inode) != inode->i_ino ||
strcmp(".", de->name)) {
de->name_len != 1 || de->name[0] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '.'");
brelse(bh);
*retval = -EFSCORRUPTED;
@@ -3543,7 +3551,8 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
de = ext4_next_entry(de, inode->i_sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, offset) ||
le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
de->name[0] != '.' || de->name[1] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '..'");
brelse(bh);
*retval = -EFSCORRUPTED;
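/*
 * Editorial note, not in the patch: the '.' and '..' checks in the two
 * hunks above drop strcmp() in favour of explicit name_len plus byte
 * comparisons; ext4 dirent names are not NUL-terminated, so strcmp()
 * could read past the end of the entry.
 */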

fs/ext4/page-io.c

@@ -236,10 +236,12 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
{
if (io_end->flag & EXT4_IO_END_UNWRITTEN)
if (io_end->flag & EXT4_IO_END_UNWRITTEN &&
!list_empty(&io_end->list_vec))
return true;
if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
io_end->flag & EXT4_IO_END_FAILED)
io_end->flag & EXT4_IO_END_FAILED &&
!ext4_emergency_state(io_end->inode->i_sb))
return true;
return false;
}
@@ -256,6 +258,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
!io_end->handle && sbi->s_journal);
WARN_ON(!io_end->bio);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
@@ -318,12 +321,9 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (refcount_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_FAILED ||
(io_end->flag & EXT4_IO_END_UNWRITTEN &&
!list_empty(&io_end->list_vec))) {
ext4_add_complete_io(io_end);
return;
}
if (ext4_io_end_defer_completion(io_end))
return ext4_add_complete_io(io_end);
ext4_release_io_end(io_end);
}
}

fs/ext4/xattr.c

@@ -338,7 +338,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
cmp = name_len - entry->e_name_len;
if (!cmp)
cmp = memcmp(name, entry->e_name, name_len);
if (cmp <= 0 && (sorted || cmp == 0))
if (!cmp || (cmp < 0 && sorted))
break;
}
*pentry = entry;
@@ -962,7 +962,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
* so we need to reserve credits for this eventuality
*/
if (inode && ext4_has_inline_data(inode))
credits += ext4_writepage_trans_blocks(inode) + 1;
credits += ext4_chunk_trans_extent(inode, 1) + 1;
/* We are done if ea_inode feature is not enabled. */
if (!ext4_has_feature_ea_inode(sb))

include/trace/events/ext4.h

@@ -23,10 +23,7 @@ struct partial_cluster;
#define show_mballoc_flags(flags) __print_flags(flags, "|", \
{ EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \
{ EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \
{ EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \
{ EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \
{ EXT4_MB_HINT_BEST, "HINT_BEST" }, \
{ EXT4_MB_HINT_DATA, "HINT_DATA" }, \
{ EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \
{ EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \
@@ -483,16 +480,17 @@ TRACE_EVENT(ext4_writepages,
(unsigned long) __entry->writeback_index)
);
TRACE_EVENT(ext4_da_write_pages,
TP_PROTO(struct inode *inode, pgoff_t first_page,
TRACE_EVENT(ext4_da_write_folios_start,
TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos,
struct writeback_control *wbc),
TP_ARGS(inode, first_page, wbc),
TP_ARGS(inode, start_pos, next_pos, wbc),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( pgoff_t, first_page )
__field( loff_t, start_pos )
__field( loff_t, next_pos )
__field( long, nr_to_write )
__field( int, sync_mode )
),
@@ -500,18 +498,48 @@ TRACE_EVENT(ext4_da_write_pages,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->first_page = first_page;
__entry->start_pos = start_pos;
__entry->next_pos = next_pos;
__entry->nr_to_write = wbc->nr_to_write;
__entry->sync_mode = wbc->sync_mode;
),
TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
"sync_mode %d",
TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld sync_mode %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino, __entry->first_page,
(unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos,
__entry->nr_to_write, __entry->sync_mode)
);
TRACE_EVENT(ext4_da_write_folios_end,
TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos,
struct writeback_control *wbc, int ret),
TP_ARGS(inode, start_pos, next_pos, wbc, ret),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( loff_t, start_pos )
__field( loff_t, next_pos )
__field( long, nr_to_write )
__field( int, ret )
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->start_pos = start_pos;
__entry->next_pos = next_pos;
__entry->nr_to_write = wbc->nr_to_write;
__entry->ret = ret;
),
TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos,
__entry->nr_to_write, __entry->ret)
);
TRACE_EVENT(ext4_da_write_pages_extent,
TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),