From 5088aad3d32cc0b1c96cbe3e718569ffc0aca4ba Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 23 Apr 2025 12:54:13 -0700 Subject: [PATCH 01/26] xfs: stop using set_blocksize XFS has its own buffer cache for metadata that uses submit_bio, which means that it no longer uses the block device pagecache for anything. Create a more lightweight helper that runs the blocksize checks and flushes dirty data and use that instead. No more truncating the pagecache because XFS does not use it or care about it. Signed-off-by: Darrick J. Wong Reviewed-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1a2b3f06fa71..5ae77ffdc947 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1719,18 +1719,25 @@ xfs_setsize_buftarg( struct xfs_buftarg *btp, unsigned int sectorsize) { + int error; + /* Set up metadata sector size info */ btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; - if (set_blocksize(btp->bt_bdev_file, sectorsize)) { + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %pg", - sectorsize, btp->bt_bdev); + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); return -EINVAL; } - return 0; + /* + * Flush the block device pagecache so our bios see anything dirtied + * before mount. + */ + return sync_blockdev(btp->bt_bdev); } int From bfecc4091e07a47696ac922783216d9e9ea46c97 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Wed, 30 Apr 2025 08:35:34 +0000 Subject: [PATCH 02/26] xfs: allow ro mounts if rtdev or logdev are read-only Allow read-only mounts on rtdevs and logdevs that are marked as read-only and make sure those mounts can't be remounted read-write. Use the sb_open_mode helper to make sure that we don't try to open devices with write access enabled for read-only mounts. 
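For reference, a minimal sketch of what sb_open_mode() resolves to (reproduced from memory of include/linux/fs.h, so treat the exact definition as an assumption rather than part of this patch): read access and exclusive-write protection are always requested, and write access is requested only when the mount is not read-only. /* * Assumed shape of the sb_open_mode() helper: BLK_OPEN_WRITE is only * requested for read-write mounts, which is what lets a read-only * mount open a read-only rtdev or logdev. */ #define sb_open_mode(flags) \ (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) With that mode, a read-only mount never asks the block layer for write access, so opening a read-only logdev or rtdev can succeed.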
Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_super.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b2dd0c0bf509..5e456a6073ca 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -380,10 +380,11 @@ xfs_blkdev_get( struct file **bdev_filep) { int error = 0; + blk_mode_t mode; - *bdev_filep = bdev_file_open_by_path(name, - BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, - mp->m_super, &fs_holder_ops); + mode = sb_open_mode(mp->m_super->s_flags); + *bdev_filep = bdev_file_open_by_path(name, mode, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdev_filep)) { error = PTR_ERR(*bdev_filep); *bdev_filep = NULL; @@ -1969,6 +1970,20 @@ xfs_remount_rw( struct xfs_sb *sbp = &mp->m_sb; int error; + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp && + bdev_read_only(mp->m_logdev_targp->bt_bdev)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only logdev"); + return -EACCES; + } + + if (mp->m_rtdev_targp && + bdev_read_only(mp->m_rtdev_targp->bt_bdev)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only rtdev"); + return -EACCES; + } + if (xfs_has_norecovery(mp)) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); From 23be716b1c4f3f3a6c00ee38d51a57ef7db9ef7d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 1 May 2025 09:27:24 +1000 Subject: [PATCH 03/26] xfs: don't assume perags are initialised when trimming AGs When running fstrim immediately after mounting a V4 filesystem, the fstrim fails to trim all the free space in the filesystem. It only trims the first extent in the by-size free space tree in each AG and then returns. If a second fstrim is then run, it runs correctly and the entire free space in the filesystem is iterated and discarded correctly. The problem lies in the setup of the trim cursor - it assumes that pag->pagf_longest is valid without either reading the AGF first or checking if xfs_perag_initialised_agf(pag) is true or not. As a result, when a filesystem is mounted without reading the AGF (e.g. a clean mount on a v4 filesystem) and the first operation is a fstrim call, pag->pagf_longest is zero and so the free extent search starts at the wrong end of the by-size btree and exits after discarding the first record in the tree. Fix this by deferring the initialisation of tcur->count to after we have locked the AGF and guaranteed that the perag is properly initialised. We trigger this on tcur->count == 0 after locking the AGF, as this will only occur on the first call to xfs_trim_gather_extents() for each AG. If we need to iterate, tcur->count will be set to the length of the record we need to restart at, so we can use this to ensure we only sample a valid pag->pagf_longest value for the iteration. Signed-off-by: Dave Chinner Reviewed-by: Bill O'Donnell Reviewed-by: Darrick J. Wong Fixes: 89cfa899608f ("xfs: reduce AGF hold times during fstrim operations") Cc: # v6.6 Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_discard.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index c1a306268ae4..94d0873bcd62 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -167,6 +167,14 @@ xfs_discard_extents( return error; } +/* + * Care must be taken setting up the trim cursor as the perags may not have been + * initialised when the cursor is initialised. e.g. 
a clean mount which hasn't + * read in AGFs and the first operation run on the mounted fs is a trim. This + * can result in perag fields that aren't initialised until + * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for + * the free space search. + */ struct xfs_trim_cur { xfs_agblock_t start; xfs_extlen_t count; @@ -204,6 +212,14 @@ xfs_trim_gather_extents( if (error) goto out_trans_cancel; + /* + * First time through, tcur->count will not have been initialised as + * pag->pagf_longest is not guaranteed to be valid before we read + * the AGF buffer above. + */ + if (!tcur->count) + tcur->count = pag->pagf_longest; + if (tcur->by_bno) { /* sub-AG discard request always starts at tcur->start */ cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); @@ -350,7 +366,6 @@ xfs_trim_perag_extents( { struct xfs_trim_cur tcur = { .start = start, - .count = pag->pagf_longest, .end = end, .minlen = minlen, }; From 5d894321c49e61379189b0ff605f316e39cbd1e9 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:21 -0700 Subject: [PATCH 04/26] fs: add atomic write unit max opt to statx XFS will be able to support large atomic writes (atomic write > 1x block) in future. This will be achieved by using different operating methods, depending on the size of the write. Specifically, a new method of operation based on FS atomic extent remapping will be supported in addition to the current HW offload-based method. The FS method will generally perform appreciably slower than the HW-offload method. However, the FS method will typically be able to contribute to achieving a larger atomic write unit max limit. XFS will support a hybrid mode, where the HW offload method will be used when possible, i.e. HW offload is used when the length of the write is supported, and otherwise FS-based atomic writes will be used. As such, there is an atomic write length at which the user may experience appreciably slower performance. Advertise this limit in a new statx field, stx_atomic_write_unit_max_opt. When zero, it means that there is no such performance boundary. Masks STATX{_ATTR}_WRITE_ATOMIC can be used to get this new field. This is ok for older kernels which don't support this new field, as they would report 0 in this field (from zeroing in cp_statx()) already. Furthermore, those older kernels don't support large atomic writes - apart from block fops, but there would be consistent performance there for atomic writes in range [unit min, unit max]. Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Acked-by: Darrick J.
Wong Signed-off-by: John Garry --- block/bdev.c | 3 ++- fs/ext4/inode.c | 2 +- fs/stat.c | 6 +++++- fs/xfs/xfs_iops.c | 2 +- include/linux/fs.h | 3 ++- include/linux/stat.h | 1 + include/uapi/linux/stat.h | 8 ++++++-- 7 files changed, 18 insertions(+), 7 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 520515e4e64e..9f321fb94bac 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1336,7 +1336,8 @@ void bdev_statx(struct path *path, struct kstat *stat, generic_fill_statx_atomic_writes(stat, queue_atomic_write_unit_min_bytes(bd_queue), - queue_atomic_write_unit_max_bytes(bd_queue)); + queue_atomic_write_unit_max_bytes(bd_queue), + 0); } stat->blksize = bdev_io_min(bdev); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 94c7d2d828a6..cdf01e60fa6d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5692,7 +5692,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, awu_max = sbi->s_awu_max; } - generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; diff --git a/fs/stat.c b/fs/stat.c index f13308bfdc98..c41855f62d22 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -136,13 +136,15 @@ EXPORT_SYMBOL(generic_fill_statx_attr); * @stat: Where to fill in the attribute flags * @unit_min: Minimum supported atomic write length in bytes * @unit_max: Maximum supported atomic write length in bytes + * @unit_max_opt: Optimised maximum supported atomic write length in bytes * * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from * atomic write unit_min and unit_max values. */ void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, - unsigned int unit_max) + unsigned int unit_max, + unsigned int unit_max_opt) { /* Confirm that the request type is known */ stat->result_mask |= STATX_WRITE_ATOMIC; @@ -153,6 +155,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat, if (unit_min) { stat->atomic_write_unit_min = unit_min; stat->atomic_write_unit_max = unit_max; + stat->atomic_write_unit_max_opt = unit_max_opt; /* Initially only allow 1x segment */ stat->atomic_write_segments_max = 1; @@ -732,6 +735,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max; + tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? 
-EFAULT : 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 756bd3ca8e00..f0e5d83195df 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -610,7 +610,7 @@ xfs_report_atomic_write( if (xfs_inode_can_atomicwrite(ip)) unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize; - generic_fill_statx_atomic_writes(stat, unit_min, unit_max); + generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0); } STATIC int diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..7b19d8f99aff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3475,7 +3475,8 @@ void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, - unsigned int unit_max); + unsigned int unit_max, + unsigned int unit_max_opt); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); diff --git a/include/linux/stat.h b/include/linux/stat.h index be7496a6a0dd..e3d00e7bb26d 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -57,6 +57,7 @@ struct kstat { u32 dio_read_offset_align; u32 atomic_write_unit_min; u32 atomic_write_unit_max; + u32 atomic_write_unit_max_opt; u32 atomic_write_segments_max; }; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index f78ee3670dd5..1686861aae20 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -182,8 +182,12 @@ struct statx { /* File offset alignment for direct I/O reads */ __u32 stx_dio_read_offset_align; - /* 0xb8 */ - __u64 __spare3[9]; /* Spare space for future expansion */ + /* Optimised max atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max_opt; + __u32 __spare2[1]; + + /* 0xc0 */ + __u64 __spare3[8]; /* Spare space for future expansion */ /* 0x100 */ }; From 84270a1a30c964e51af3e489411b4e178c4f153e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 May 2025 14:18:22 -0700 Subject: [PATCH 05/26] xfs: only call xfs_setsize_buftarg once per buffer target It's silly to call xfs_setsize_buftarg from xfs_alloc_buftarg with the block device LBA size because we don't need to ask the block layer to validate a geometry number that it provided us. Instead, set the preliminary bt_meta_sector* fields to the LBA size in preparation for reading the primary super. However, we still want to flush and invalidate the pagecache for all three block devices before we start reading metadata from those devices, so call sync_blockdev() per bdev in xfs_alloc_buftarg(). This will enable a subsequent patch to validate hw atomic write geometry against the filesystem geometry. Signed-off-by: Darrick J. Wong Reviewed-by: John Garry [jpg: call sync_blockdev() from xfs_alloc_buftarg()] Signed-off-by: John Garry Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_buf.c | 28 ++++++++++++++++++---------- fs/xfs/xfs_super.c | 16 ++++++++++++---- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5ae77ffdc947..d8f90bdd2a33 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1733,11 +1733,7 @@ xfs_setsize_buftarg( return -EINVAL; } - /* - * Flush the block device pagecache so our bios see anything dirtied - * before mount. 
- */ - return sync_blockdev(btp->bt_bdev); + return 0; } int @@ -1786,6 +1782,8 @@ xfs_alloc_buftarg( { struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; + int error; + #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; @@ -1806,21 +1804,31 @@ xfs_alloc_buftarg( btp->bt_bdev); } + /* + * Flush and invalidate all devices' pagecaches before reading any + * metadata because XFS doesn't use the bdev pagecache. + */ + error = sync_blockdev(btp->bt_bdev); + if (error) + goto error_free; + /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. */ - if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) - goto error_free; - if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), - mp->m_super->s_id)) + btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; + + error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, + mp->m_super->s_id); + if (error) goto error_free; return btp; error_free: kfree(btp); - return NULL; + return ERR_PTR(error); } static inline void diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5e456a6073ca..6eba90eb7297 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -482,21 +482,29 @@ xfs_open_devices( /* * Setup xfs_mount buffer target pointers */ - error = -ENOMEM; mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); - if (!mp->m_ddev_targp) + if (IS_ERR(mp->m_ddev_targp)) { + error = PTR_ERR(mp->m_ddev_targp); + mp->m_ddev_targp = NULL; goto out_close_rtdev; + } if (rtdev_file) { mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); - if (!mp->m_rtdev_targp) + if (IS_ERR(mp->m_rtdev_targp)) { + error = PTR_ERR(mp->m_rtdev_targp); + mp->m_rtdev_targp = NULL; goto out_free_ddev_targ; + } } if (logdev_file && file_bdev(logdev_file) != ddev) { mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); - if (!mp->m_logdev_targp) + if (IS_ERR(mp->m_logdev_targp)) { + error = PTR_ERR(mp->m_logdev_targp); + mp->m_logdev_targp = NULL; goto out_free_rtdev_targ; + } } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ From 5af9f5508477f088320fdce6c0fd69f960aee319 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:23 -0700 Subject: [PATCH 06/26] xfs: rename xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomic_write() In future we will want to be able to check if specifically HW offload-based atomic writes are possible, so rename xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomicwrite(). Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig [djwong: add an underscore to be consistent with everything else] Signed-off-by: Darrick J. 
Wong Signed-off-by: John Garry --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.h | 4 +--- fs/xfs/xfs_iops.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 84f08c976ac4..55bdae44e42a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1488,7 +1488,7 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; - if (xfs_inode_can_atomicwrite(XFS_I(inode))) + if (xfs_inode_can_hw_atomic_write(XFS_I(inode))) file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index eae0159983ca..d3471a7418b9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -356,9 +356,7 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) -static inline bool -xfs_inode_can_atomicwrite( - struct xfs_inode *ip) +static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f0e5d83195df..22432c300fd7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -608,7 +608,7 @@ xfs_report_atomic_write( { unsigned int unit_min = 0, unit_max = 0; - if (xfs_inode_can_atomicwrite(ip)) + if (xfs_inode_can_hw_atomic_write(ip)) unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize; generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0); } From 13c7c54bd0fa0a7937ce394e8c416c6a70e0a6ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 May 2025 14:18:24 -0700 Subject: [PATCH 07/26] xfs: separate out setting buftarg atomic writes limits Separate out setting buftarg atomic writes limits into a dedicated function, xfs_configure_buftarg_atomic_writes(), to keep the specific functionality self-contained. For naming consistency, rename xfs_setsize_buftarg() -> xfs_configure_buftarg(). Signed-off-by: Darrick J. Wong [jpg: separate out from patch "xfs: ignore HW which ..."] Signed-off-by: John Garry Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_buf.c | 32 ++++++++++++++++++++++++-------- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_super.c | 6 +++--- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d8f90bdd2a33..e2374c503e79 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1714,13 +1714,33 @@ xfs_free_buftarg( kfree(btp); } +/* + * Configure this buffer target for hardware-assisted atomic writes if the + * underlying block device's atomic write support is congruent with the + * filesystem geometry. + */ +static inline void +xfs_configure_buftarg_atomic_writes( + struct xfs_buftarg *btp) +{ + unsigned int min_bytes, max_bytes; + + min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); + max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); + + btp->bt_bdev_awu_min = min_bytes; + btp->bt_bdev_awu_max = max_bytes; +} + +/* Configure a buffer target that abstracts a block device.
*/ int -xfs_setsize_buftarg( +xfs_configure_buftarg( struct xfs_buftarg *btp, unsigned int sectorsize) { int error; + ASSERT(btp->bt_bdev != NULL); + /* Set up metadata sector size info */ btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; @@ -1733,6 +1753,9 @@ xfs_setsize_buftarg( return -EINVAL; } + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); + return 0; } @@ -1797,13 +1820,6 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); - if (bdev_can_atomic_write(btp->bt_bdev)) { - btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( - btp->bt_bdev); - btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( - btp->bt_bdev); - } - /* * Flush and invalidate all devices' pagecaches before reading any * metadata because XFS doesn't use the bdev pagecache. diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d0b065a9a9f0..a7026fb255c4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -374,7 +374,7 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 6eba90eb7297..77a3c003fc4f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -537,7 +537,7 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -546,7 +546,7 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, + error = xfs_configure_buftarg(mp->m_logdev_targp, log_sector_size); if (error) return error; @@ -560,7 +560,7 @@ xfs_setup_devices( } mp->m_rtdev_targp = mp->m_ddev_targp; } else if (mp->m_rtname) { - error = xfs_setsize_buftarg(mp->m_rtdev_targp, + error = xfs_configure_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) return error; From 6d1bdc739140aa9b446182a00a7a4e03e2162861 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 May 2025 14:18:25 -0700 Subject: [PATCH 08/26] xfs: add helpers to compute log item overhead Add selected helpers to estimate the transaction reservation required to write various log intent and buffer items to the log. These helpers will be used by the online repair code for more precise estimations of how much work can be done in a single transaction. Signed-off-by: Darrick J. 
Wong Reviewed-by: John Garry Signed-off-by: John Garry --- fs/xfs/xfs_bmap_item.c | 10 ++++++++++ fs/xfs/xfs_bmap_item.h | 3 +++ fs/xfs/xfs_buf_item.c | 19 +++++++++++++++++++ fs/xfs/xfs_buf_item.h | 3 +++ fs/xfs/xfs_extfree_item.c | 10 ++++++++++ fs/xfs/xfs_extfree_item.h | 3 +++ fs/xfs/xfs_log_cil.c | 4 +--- fs/xfs/xfs_log_priv.h | 13 +++++++++++++ fs/xfs/xfs_refcount_item.c | 10 ++++++++++ fs/xfs/xfs_refcount_item.h | 3 +++ fs/xfs/xfs_rmap_item.c | 10 ++++++++++ fs/xfs/xfs_rmap_item.h | 3 +++ 12 files changed, 88 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 3d52e9d7ad57..646c515ee355 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -77,6 +77,11 @@ xfs_bui_item_size( *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); } +unsigned int xfs_bui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_bui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given bui log item. We use only 1 iovec, and we point that @@ -168,6 +173,11 @@ xfs_bud_item_size( *nbytes += sizeof(struct xfs_bud_log_format); } +unsigned int xfs_bud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_bud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given bud log item. We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 6fee6a508343..b42fee06899d 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -72,4 +72,7 @@ struct xfs_bmap_intent; void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi); +unsigned int xfs_bui_log_space(unsigned int nr); +unsigned int xfs_bud_log_space(void); + #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 19eb0b7a3e58..90139e0f3271 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -103,6 +103,25 @@ xfs_buf_item_size_segment( return; } +/* + * Compute the worst case log item overhead for an invalidated buffer with the + * given map count and block size. + */ +unsigned int +xfs_buf_inval_log_space( + unsigned int map_count, + unsigned int blocksize) +{ + unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK); + unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD); + unsigned int ret = + offsetof(struct xfs_buf_log_format, blf_data_map) + + (bitmap_size * sizeof_field(struct xfs_buf_log_format, + blf_data_map[0])); + + return ret * map_count; +} + /* * Return the number of log iovecs and space needed to log the given buf log * item. 
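To make the worst-case arithmetic of xfs_buf_inval_log_space() above concrete, here is a hedged sketch for a 4096-byte block filesystem; the XFS_BLF_CHUNK (128 bytes) and NBWORD (32 bits) values are their usual definitions, assumed here rather than taken from this patch: /* Sketch: worst-case log space to invalidate 4k buffer maps. */ static unsigned int example_inval_log_space(unsigned int map_count) { unsigned int chunks = DIV_ROUND_UP(4096, 128); /* 32 chunks */ unsigned int bitmap_size = DIV_ROUND_UP(chunks, 32); /* 1 bitmap word */ /* blf header up to blf_data_map plus one 32-bit bitmap word, per map */ return map_count * (offsetof(struct xfs_buf_log_format, blf_data_map) + bitmap_size * sizeof(__u32)); } That is, each invalidated map logs only the small xfs_buf_log_format header and its dirty-chunk bitmap, and the total scales linearly with map_count.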
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 8cde85259a58..e10e324cd245 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,6 +64,9 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); +unsigned int xfs_buf_inval_log_space(unsigned int map_count, + unsigned int blocksize); + extern struct kmem_cache *xfs_buf_item_cache; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 777438b853da..d574f5f639fa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -83,6 +83,11 @@ xfs_efi_item_size( *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } +unsigned int xfs_efi_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efi_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efi log item. We use only 1 iovec, and we point that @@ -254,6 +259,11 @@ xfs_efd_item_size( *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } +unsigned int xfs_efd_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efd_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efd log item. We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 41b7c4306079..c8402040410b 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp, struct xfs_extent_free_item *xefi, struct xfs_defer_pending **dfpp); +unsigned int xfs_efi_log_space(unsigned int nr); +unsigned int xfs_efd_log_space(unsigned int nr); + #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 1ca406ec1b40..f66d2d430e4f 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs( * Then round nbytes up to 64-bit alignment so that the initial * buffer alignment is easy to calculate and verify. */ - nbytes += niovecs * - (sizeof(uint64_t) + sizeof(struct xlog_op_header)); - nbytes = round_up(nbytes, sizeof(uint64_t)); + nbytes = xlog_item_space(niovecs, nbytes); /* * The data buffer needs to start 64-bit aligned, so round up diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f3d78869e5e5..39a102cc1b43 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -698,4 +698,17 @@ xlog_kvmalloc( return p; } +/* + * Given a count of iovecs and space for a log item, compute the space we need + * in the log to store that data plus the log headers. + */ +static inline unsigned int +xlog_item_space( + unsigned int niovecs, + unsigned int nbytes) +{ + nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header)); + return round_up(nbytes, sizeof(uint64_t)); +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index fe2d7aab8554..076501123d89 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -78,6 +78,11 @@ xfs_cui_item_size( *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); } +unsigned int xfs_cui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_cui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given cui log item. 
We use only 1 iovec, and we point that @@ -179,6 +184,11 @@ xfs_cud_item_size( *nbytes += sizeof(struct xfs_cud_log_format); } +unsigned int xfs_cud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_cud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given cud log item. We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index bfee8f30c63c..0fc3f493342b 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -76,4 +76,7 @@ struct xfs_refcount_intent; void xfs_refcount_defer_add(struct xfs_trans *tp, struct xfs_refcount_intent *ri); +unsigned int xfs_cui_log_space(unsigned int nr); +unsigned int xfs_cud_log_space(void); + #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 89decffe76c8..c99700318ec2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -77,6 +77,11 @@ xfs_rui_item_size( *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } +unsigned int xfs_rui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_rui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given rui log item. We use only 1 iovec, and we point that @@ -180,6 +185,11 @@ xfs_rud_item_size( *nbytes += sizeof(struct xfs_rud_log_format); } +unsigned int xfs_rud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_rud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given rud log item. We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 40d331555675..3a99f0117f2d 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -75,4 +75,7 @@ struct xfs_rmap_intent; void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri); +unsigned int xfs_rui_log_space(unsigned int nr); +unsigned int xfs_rud_log_space(void); + #endif /* __XFS_RMAP_ITEM_H__ */ From 805f89881252a9aee30799b8a395deec79c13414 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 May 2025 14:18:25 -0700 Subject: [PATCH 09/26] xfs: add helpers to compute transaction reservation for finishing intent items In the transaction reservation code, hoist the logic that computes the reservation needed to finish one log intent item into separate helper functions. These will be used in subsequent patches to estimate the number of blocks that an online repair can commit to reaping in the same transaction as the change committing the new data structure. Signed-off-by: Darrick J. Wong Reviewed-by: John Garry Signed-off-by: John Garry --- fs/xfs/libxfs/xfs_trans_resv.c | 165 ++++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_trans_resv.h | 18 ++++ 2 files changed, 152 insertions(+), 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 13d00c7166e1..580d00ae2857 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -263,6 +263,42 @@ xfs_rtalloc_block_count( * register overflow from temporaries in the calculations. 
*/ +/* + * Finishing data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* + * Realtime refcount updates (t2): + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + /* * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. @@ -280,19 +316,10 @@ xfs_calc_refcountbt_reservation( struct xfs_mount *mp, unsigned int nr_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); - unsigned int t1, t2 = 0; + unsigned int t1, t2; - if (!xfs_has_reflink(mp)) - return 0; - - t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); - - if (xfs_has_realtime(mp)) - t2 = xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), - blksz); + t1 = xfs_calc_finish_cui_reservation(mp, nr_ops); + t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops); return max(t1, t2); } @@ -379,6 +406,96 @@ xfs_calc_write_reservation_minlogsize( return xfs_calc_write_reservation(mp, true); } +/* + * Finishing an EFI can free the blocks and bmap blocks (t2): + * the agf for each of the ags: nr * sector size + * the agfl for each of the ags: nr * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming nr extents: + * nr exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_realtime(mp)) + return 0; + + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr), + mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out.
+ */ +inline unsigned int +xfs_calc_finish_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + return xfs_calc_finish_efi_reservation(mp, nr); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rt_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rtrmapbt(mp)) + return 0; + return xfs_calc_finish_rt_efi_reservation(mp, nr); +} + +/* + * In finishing a BUI, we can modify: + * the inode being truncated: inode size + * dquots + * the inode's bmap btree: (max depth + 1) * block size + */ +inline unsigned int +xfs_calc_finish_bui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + mp->m_sb.sb_blocksize); +} + /* * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size @@ -411,16 +528,8 @@ xfs_calc_itruncate_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); - t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - - if (xfs_has_realtime(mp)) { - t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - } else { - t3 = 0; - } + t2 = xfs_calc_finish_efi_reservation(mp, 4); + t3 = xfs_calc_finish_rt_efi_reservation(mp, 2); /* * In the early days of reflink, we included enough reservation to log @@ -501,9 +610,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 3); if (xfs_has_parent(mp)) { unsigned int rename_overhead, exchange_overhead; @@ -611,9 +718,7 @@ xfs_calc_link_reservation( overhead += xfs_calc_iunlink_remove_reservation(mp); t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 1); if (xfs_has_parent(mp)) { t3 = resp->tr_attrsetm.tr_logres; @@ -676,9 +781,7 @@ xfs_calc_remove_reservation( t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 2); if (xfs_has_parent(mp)) { t3 = resp->tr_attrrm.tr_logres; diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0554b9d775d2..d9d0032cbbc5 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -98,6 +98,24 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int 
xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); From 85bf2dfa3f1287d349fa1f92b673ca67d13c7784 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 7 May 2025 14:18:26 -0700 Subject: [PATCH 10/26] xfs: ignore HW which cannot atomic write a single block Currently only HW which can write at least 1x block is supported. For supporting atomic writes > 1x block, a CoW-based method will also be used and this will not be restricted to using HW which can write >= 1x block. However, for deciding if HW-based atomic writes can be used, we need to start adding checks for write length < HW min, which complicates the code. Indeed, a statx field similar to unit_max_opt should also be added for this minimum, which is undesirable. HW which can only write > 1x blocks would be uncommon and quite weird, so let's just not support it. Signed-off-by: "Darrick J. Wong" Signed-off-by: John Garry Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_buf.c | 12 ++++++++++++ fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_inode.h | 10 +--------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e2374c503e79..d52d9587b42c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1722,11 +1722,23 @@ static inline void xfs_configure_buftarg_atomic_writes( struct xfs_buftarg *btp) { + struct xfs_mount *mp = btp->bt_mount; unsigned int min_bytes, max_bytes; min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); + /* + * Ignore atomic write geometry that is nonsense or doesn't even cover + * a single fsblock.
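+ * For example, assuming a 4096-byte fsblock: a device advertising + * awu_min = awu_max = 512 cannot atomically write a whole fsblock, so + * both limits are cleared below and the device's atomic write support + * is ignored.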
+ */ + if (min_bytes > max_bytes || + min_bytes > mp->m_sb.sb_blocksize || + max_bytes < mp->m_sb.sb_blocksize) { + min_bytes = 0; + max_bytes = 0; + } + btp->bt_bdev_awu_min = min_bytes; btp->bt_bdev_awu_max = max_bytes; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index a7026fb255c4..9d2ab567cf81 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -112,7 +112,7 @@ struct xfs_buftarg { struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; - /* Atomic write unit values */ + /* Atomic write unit values, bytes */ unsigned int bt_bdev_awu_min; unsigned int bt_bdev_awu_max; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index d3471a7418b9..d7e2b902ef5c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -358,15 +358,7 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - - if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) - return false; - if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) - return false; - - return true; + return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0; } /* From 6baf4cc47a741024d37e6149d5d035d3fc9ed1fe Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:27 -0700 Subject: [PATCH 11/26] xfs: allow block allocator to take an alignment hint Add a BMAPI flag to provide a hint to the block allocator to align extents according to the extszhint. This will be useful for atomic writes, to ensure that we are not allocated extents which are unsuitable for atomic writes. Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: John Garry --- fs/xfs/libxfs/xfs_bmap.c | 5 +++++ fs/xfs/libxfs/xfs_bmap.h | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 63255820b58a..d954f9b8071f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3312,6 +3312,11 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + /* Try to align start block to any minimum allocation alignment */ + if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) + args->alignment = align; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b4d9c6e0f3f9..d5f2729305fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt.
*/ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) From 514df14fae97c91521ef53faf3a386e26afb8bee Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:28 -0700 Subject: [PATCH 12/26] xfs: refactor xfs_reflink_end_cow_extent() Refactor xfs_reflink_end_cow_extent() into separate parts which process the CoW range and commit the transaction. This refactoring will be used in future, when it is required to commit a range of extents as a single transaction, similar to how it was done pre-commit d6f215f359637. Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: John Garry --- fs/xfs/xfs_reflink.c | 72 ++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cc3b4df88110..bd711c5bb6bb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -786,35 +786,19 @@ xfs_reflink_update_quota( * requirements as low as possible. */ STATIC int -xfs_reflink_end_cow_extent( +xfs_reflink_end_cow_extent_locked( + struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *offset_fsb, xfs_fileoff_t end_fsb) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got, del, data; - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); - unsigned int resblks; int nmaps; bool isrt = XFS_IS_REALTIME_INODE(ip); int error; - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - /* - * Lock the inode. We have to ijoin without automatic unlock because - * the lead transaction is the refcountbt record deletion; the data - * fork update follows as a deferred log item. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that @@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } /* @@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_next_extent(ifp, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } } del = got; @@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); if (error) - goto out_cancel; + return error; /* Grab the corresponding mapping in the data fork. */ nmaps = 1; error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, &nmaps, 0); if (error) - goto out_cancel; + return error; /* We can only remap the smaller of the two extent sizes.
*/ data.br_blockcount = min(data.br_blockcount, del.br_blockcount); @@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent( error = xfs_bunmapi(NULL, ip, data.br_startoff, data.br_blockcount, 0, 1, &done); if (error) - goto out_cancel; + return error; ASSERT(done); } @@ -899,17 +883,45 @@ xfs_reflink_end_cow_extent( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - /* Update the caller about how much progress we made. */ *offset_fsb = del.br_startoff + del.br_blockcount; return 0; +} -out_cancel: - xfs_trans_cancel(tp); +/* + * Remap part of the CoW fork into the data fork. + * + * We aim to remap the range starting at @offset_fsb and ending at @end_fsb + * into the data fork; this function will remap what it can (at the end of the + * range) and update @end_fsb appropriately. Each remap gets its own + * transaction because we can end up merging and splitting bmbt blocks for + * every remap operation and we'd like to keep the block reservation + * requirements as low as possible. + */ +STATIC int +xfs_reflink_end_cow_extent( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + int error; + + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } From 0ea88ed47bb1912377975f310da10183e0f6c5bb Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:28 -0700 Subject: [PATCH 13/26] xfs: refine atomic write size check in xfs_file_write_iter() Currently the allowed atomic write size is fixed at the blocksize. To start to lift this restriction, partly refactor xfs_report_atomic_write() into helpers - xfs_get_atomic_write_{min, max}() - and use those helpers to find the per-inode atomic write limits and check against them. Also add xfs_get_atomic_write_max_opt() to return the optimal limit, and just return 0 since large atomics aren't supported yet. Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: John Garry --- fs/xfs/xfs_file.c | 12 +++++------- fs/xfs/xfs_iops.c | 36 +++++++++++++++++++++++++++++++----- fs/xfs/xfs_iops.h | 3 +++ 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 55bdae44e42a..e8acd6ca8f27 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1032,14 +1032,12 @@ xfs_file_write_iter( return xfs_file_dax_write(iocb, from); if (iocb->ki_flags & IOCB_ATOMIC) { - /* - * Currently only atomic writing of a single FS block is - * supported. It would be possible to atomic write smaller than - * a FS block, but there is no requirement to support this. - * Note that iomap also does not support this yet.
- */ - if (ocount != ip->i_mount->m_sb.sb_blocksize) + if (ocount < xfs_get_atomic_write_min(ip)) return -EINVAL; + + if (ocount > xfs_get_atomic_write_max(ip)) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 22432c300fd7..77a0606e9dc9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -601,16 +601,42 @@ xfs_report_dioalign( stat->dio_offset_align = stat->dio_read_offset_align; } +unsigned int +xfs_get_atomic_write_min( + struct xfs_inode *ip) +{ + if (!xfs_inode_can_hw_atomic_write(ip)) + return 0; + + return ip->i_mount->m_sb.sb_blocksize; +} + +unsigned int +xfs_get_atomic_write_max( + struct xfs_inode *ip) +{ + if (!xfs_inode_can_hw_atomic_write(ip)) + return 0; + + return ip->i_mount->m_sb.sb_blocksize; +} + +unsigned int +xfs_get_atomic_write_max_opt( + struct xfs_inode *ip) +{ + return 0; +} + static void xfs_report_atomic_write( struct xfs_inode *ip, struct kstat *stat) { - unsigned int unit_min = 0, unit_max = 0; - - if (xfs_inode_can_hw_atomic_write(ip)) - unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize; - generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0); + generic_fill_statx_atomic_writes(stat, + xfs_get_atomic_write_min(ip), + xfs_get_atomic_write_max(ip), + xfs_get_atomic_write_max_opt(ip)); } STATIC int diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 3c1a2605ffd2..0896f6b8b3b8 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir, extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip); #endif /* __XFS_IOPS_H__ */ From bd1d2c21d5d2496f01b2d1fce07ec2e0da05dd3e Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:29 -0700 Subject: [PATCH 14/26] xfs: add xfs_atomic_write_cow_iomap_begin() For CoW-based atomic writes, reuse the infrastructure for reflink CoW fork support. Add ->iomap_begin() callback xfs_atomic_write_cow_iomap_begin() to create staging mappings in the CoW fork for atomic write updates. The general steps in the function are as follows: - find extent mapping in the CoW fork for the FS block range being written - if part or full extent is found, proceed to process found extent - if no extent found, map in new blocks to the CoW fork - convert unwritten blocks in extent if required - update iomap extent mapping and return The bulk of this function is quite similar to the processing in xfs_reflink_allocate_cow(), where we try to find an extent mapping; if none exists, then allocate a new extent in the CoW fork, convert unwritten blocks, and return a mapping. Performance testing has shown the XFS_ILOCK_EXCL locking to be quite a bottleneck, so this is an area which could be optimised in future. Christoph Hellwig contributed almost all of the code in xfs_atomic_write_cow_iomap_begin(). Reviewed-by: Darrick J. Wong [djwong: add a new xfs_can_sw_atomic_write to convey intent better] Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: John Garry --- fs/xfs/xfs_iomap.c | 128 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iomap.h | 1 + fs/xfs/xfs_mount.h | 5 ++ fs/xfs/xfs_reflink.c | 2 +- fs/xfs/xfs_reflink.h | 2 + fs/xfs/xfs_trace.h | 22 ++++++++ 6 files changed, 159 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index cb23c8871f81..166fba2ff1ef 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1022,6 +1022,134 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { }; #endif /* CONFIG_XFS_RT */ +static int +xfs_atomic_write_cow_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + xfs_filblks_t count_fsb = end_fsb - offset_fsb; + int nmaps = 1; + xfs_filblks_t resaligned; + struct xfs_bmbt_irec cmap; + struct xfs_iext_cursor icur; + struct xfs_trans *tp; + unsigned int dblocks = 0, rblocks = 0; + int error; + u64 seq; + + ASSERT(flags & IOMAP_WRITE); + ASSERT(flags & IOMAP_DIRECT); + + if (xfs_is_shutdown(mp)) + return -EIO; + + if (!xfs_can_sw_atomic_write(mp)) { + ASSERT(xfs_can_sw_atomic_write(mp)); + return -EINVAL; + } + + /* blocks are always allocated in this path */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + trace_xfs_iomap_atomic_write_cow(ip, offset, length); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + goto found; + } + + end_fsb = cmap.br_startoff; + count_fsb = end_fsb - offset_fsb; + + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + xfs_get_cowextsz_hint(ip)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); + if (error) + return error; + + /* extent layout could have changed since the unlock, so check again */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + xfs_trans_cancel(tp); + goto found; + } + + /* + * Allocate the entire reservation as unwritten blocks. + * + * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to + * extszhint, such that there will be a greater chance that future + * atomic writes to that same range will be aligned (and don't require + * this COW-based method). 
+ */ + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; + +found: + if (cmap.br_state != XFS_EXT_NORM) { + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, + count_fsb); + if (error) + goto out_unlock; + cmap.br_state = XFS_EXT_NORM; + } + + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = { + .iomap_begin = xfs_atomic_write_cow_iomap_begin, +}; + static int xfs_dax_write_iomap_end( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index d330c4a581b1..674f8ac1b9bd 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; extern const struct iomap_ops xfs_dax_write_iomap_ops; +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e5192c12e7ac..e67bc3e91f98 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -464,6 +464,11 @@ static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) return !xfs_has_zoned(mp); } +static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp) +{ + return xfs_has_reflink(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. 
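For context: the CoW-based ->iomap_begin path added above sits under the direct I/O path and is reached when userspace submits a write with RWF_ATOMIC. A minimal userspace sketch of such a caller, illustrative only (it assumes a v6.11+ uapi that defines RWF_ATOMIC, and the file path is hypothetical):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/uio.h>
	#include <unistd.h>

	#ifndef RWF_ATOMIC
	#define RWF_ATOMIC	0x00000040	/* mirrors the v6.11 uapi value */
	#endif

	int main(void)
	{
		/* hypothetical test file on an XFS mount */
		int fd = open("/mnt/xfs/testfile", O_WRONLY | O_DIRECT);
		size_t len = 16384;	/* power of 2, within [awu_min, awu_max] */
		struct iovec iov;
		void *buf;
		ssize_t ret;

		if (fd < 0 || posix_memalign(&buf, 4096, len))
			return 1;
		memset(buf, 0xab, len);
		iov.iov_base = buf;
		iov.iov_len = len;

		/* offset and length must be naturally aligned for atomic writes */
		ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
		if (ret < 0)
			perror("pwritev2");
		close(fd);
		return ret == (ssize_t)len ? 0 : 1;
	}

Whether the kernel services this with a single REQ_ATOMIC bio or with the CoW staging path above is invisible to the caller; userspace only sees an all-or-nothing write.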
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index bd711c5bb6bb..f5d338916098 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -293,7 +293,7 @@ xfs_bmap_trim_cow( return xfs_reflink_trim_around_shared(ip, imap, shared); } -static int +int xfs_reflink_convert_cow_locked( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index cc4e92278279..379619f24247 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_convert_cow_locked(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e56ba1963160..9554578c6da4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1657,6 +1657,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); +TRACE_EVENT(xfs_iomap_atomic_write_cow, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_off_t, offset) + __field(ssize_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->offset, + __entry->count) +) + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), From 11ab31909d7cc177fab981cd0d6822a570244599 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:30 -0700 Subject: [PATCH 15/26] xfs: add large atomic writes checks in xfs_direct_write_iomap_begin() For when large atomic writes (> 1x FS block) are supported, there will be various occasions when HW offload may not be possible. Such instances include: - unaligned extent mapping wrt write length - extent mappings which do not cover the full write, e.g. the write spans sparse or mixed-mapping extents - the write length is greater than HW offload can support - no hardware support at all In those cases, we need to fall back to the CoW-based atomic write mode. For this, report special code -ENOPROTOOPT to inform the caller that the HW offload-based method is not possible. In addition to the occasions mentioned, if the write covers an unallocated range, we again judge that we need to rely on the CoW-based method when we would need to allocate anything more than 1x block. This is because if we allocate fewer blocks than are required for the write, then again the HW offload-based method would not be possible. So we are taking a pessimistic approach to writes covering unallocated space. Reviewed-by: Darrick J. Wong [djwong: various cleanups] Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: John Garry --- fs/xfs/xfs_iomap.c | 62 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 166fba2ff1ef..ff05e6b1b0bb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -798,6 +798,38 @@ imap_spans_range( return true; } +static bool +xfs_bmap_hw_atomic_write_possible( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb); + + /* + * atomic writes are required to be naturally aligned for disk blocks, + * which ensures that we adhere to block layer rules that we won't + * straddle any boundary or violate write alignment requirement. + */ + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount)) + return false; + + /* + * Spanning multiple extents would mean that multiple BIOs would be + * issued, and so would lose atomicity required for REQ_ATOMIC-based + * atomics. + */ + if (!imap_spans_range(imap, offset_fsb, end_fsb)) + return false; + + /* + * The ->iomap_begin caller should ensure this, but check anyway. + */ + return len <= xfs_inode_buftarg(ip)->bt_bdev_awu_max; +} + static int xfs_direct_write_iomap_begin( struct inode *inode, @@ -812,9 +844,11 @@ xfs_direct_write_iomap_begin( struct xfs_bmbt_irec imap, cmap; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + xfs_fileoff_t orig_end_fsb = end_fsb; int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; + bool needs_alloc; unsigned int lockmode; u64 seq; @@ -875,13 +909,37 @@ relock: (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; - if (shared) + if (shared) { + if ((flags & IOMAP_ATOMIC) && + !xfs_bmap_hw_atomic_write_possible(ip, &cmap, + offset_fsb, end_fsb)) { + error = -ENOPROTOOPT; + goto out_unlock; + } goto out_found_cow; + } end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if (imap_needs_alloc(inode, flags, &imap, nimaps)) + needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps); + + if (flags & IOMAP_ATOMIC) { + error = -ENOPROTOOPT; + /* + * If we allocate less than what is required for the write + * then we may end up with multiple extents, which means that + * a REQ_ATOMIC-based write cannot be used, so avoid this + * possibility. + */ + if (needs_alloc && orig_end_fsb - offset_fsb > 1) + goto out_unlock; + + if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb, + orig_end_fsb)) + goto out_unlock; + } + + if (needs_alloc) goto allocate_blocks; /* From b1e09178b73adf10dc87fba9aee7787a7ad26874 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:31 -0700 Subject: [PATCH 16/26] xfs: commit CoW-based atomic writes atomically When completing a CoW-based write, each extent range mapping update is covered by a separate transaction. For a CoW-based atomic write, all mappings must be changed at once, so change to use a single transaction. Note that there is a limit on the number of log intent items which can fit into a single transaction, but this is being ignored for now since the count of items for a typical atomic write would be much less than is typically supported. A typical atomic write would be expected to be 64KB or less, which means only 16 possible extent unmaps, which is quite small. Reviewed-by: Darrick J. Wong [djwong: add tr_atomic_ioend] Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: John Garry --- fs/xfs/libxfs/xfs_log_rlimit.c | 4 +++ fs/xfs/libxfs/xfs_trans_resv.c | 15 +++++++++ fs/xfs/libxfs/xfs_trans_resv.h | 1 + fs/xfs/xfs_file.c | 5 ++- fs/xfs/xfs_reflink.c | 56 ++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.h | 2 ++ 6 files changed, 82 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index d3bd6a86c8fe..34bba96d30ca 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks( */ if (xfs_want_minlogsize_fixes(&mp->m_sb)) { xfs_trans_resv_calc(mp, resv); + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; return; } @@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks( xfs_trans_resv_calc(mp, resv); + /* Copy the dynamic transaction reservation types from the running fs */ + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 580d00ae2857..a841432abf83 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -1284,6 +1284,15 @@ xfs_calc_namespace_reservations( resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; } +STATIC void +xfs_calc_default_atomic_ioend_reservation( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* Pick a default that will scale reasonably for the log size. */ + resp->tr_atomic_ioend = resp->tr_itruncate; +} + void xfs_trans_resv_calc( struct xfs_mount *mp, @@ -1378,4 +1387,10 @@ xfs_trans_resv_calc( resp->tr_itruncate.tr_logcount += logcount_adj; resp->tr_write.tr_logcount += logcount_adj; resp->tr_qm_dqalloc.tr_logcount += logcount_adj; + + /* + * Now that we've finished computing the static reservations, we can + * compute the dynamic reservation for atomic writes. + */ + xfs_calc_default_atomic_ioend_reservation(mp, resp); } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index d9d0032cbbc5..670045d417a6 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -48,6 +48,7 @@ struct xfs_trans_resv { struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ + struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */ }; /* shorthand way of accessing reservation structure */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e8acd6ca8f27..32883ec8ca2e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -576,7 +576,10 @@ xfs_dio_write_end_io( nofs_flag = memalloc_nofs_save(); if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); + if (iocb->ki_flags & IOCB_ATOMIC) + error = xfs_reflink_end_atomic_cow(ip, offset, size); + else + error = xfs_reflink_end_cow(ip, offset, size); if (error) goto out; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f5d338916098..218dee76768b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -984,6 +984,62 @@ xfs_reflink_end_cow( return error; } +/* + * Fully remap all of the file's data fork at once, which is the critical part + * in achieving atomic behaviour. + * The regular CoW end path does not use this function, so as to keep the + * block reservation per transaction as low as possible. + */ +int +xfs_reflink_end_atomic_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + /* + * Each remapping operation could cause a btree split, so in the worst + * case that's one for each block. + */ + resblks = (end_fsb - offset_fsb) * + XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + while (end_fsb > offset_fsb && !error) { + error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb, + end_fsb); + } + if (error) { + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + goto out_cancel; + } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + /* * Free all CoW staging blocks that are still referenced by the ondisk refcount * metadata. The ondisk metadata does not track which inode created the diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 379619f24247..412e9b6f2082 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -45,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, From 9baeac3ab1f83689f6105853e7682030b33751fd Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:31 -0700 Subject: [PATCH 17/26] xfs: add xfs_file_dio_write_atomic() Add xfs_file_dio_write_atomic() for dedicated handling of atomic writes. HW offload is no longer required to support atomic writes and is now an optional feature. CoW-based atomic writes will be supported with out-of-place writes and atomic extent remapping. Either mode of operation may be used for an atomic write, depending on the circumstances. The preferred method is HW offload as it will be faster. If HW offload is not available then we always use the CoW-based method. If HW offload is available but not possible to use, then again we use the CoW-based method. Even when available, HW offload is not possible when the write length exceeds the HW offload limit, the write spans multiple extents, the disk blocks are unaligned, etc. Apart from the write exceeding the HW offload limit, the other conditions for HW offload usage can only be detected in the iomap handling for the write. As such, we use a fallback method to issue the write if we detect in the ->iomap_begin() handler that HW offload is not possible. The special code -ENOPROTOOPT is returned from ->iomap_begin() to signal that HW offload is not possible. In other words, atomic writes are supported on any filesystem that can perform out of place write remapping atomically (i.e. reflink) up to some fairly large size. If the conditions are right (a single correctly aligned overwrite mapping) then the filesystem will use any available hardware support to avoid the filesystem metadata updates. Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: John Garry --- fs/xfs/xfs_file.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 32883ec8ca2e..f4a66ff85748 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -728,6 +728,72 @@ xfs_file_dio_write_zoned( return ret; } +/* + * Handle block atomic writes + * + * Two methods of atomic writes are supported: + * - REQ_ATOMIC-based, which would typically use some form of HW offload in the + * disk + * - COW-based, which uses a COW fork as a staging extent for data updates + * before atomically updating extent mappings for the range being written + * + */ +static noinline ssize_t +xfs_file_dio_write_atomic( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + unsigned int iolock = XFS_IOLOCK_SHARED; + ssize_t ret, ocount = iov_iter_count(from); + const struct iomap_ops *dops; + + /* + * HW offload should be faster, so try that first if it is already + * known that the write length is not too large. + */ + if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max) + dops = &xfs_atomic_write_cow_iomap_ops; + else + dops = &xfs_direct_write_iomap_ops; + +retry: + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out_unlock; + + /* Demote similar to xfs_file_dio_write_aligned() */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, + 0, NULL, 0); + + /* + * The retry mechanism is based on the ->iomap_begin method returning + * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not + * possible. The REQ_ATOMIC-based method would typically not be + * possible if the write spans multiple extents or the disk blocks are + * misaligned. + */ + if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { + xfs_iunlock(ip, iolock); + dops = &xfs_atomic_write_cow_iomap_ops; + goto retry; + } + +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); + return ret; +} + /* * Handle block unaligned direct I/O writes * @@ -843,6 +909,8 @@ xfs_file_dio_write( return xfs_file_dio_write_unaligned(ip, iocb, from); if (xfs_is_zoned_inode(ip)) return xfs_file_dio_write_zoned(ip, iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) + return xfs_file_dio_write_atomic(ip, iocb, from); return xfs_file_dio_write_aligned(ip, iocb, from, &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); } From 0c438dcc31504bf4f50b20dc52f8f5ca7fab53e2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:32 -0700 Subject: [PATCH 18/26] xfs: add xfs_calc_atomic_write_unit_max() Now that CoW-based atomic writes are supported, update the max size of an atomic write for the data device. The limit of a CoW-based atomic write will be the limit of the number of log items which can fit into a single transaction. In addition, the max atomic write size needs to be aligned to the agsize.
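To ground that alignment requirement, here is a small sketch (an assumed-equivalent userspace reimplementation, not the kernel helpers) of the greatest power-of-two-factor computation described next, alongside the plain round-down used when no alignment constraint applies:

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	/* greatest power of two that divides nr (nr != 0) */
	static unsigned int max_pow_of_two_factor(unsigned int nr)
	{
		return 1u << (ffs(nr) - 1);
	}

	/* largest power of two <= nr (nr != 0) */
	static unsigned int rounddown_pow_of_two(unsigned int nr)
	{
		unsigned int p = 1;

		while (p <= nr / 2)
			p *= 2;
		return p;
	}

	int main(void)
	{
		/* 220160 fsblocks = 2^10 * 215, so units align at 1024 blocks */
		printf("%u\n", max_pow_of_two_factor(220160));	/* 1024 */
		/* a power-of-two AG size imposes no extra restriction */
		printf("%u\n", max_pow_of_two_factor(262144));	/* 262144 */
		/* without an alignment constraint, only round down */
		printf("%u\n", rounddown_pow_of_two(220160));	/* 131072 */
		return 0;
	}

For example, an AG of 220160 fsblocks only guarantees 1024-block alignment, so an atomic write unit larger than 1024 blocks could straddle an AG boundary.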
Limit the size of atomic writes to the greatest power-of-two factor of the agsize so that allocations for an atomic write will always be aligned compatibly with the alignment requirements of the storage. Function xfs_atomic_write_logitems() is added to find the limit the number of log items which can fit in a single transaction. Amend the max atomic write computation to create a new transaction reservation type, and compute the maximum size of an atomic write completion (in fsblocks) based on this new transaction reservation. Initially, tr_atomic_write is a clone of tr_itruncate, which provides a reasonable level of parallelism. In the next patch, we'll add a mount option so that sysadmins can configure their own limits. [djwong: use a new reservation type for atomic write ioends, refactor group limit calculations] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong [jpg: rounddown power-of-2 always] Reviewed-by: Christoph Hellwig Signed-off-by: John Garry --- fs/xfs/libxfs/xfs_trans_resv.c | 94 ++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_trans_resv.h | 2 + fs/xfs/xfs_mount.c | 83 ++++++++++++++++++++++++++++++ fs/xfs/xfs_mount.h | 6 +++ fs/xfs/xfs_reflink.c | 16 ++++++ fs/xfs/xfs_reflink.h | 2 + fs/xfs/xfs_trace.h | 60 ++++++++++++++++++++++ 7 files changed, 263 insertions(+) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index a841432abf83..e73c09fbd24c 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -22,6 +22,12 @@ #include "xfs_rtbitmap.h" #include "xfs_attr_item.h" #include "xfs_log.h" +#include "xfs_defer.h" +#include "xfs_bmap_item.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -1394,3 +1400,91 @@ xfs_trans_resv_calc( */ xfs_calc_default_atomic_ioend_reservation(mp, resp); } + +/* + * Return the per-extent and fixed transaction reservation sizes needed to + * complete an atomic write. + */ +STATIC unsigned int +xfs_calc_atomic_write_ioend_geometry( + struct xfs_mount *mp, + unsigned int *step_size) +{ + const unsigned int efi = xfs_efi_log_space(1); + const unsigned int efd = xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1); + const unsigned int rud = xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1); + const unsigned int cud = xfs_cud_log_space(); + const unsigned int bui = xfs_bui_log_space(1); + const unsigned int bud = xfs_bud_log_space(); + + /* + * Maximum overhead to complete an atomic write ioend in software: + * remove data fork extent + remove cow fork extent + map extent into + * data fork. + * + * tx0: Creates a BUI and a CUI and that's all it needs. + * + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and + * enough space to relog the CUI (== CUI + CUD). + * + * tx2: Roll again to finish the RUI. Need space for the RUD and space + * to relog the CUI. + * + * tx3: Roll again, need space for the CUD and possibly a new EFI. + * + * tx4: Roll again, need space for an EFD. + * + * If the extent referenced by the pair of BUI/CUI items is not the one + * being currently processed, then we need to reserve space to relog + * both items. 
+ */ + const unsigned int tx0 = bui + cui; + const unsigned int tx1 = bud + rui + cui + cud; + const unsigned int tx2 = rud + cui + cud; + const unsigned int tx3 = cud + efi; + const unsigned int tx4 = efd; + const unsigned int relog = bui + bud + cui + cud; + + const unsigned int per_intent = max(max3(tx0, tx1, tx2), + max3(tx3, tx4, relog)); + + /* Overhead to finish one step of each intent item type */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1); + + /* We only finish one item per transaction in a chain */ + *step_size = max(f4, max3(f1, f2, f3)); + + return per_intent; +} + +/* + * Compute the maximum size (in fsblocks) of atomic writes that we can complete + * given the existing log reservations. + */ +xfs_extlen_t +xfs_calc_max_atomic_write_fsblocks( + struct xfs_mount *mp) +{ + const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend; + unsigned int per_intent = 0; + unsigned int step_size = 0; + unsigned int ret = 0; + + if (resv->tr_logres > 0) { + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, + &step_size); + + if (resv->tr_logres >= step_size) + ret = (resv->tr_logres - step_size) / per_intent; + } + + trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size, + resv->tr_logres, ret); + + return ret; +} diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 670045d417a6..a6d303b83688 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -121,4 +121,6 @@ unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); + #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 00b53f479ece..86089e27b8e7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -666,6 +666,82 @@ xfs_agbtree_compute_maxlevels( mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } +/* Maximum atomic write IO size that the kernel allows. */ +static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) +{ + return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); +} + +static inline unsigned int max_pow_of_two_factor(const unsigned int nr) +{ + return 1 << (ffs(nr) - 1); +} + +/* + * If the data device advertises atomic write support, limit the size of data + * device atomic writes to the greatest power-of-two factor of the AG size so + * that every atomic write unit aligns with the start of every AG. This is + * required so that the per-AG allocations for an atomic write will always be + * aligned compatibly with the alignment requirements of the storage. + * + * If the data device doesn't advertise atomic writes, then there are no + * alignment restrictions and the largest out-of-place write we can do + * ourselves is the number of blocks that user files can allocate from any AG. + */ +static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp) +{ + if (mp->m_ddev_targp->bt_bdev_awu_min > 0) + return max_pow_of_two_factor(mp->m_sb.sb_agblocks); + return rounddown_pow_of_two(mp->m_ag_max_usable); +} + +/* + * Reflink on the realtime device requires rtgroups, and atomic writes require + * reflink. 
+ * + * If the realtime device advertises atomic write support, limit the size of + * data device atomic writes to the greatest power-of-two factor of the rtgroup + * size so that every atomic write unit aligns with the start of every rtgroup. + * This is required so that the per-rtgroup allocations for an atomic write + * will always be aligned compatibly with the alignment requirements of the + * storage. + * + * If the rt device doesn't advertise atomic writes, then there are no + * alignment restrictions and the largest out-of-place write we can do + * ourselves is the number of blocks that user files can allocate from any + * rtgroup. + */ +static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp) +{ + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + if (rgs->blocks == 0) + return 0; + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev_awu_min > 0) + return max_pow_of_two_factor(rgs->blocks); + return rounddown_pow_of_two(rgs->blocks); +} + +/* Compute the maximum atomic write unit size for each section. */ +static inline void +xfs_calc_atomic_write_unit_max( + struct xfs_mount *mp) +{ + struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp); + const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp); + const xfs_extlen_t max_rgsize = xfs_calc_rtgroup_awu_max(mp); + + ags->awu_max = min3(max_write, max_ioend, max_agsize); + rgs->awu_max = min3(max_write, max_ioend, max_rgsize); + + trace_xfs_calc_atomic_write_unit_max(mp, max_write, max_ioend, + max_agsize, max_rgsize); +} + /* Compute maximum possible height for realtime btree types for this fs. */ static inline void xfs_rtbtree_compute_maxlevels( @@ -1082,6 +1158,13 @@ xfs_mountfs( xfs_zone_gc_start(mp); } + /* + * Pre-calculate atomic write unit max. This involves computations + * derived from transaction reservations, so we must do this after the + * log is fully initialized. + */ + xfs_calc_atomic_write_unit_max(mp); + return 0; out_agresv: diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e67bc3e91f98..e2abf31438e0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -119,6 +119,12 @@ struct xfs_groups { * SMR hard drives. */ xfs_fsblock_t start_fsb; + + /* + * Maximum length of an atomic write for files stored in this + * collection of allocation groups, in fsblocks. + */ + xfs_extlen_t awu_max; }; struct xfs_freecounter { diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 218dee76768b..ad3bcb76d805 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1040,6 +1040,22 @@ out_cancel: return error; } +/* Compute the largest atomic write that we can complete through software. */ +xfs_extlen_t +xfs_reflink_max_atomic_cow( + struct xfs_mount *mp) +{ + /* We cannot do any atomic writes without out of place writes. */ + if (!xfs_can_sw_atomic_write(mp)) + return 0; + + /* + * Atomic write limits must always be a power-of-2, according to + * generic_atomic_write_valid. + */ + return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp)); +} + /* * Free all CoW staging blocks that are still referenced by the ondisk refcount * metadata. 
The ondisk metadata does not track which inode created the diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 412e9b6f2082..36cda724da89 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -68,4 +68,6 @@ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); +xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp); + #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9554578c6da4..d5ae00f8e04c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -170,6 +170,66 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); +TRACE_EVENT(xfs_calc_atomic_write_unit_max, + TP_PROTO(struct xfs_mount *mp, unsigned int max_write, + unsigned int max_ioend, unsigned int max_agsize, + unsigned int max_rgsize), + TP_ARGS(mp, max_write, max_ioend, max_agsize, max_rgsize), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, max_write) + __field(unsigned int, max_ioend) + __field(unsigned int, max_agsize) + __field(unsigned int, max_rgsize) + __field(unsigned int, data_awu_max) + __field(unsigned int, rt_awu_max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->max_write = max_write; + __entry->max_ioend = max_ioend; + __entry->max_agsize = max_agsize; + __entry->max_rgsize = max_rgsize; + __entry->data_awu_max = mp->m_groups[XG_TYPE_AG].awu_max; + __entry->rt_awu_max = mp->m_groups[XG_TYPE_RTG].awu_max; + ), + TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u max_rgsize %u data_awu_max %u rt_awu_max %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->max_write, + __entry->max_ioend, + __entry->max_agsize, + __entry->max_rgsize, + __entry->data_awu_max, + __entry->rt_awu_max) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int logres, + unsigned int blockcount), + TP_ARGS(mp, per_intent, step_size, logres, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, logres) + __field(unsigned int, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->logres = logres; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->logres, + __entry->blockcount) +); + TRACE_EVENT(xlog_intent_recovery_failed, TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, int error), From 9dffc58f23849783ab2f6c2c9f5af9b94a42666f Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:33 -0700 Subject: [PATCH 19/26] xfs: update atomic write limits Update the limits returned from xfs_get_atomic_write_{min, max, max_opt}(). No reflink support always means no CoW-based atomic writes. For updating xfs_get_atomic_write_min(), we support blocksize only and that depends on HW or reflink support. For updating xfs_get_atomic_write_max(), for no reflink, we are limited to blocksize but only if there is HW support. Otherwise we are limited to the combined limit in mp->m_atomic_write_unit_max.
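These min/max values reach userspace through statx(). A minimal query sketch, illustrative only (it assumes uapi headers new enough to define STATX_WRITE_ATOMIC and the stx_atomic_write_* fields, and a hypothetical path):

	#include <fcntl.h>
	#include <linux/stat.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct statx stx = { 0 };

		/* raw syscall so an older libc wrapper does not get in the way */
		if (syscall(SYS_statx, AT_FDCWD, "/mnt/xfs/testfile", 0,
			    STATX_WRITE_ATOMIC, &stx) < 0) {
			perror("statx");
			return 1;
		}
		printf("awu_min %u awu_max %u segments_max %u\n",
		       stx.stx_atomic_write_unit_min,
		       stx.stx_atomic_write_unit_max,
		       stx.stx_atomic_write_segments_max);
		return 0;
	}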
For updating xfs_get_atomic_write_max_opt(), ultimately we are limited by the bdev atomic write limit. If xfs_get_atomic_write_max() does not report > 1x blocksize, then just continue to report 0 as before. Reviewed-by: Darrick J. Wong [djwong: update comments in the helper functions] Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: John Garry --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_iops.c | 54 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f4a66ff85748..48254a72071b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1557,7 +1557,7 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; - if (xfs_inode_can_hw_atomic_write(XFS_I(inode))) + if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 77a0606e9dc9..8cddbb7c149b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -605,27 +605,67 @@ unsigned int xfs_get_atomic_write_min( struct xfs_inode *ip) { - if (!xfs_inode_can_hw_atomic_write(ip)) - return 0; + struct xfs_mount *mp = ip->i_mount; - return ip->i_mount->m_sb.sb_blocksize; + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. + * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp)) + return mp->m_sb.sb_blocksize; + + return 0; } unsigned int xfs_get_atomic_write_max( struct xfs_inode *ip) { - if (!xfs_inode_can_hw_atomic_write(ip)) - return 0; + struct xfs_mount *mp = ip->i_mount; - return ip->i_mount->m_sb.sb_blocksize; + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_can_sw_atomic_write(mp)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); } unsigned int xfs_get_atomic_write_max_opt( struct xfs_inode *ip) { - return 0; + unsigned int awu_max = xfs_get_atomic_write_max(ip); + + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; + + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. + */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max); } static void From 4528b9052731f14c1a9be16b98e33c9401e6d1bc Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 7 May 2025 14:18:34 -0700 Subject: [PATCH 20/26] xfs: allow sysadmins to specify a maximum atomic write limit at mount time Introduce a mount option to allow sysadmins to specify the maximum size of an atomic write. If the filesystem can work with the supplied value, that becomes the new guaranteed maximum. The value mustn't be too big for the existing filesystem geometry (max write size, max AG/rtgroup size). We dynamically recompute the tr_atomic_write transaction reservation based on the given block size, check that the current log size isn't less than the new minimum log size constraints, and set a new maximum. The actual software atomic write max is still computed based off of tr_atomic_ioend the same way it has for the past few commits. Note also that xfs_calc_atomic_write_log_geometry is non-static because mkfs will need that. Signed-off-by: Darrick J. Wong Signed-off-by: John Garry Reviewed-by: John Garry --- Documentation/admin-guide/xfs.rst | 11 +++++ fs/xfs/libxfs/xfs_trans_resv.c | 69 ++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_trans_resv.h | 4 ++ fs/xfs/xfs_mount.c | 80 ++++++++++++++++++++++++++++++- fs/xfs/xfs_mount.h | 6 +++ fs/xfs/xfs_super.c | 58 +++++++++++++++++++++- fs/xfs/xfs_trace.h | 33 +++++++++++++ 7 files changed, 259 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 5becb441c3cb..a18328a5fb93 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -151,6 +151,17 @@ When mounting an XFS filesystem, the following options are accepted. optional, and the log section can be separate from the data section or contained within it. + max_atomic_write=value + Set the maximum size of an atomic write. The size may be + specified in bytes, in kilobytes with a "k" suffix, in megabytes + with a "m" suffix, or in gigabytes with a "g" suffix. The size + cannot be larger than the maximum write size, larger than the + size of any allocation group, or larger than the size of a + remapping operation that the log can complete atomically. + + The default value is to set the maximum I/O completion size + to allow each CPU to handle one at a time. + max_open_zones=value Specify the max number of zones to keep open for writing on a zoned rt device. Many open zones aids file data separation diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index e73c09fbd24c..86a111d0f2fc 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -1488,3 +1488,72 @@ xfs_calc_max_atomic_write_fsblocks( return ret; } + +/* + * Compute the log blocks and transaction reservation needed to complete an + * atomic write of a given number of blocks. Worst case, each block requires + * separate handling. A return value of 0 means something went wrong. 
+ */ +xfs_extlen_t +xfs_calc_atomic_write_log_geometry( + struct xfs_mount *mp, + xfs_extlen_t blockcount, + unsigned int *new_logres) +{ + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend; + uint old_logres = curr_res->tr_logres; + unsigned int per_intent, step_size; + unsigned int logres; + xfs_extlen_t min_logblocks; + + ASSERT(blockcount > 0); + + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size); + + /* Check for overflows */ + if (check_mul_overflow(blockcount, per_intent, &logres) || + check_add_overflow(logres, step_size, &logres)) + return 0; + + curr_res->tr_logres = logres; + min_logblocks = xfs_log_calc_minimum_size(mp); + curr_res->tr_logres = old_logres; + + trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size, + blockcount, min_logblocks, logres); + + *new_logres = logres; + return min_logblocks; +} + +/* + * Compute the transaction reservation needed to complete an out of place + * atomic write of a given number of blocks. + */ +int +xfs_calc_atomic_write_reservation( + struct xfs_mount *mp, + xfs_extlen_t blockcount) +{ + unsigned int new_logres; + xfs_extlen_t min_logblocks; + + /* + * If the caller doesn't ask for a specific atomic write size, then + * use the defaults. + */ + if (blockcount == 0) { + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + return 0; + } + + min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount, + &new_logres); + if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks) + return -EINVAL; + + M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index a6d303b83688..336279e0fc61 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -122,5 +122,9 @@ unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp, + xfs_extlen_t blockcount, unsigned int *new_logres); +int xfs_calc_atomic_write_reservation(struct xfs_mount *mp, + xfs_extlen_t blockcount); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 86089e27b8e7..29276fe60df9 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -742,6 +742,82 @@ xfs_calc_atomic_write_unit_max( max_agsize, max_rgsize); } +/* + * Try to set the atomic write maximum to a new value that we got from + * userspace via mount option. 
+ */ +int +xfs_set_max_atomic_write_opt( + struct xfs_mount *mp, + unsigned long long new_max_bytes) +{ + const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes); + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_group = + max(mp->m_groups[XG_TYPE_AG].blocks, + mp->m_groups[XG_TYPE_RTG].blocks); + const xfs_extlen_t max_group_write = + max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp)); + int error; + + if (new_max_bytes == 0) + goto set_limit; + + ASSERT(max_write <= U32_MAX); + + /* generic_atomic_write_valid enforces power of two length */ + if (!is_power_of_2(new_max_bytes)) { + xfs_warn(mp, + "max atomic write size of %llu bytes is not a power of 2", + new_max_bytes); + return -EINVAL; + } + + if (new_max_bytes & mp->m_blockmask) { + xfs_warn(mp, + "max atomic write size of %llu bytes not aligned with fsblock", + new_max_bytes); + return -EINVAL; + } + + if (new_max_fsbs > max_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_write) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than allocation group size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group_write) >> 10); + return -EINVAL; + } + +set_limit: + error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); + if (error) { + xfs_warn(mp, + "cannot support completing atomic writes of %lluk", + new_max_bytes >> 10); + return error; + } + + xfs_calc_atomic_write_unit_max(mp); + mp->m_awu_max_bytes = new_max_bytes; + return 0; +} + /* Compute maximum possible height for realtime btree types for this fs. */ static inline void xfs_rtbtree_compute_maxlevels( @@ -1163,7 +1239,9 @@ xfs_mountfs( * derived from transaction reservations, so we must do this after the * log is fully initialized. */ - xfs_calc_atomic_write_unit_max(mp); + error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes); + if (error) + goto out_agresv; return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e2abf31438e0..5b5df70570c0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -237,6 +237,9 @@ typedef struct xfs_mount { unsigned int m_max_open_zones; unsigned int m_zonegc_low_space; + /* max_atomic_write mount option value */ + unsigned long long m_awu_max_bytes; + /* * Bitsets of per-fs metadata that have been checked and/or are sick. * Callers must hold m_sb_lock to access these two fields. 
@@ -804,4 +807,7 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) percpu_counter_add(&mp->m_delalloc_blks, delta); } +int xfs_set_max_atomic_write_opt(struct xfs_mount *mp, + unsigned long long new_max_bytes); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 77a3c003fc4f..8e3ae1749855 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -111,7 +111,7 @@ enum { Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, - Opt_lifetime, Opt_nolifetime, + Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_u32("max_open_zones", Opt_max_open_zones), fsparam_flag("lifetime", Opt_lifetime), fsparam_flag("nolifetime", Opt_nolifetime), + fsparam_string("max_atomic_write", Opt_max_atomic_write), {} }; @@ -241,6 +242,9 @@ xfs_fs_show_options( if (mp->m_max_open_zones) seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + if (mp->m_awu_max_bytes) + seq_printf(m, ",max_atomic_write=%lluk", + mp->m_awu_max_bytes >> 10); return 0; } @@ -1343,6 +1347,42 @@ suffix_kstrtoint( return ret; } +static int +suffix_kstrtoull( + const char *s, + unsigned int base, + unsigned long long *res) +{ + int last, shift_left_factor = 0; + unsigned long long _res; + char *value; + int ret = 0; + + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoull(value, base, &_res)) + ret = -EINVAL; + kfree(value); + *res = _res << shift_left_factor; + return ret; +} + static inline void xfs_fs_warn_deprecated( struct fs_context *fc, @@ -1527,6 +1567,14 @@ xfs_fs_parse_param( case Opt_nolifetime: parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; return 0; + case Opt_max_atomic_write: + if (suffix_kstrtoull(param->string, 10, + &parsing_mp->m_awu_max_bytes)) { + xfs_warn(parsing_mp, + "max atomic write size must be positive integer"); + return -EINVAL; + } + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -2137,6 +2185,14 @@ xfs_fs_reconfigure( if (error) return error; + /* Validate new max_atomic_write option before making other changes */ + if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { + error = xfs_set_max_atomic_write_opt(mp, + new_mp->m_awu_max_bytes); + if (error) + return error; + } + /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d5ae00f8e04c..01d284a1c759 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -230,6 +230,39 @@ TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, __entry->blockcount) ); +TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int blockcount, + unsigned int min_logblocks, unsigned int logres), + TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres), + TP_STRUCT__entry( + 
__field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, blockcount) + __field(unsigned int, min_logblocks) + __field(unsigned int, cur_logblocks) + __field(unsigned int, logres) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->blockcount = blockcount; + __entry->min_logblocks = min_logblocks; + __entry->cur_logblocks = mp->m_sb.sb_logblocks; + __entry->logres = logres; + ), + TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->blockcount, + __entry->min_logblocks, + __entry->cur_logblocks, + __entry->logres) +); + TRACE_EVENT(xlog_intent_recovery_failed, TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, int error), From ea31bdece29ac72ebe409cf2bb411abe6e31431a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 23 Apr 2025 12:54:13 -0700 Subject: [PATCH 21/26] xfs: stop using set_blocksize XFS has its own buffer cache for metadata that uses submit_bio, which means that it no longer uses the block device pagecache for anything. Create a more lightweight helper that runs the blocksize checks and flushes dirty data and use that instead. No more truncating the pagecache because XFS does not use it or care about it. Signed-off-by: Darrick J. Wong Reviewed-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1a2b3f06fa71..5ae77ffdc947 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1719,18 +1719,25 @@ xfs_setsize_buftarg( struct xfs_buftarg *btp, unsigned int sectorsize) { + int error; + /* Set up metadata sector size info */ btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; - if (set_blocksize(btp->bt_bdev_file, sectorsize)) { + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %pg", - sectorsize, btp->bt_bdev); + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); return -EINVAL; } - return 0; + /* + * Flush the block device pagecache so our bios see anything dirtied + * before mount. + */ + return sync_blockdev(btp->bt_bdev); } int From c0a5c4084709a78117c1c372aa4f813e1a98c313 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Tue, 6 May 2025 09:15:40 +0800 Subject: [PATCH 22/26] xfs: Remove deprecated xfs_bufd sysctl parameters Commit 64af7a6ea5a4 ("xfs: remove deprecated sysctls") removed the deprecated xfsbufd-related sysctl interface, but forgot to delete the corresponding parameters: "xfs_buf_timer" and "xfs_buf_age". This patch removes those parameters and makes no other changes. Signed-off-by: Zizhi Wo Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_globals.c | 2 -- fs/xfs/xfs_sysctl.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f18fec0adf66..f6f628c01feb 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -23,8 +23,6 @@ xfs_param_t xfs_params = { .inherit_sync = { 0, 1, 1 }, .inherit_nodump = { 0, 1, 1 }, .inherit_noatim = { 0, 1, 1 }, - .xfs_buf_timer = { 100/2, 1*100, 30*100 }, - .xfs_buf_age = { 1*100, 15*100, 7200*100}, .inherit_nosym = { 0, 0, 1 }, .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 276696a07040..51646f066c4f 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -29,8 +29,6 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ - xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ - xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ From ca43b74ac3040ae13be854e6a71ebd7a91e5fcfc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 13 May 2025 07:35:29 -0700 Subject: [PATCH 23/26] xfs: remove some EXPERIMENTAL warnings Online fsck was finished a year ago, in Linux 6.10. The exchange-range syscall and parent pointers were merged in the same cycle. None of these have encountered any serious errors in the year that they've been in the kernel (or the many many years they've been under development) so let's drop the shouty warnings. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/scrub.c | 2 -- fs/xfs/xfs_message.c | 12 ------------ fs/xfs/xfs_message.h | 3 --- fs/xfs/xfs_mount.h | 7 ------- fs/xfs/xfs_super.c | 7 ------- 5 files changed, 31 deletions(-) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9908850bf76f..76e24032e99a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -680,8 +680,6 @@ xfs_scrub_metadata( if (error) goto out; - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB); - sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 15d410d16bb2..54fc5ada519c 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -145,10 +145,6 @@ xfs_warn_experimental( .opstate = XFS_OPSTATE_WARNED_PNFS, .name = "pNFS", }, - [XFS_EXPERIMENTAL_SCRUB] = { - .opstate = XFS_OPSTATE_WARNED_SCRUB, - .name = "online scrub", - }, [XFS_EXPERIMENTAL_SHRINK] = { .opstate = XFS_OPSTATE_WARNED_SHRINK, .name = "online shrink", @@ -161,14 +157,6 @@ xfs_warn_experimental( .opstate = XFS_OPSTATE_WARNED_LBS, .name = "large block size", }, - [XFS_EXPERIMENTAL_EXCHRANGE] = { - .opstate = XFS_OPSTATE_WARNED_EXCHRANGE, - .name = "exchange range", - }, - [XFS_EXPERIMENTAL_PPTR] = { - .opstate = XFS_OPSTATE_WARNED_PPTR, - .name = "parent pointer", - }, [XFS_EXPERIMENTAL_METADIR] = { .opstate = XFS_OPSTATE_WARNED_METADIR, .name = "metadata directory tree", diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index a92a4d09c8e9..bce9942f394a 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -92,12 +92,9 @@ void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, enum xfs_experimental_feat { XFS_EXPERIMENTAL_PNFS, - XFS_EXPERIMENTAL_SCRUB, XFS_EXPERIMENTAL_SHRINK, XFS_EXPERIMENTAL_LARP, XFS_EXPERIMENTAL_LBS, - XFS_EXPERIMENTAL_EXCHRANGE, - XFS_EXPERIMENTAL_PPTR, XFS_EXPERIMENTAL_METADIR, XFS_EXPERIMENTAL_ZONED, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 5b5df70570c0..168b4d340cfc 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -559,8 +559,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID) /* Kernel has logged a warning about pNFS being used on this fs. */ #define XFS_OPSTATE_WARNED_PNFS 7 -/* Kernel has logged a warning about online fsck being used on this fs. */ -#define XFS_OPSTATE_WARNED_SCRUB 8 /* Kernel has logged a warning about shrink being used on this fs. */ #define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ @@ -573,10 +571,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_USE_LARP 13 /* Kernel has logged a warning about blocksize > pagesize on this fs. */ #define XFS_OPSTATE_WARNED_LBS 14 -/* Kernel has logged a warning about exchange-range being used on this fs. */ -#define XFS_OPSTATE_WARNED_EXCHRANGE 15 -/* Kernel has logged a warning about parent pointers being used on this fs. */ -#define XFS_OPSTATE_WARNED_PPTR 16 /* Kernel has logged a warning about metadata dirs being used on this fs. 
*/ #define XFS_OPSTATE_WARNED_METADIR 17 /* Filesystem should use qflags to determine quotaon status */ @@ -645,7 +639,6 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ - { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8e3ae1749855..5318ab438054 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1954,13 +1954,6 @@ xfs_fs_fill_super( } } - - if (xfs_has_exchange_range(mp)) - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE); - - if (xfs_has_parent(mp)) - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR); - /* * If no quota mount options were provided, maybe we'll try to pick * up the quota accounting and enforcement flags from the ondisk sb. From 1c7161ef0164716fdf4618b50747bd3002625e38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 May 2025 06:44:20 +0200 Subject: [PATCH 24/26] xfs: remove the EXPERIMENTAL warning for pNFS The pNFS layout support has been around for 10 years without major issues, so drop the EXPERIMENTAL warning. Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_message.c | 4 ---- fs/xfs/xfs_message.h | 1 - fs/xfs/xfs_mount.h | 2 -- fs/xfs/xfs_pnfs.c | 2 -- 4 files changed, 9 deletions(-) diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 54fc5ada519c..19aba2c3d525 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -141,10 +141,6 @@ xfs_warn_experimental( const char *name; long opstate; } features[] = { - [XFS_EXPERIMENTAL_PNFS] = { - .opstate = XFS_OPSTATE_WARNED_PNFS, - .name = "pNFS", - }, [XFS_EXPERIMENTAL_SHRINK] = { .opstate = XFS_OPSTATE_WARNED_SHRINK, .name = "online shrink", diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index bce9942f394a..d68e72379f9d 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -91,7 +91,6 @@ void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, const char *fmt, ...); enum xfs_experimental_feat { - XFS_EXPERIMENTAL_PNFS, XFS_EXPERIMENTAL_SHRINK, XFS_EXPERIMENTAL_LARP, XFS_EXPERIMENTAL_LBS, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 168b4d340cfc..89965b7ac29c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -557,8 +557,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 -/* Kernel has logged a warning about pNFS being used on this fs. */ -#define XFS_OPSTATE_WARNED_PNFS 7 /* Kernel has logged a warning about shrink being used on this fs. */ #define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 6f4479deac6d..afe7497012d4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,8 +58,6 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS); - if (*len < sizeof(uuid_t)) return -EINVAL; From 70b95cb86513d7f6d084ddc8e961a1cab9022e14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 May 2025 10:50:37 +0000 Subject: [PATCH 25/26] xfs: free the item in xfs_mru_cache_insert on failure Call the provided free_func when xfs_mru_cache_insert fails, as that's what the callers need to do anyway.
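The ownership rule this establishes is that the callee disposes of the element on failure, so callers need no unwind path. A miniature sketch of that contract (illustrative C, not the kernel code):

	#include <stdio.h>
	#include <stdlib.h>

	struct elem {
		long key;
	};

	static void free_elem(struct elem *e)
	{
		free(e);
	}

	/* Consumes @e on failure, mirroring the new xfs_mru_cache_insert rule. */
	static int cache_insert(struct elem *e)
	{
		if (e->key < 0) {	/* stand-in for a radix tree insert failure */
			free_elem(e);
			return -1;
		}
		/* ... link @e into the cache here ... */
		return 0;
	}

	int main(void)
	{
		struct elem *e = calloc(1, sizeof(*e));

		if (!e)
			return 1;
		e->key = -1;
		/* the caller no longer needs an out_free_item label */
		if (cache_insert(e))
			fprintf(stderr, "insert failed, elem already freed\n");
		return 0;
	}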
Signed-off-by: Christoph Hellwig Signed-off-by: Hans Holmberg Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_filestream.c | 15 ++++----------- fs/xfs/xfs_mru_cache.c | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a961aa420c48..044918fbae06 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -304,11 +304,9 @@ xfs_filestream_create_association( * for us, so all we need to do here is take another active reference to * the perag for the cached association. * - * If we fail to store the association, we need to drop the fstrms - * counter as well as drop the perag reference we take here for the - * item. We do not need to return an error for this failure - as long as - * we return a referenced AG, the allocation can still go ahead just - * fine. + * If we fail to store the association, we do not need to return an + * error for this failure - as long as we return a referenced AG, the + * allocation can still go ahead just fine. */ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!item) @@ -316,14 +314,9 @@ xfs_filestream_create_association( atomic_inc(&pag_group(args->pag)->xg_active_ref); item->pag = args->pag; - error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); - if (error) - goto out_free_item; + xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); return 0; -out_free_item: - xfs_perag_rele(item->pag); - kfree(item); out_put_fstrms: atomic_dec(&args->pag->pagf_fstrms); return 0; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index d0f5b403bdbe..08443ceec329 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -414,6 +414,8 @@ xfs_mru_cache_destroy( * To insert an element, call xfs_mru_cache_insert() with the data store, the * element's key and the client data pointer. This function returns 0 on * success or ENOMEM if memory for the data element couldn't be allocated. + * + * The passed in elem is freed through the per-cache free_func on failure. */ int xfs_mru_cache_insert( @@ -421,14 +423,15 @@ xfs_mru_cache_insert( unsigned long key, struct xfs_mru_cache_elem *elem) { - int error; + int error = -EINVAL; ASSERT(mru && mru->lists); if (!mru || !mru->lists) - return -EINVAL; + goto out_free; + error = -ENOMEM; if (radix_tree_preload(GFP_KERNEL)) - return -ENOMEM; + goto out_free; INIT_LIST_HEAD(&elem->list_node); elem->key = key; @@ -440,6 +443,12 @@ xfs_mru_cache_insert( _xfs_mru_cache_list_insert(mru, elem); spin_unlock(&mru->lock); + if (error) + goto out_free; + return 0; + +out_free: + mru->free_func(mru->data, elem); return error; } From f3e2e53823b98d55da46ea72d32ac18bd7709c33 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Wed, 14 May 2025 10:50:37 +0000 Subject: [PATCH 26/26] xfs: add inode to zone caching for data placement Placing data from the same file in the same zone is a great heuristic for reducing write amplification and we do this already - but only for sequential writes. To support placing data in the same way for random writes, reuse the xfs mru cache to map inodes to open zones on first write. If a mapping is present, use the open zone for data placement for this file until the zone is full. Signed-off-by: Hans Holmberg Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_zone_alloc.c | 109 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 89965b7ac29c..d85084f9f317 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -236,6 +236,7 @@ typedef struct xfs_mount { bool m_update_sb; /* sb needs update in mount */ unsigned int m_max_open_zones; unsigned int m_zonegc_low_space; + struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ /* max_atomic_write mount option value */ unsigned long long m_awu_max_bytes; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index d509e49b2aaa..80add26c0111 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -24,6 +24,7 @@ #include "xfs_zone_priv.h" #include "xfs_zones.h" #include "xfs_trace.h" +#include "xfs_mru_cache.h" void xfs_open_zone_put( @@ -796,6 +797,100 @@ xfs_submit_zoned_bio( submit_bio(&ioend->io_bio); } +/* + * Cache the last zone written to for an inode so that it is considered first + * for subsequent writes. + */ +struct xfs_zone_cache_item { + struct xfs_mru_cache_elem mru; + struct xfs_open_zone *oz; +}; + +static inline struct xfs_zone_cache_item * +xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) +{ + return container_of(mru, struct xfs_zone_cache_item, mru); +} + +static void +xfs_zone_cache_free_func( + void *data, + struct xfs_mru_cache_elem *mru) +{ + struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); + + xfs_open_zone_put(item->oz); + kfree(item); +} + +/* + * Check if we have a cached last open zone available for the inode and + * if yes return a reference to it. + */ +static struct xfs_open_zone * +xfs_cached_zone( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + struct xfs_mru_cache_elem *mru; + struct xfs_open_zone *oz; + + mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); + if (!mru) + return NULL; + oz = xfs_zone_cache_item(mru)->oz; + if (oz) { + /* + * GC only steals open zones at mount time, so no GC zones + * should end up in the cache. + */ + ASSERT(!oz->oz_is_gc); + ASSERT(atomic_read(&oz->oz_ref) > 0); + atomic_inc(&oz->oz_ref); + } + xfs_mru_cache_done(mp->m_zone_cache); + return oz; +} + +/* + * Update the last used zone cache for a given inode. + * + * The caller must have a reference on the open zone. + */ +static void +xfs_zone_cache_create_association( + struct xfs_inode *ip, + struct xfs_open_zone *oz) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_cache_item *item = NULL; + struct xfs_mru_cache_elem *mru; + + ASSERT(atomic_read(&oz->oz_ref) > 0); + atomic_inc(&oz->oz_ref); + + mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); + if (mru) { + /* + * If we have an association already, update it to point to the + * new zone. 
+ */ + item = xfs_zone_cache_item(mru); + xfs_open_zone_put(item->oz); + item->oz = oz; + xfs_mru_cache_done(mp->m_zone_cache); + return; + } + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + xfs_open_zone_put(oz); + return; + } + item->oz = oz; + xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); +} + void xfs_zone_alloc_and_submit( struct iomap_ioend *ioend, @@ -819,11 +914,16 @@ xfs_zone_alloc_and_submit( */ if (!*oz && ioend->io_offset) *oz = xfs_last_used_zone(ioend); + if (!*oz) + *oz = xfs_cached_zone(mp, ip); + if (!*oz) { select_zone: *oz = xfs_select_zone(mp, write_hint, pack_tight); if (!*oz) goto out_error; + + xfs_zone_cache_create_association(ip, *oz); } alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), @@ -1211,6 +1311,14 @@ xfs_mount_zones( error = xfs_zone_gc_mount(mp); if (error) goto out_free_zone_info; + + /* + * Set up an mru cache to track inode-to-open-zone mappings for data + * placement purposes. The magic values for group count and lifetime + * are the same as the defaults for file streams, which seems sane + * enough. + */ + xfs_mru_cache_create(&mp->m_zone_cache, mp, + 5000, 10, xfs_zone_cache_free_func); return 0; out_free_zone_info: @@ -1224,4 +1332,5 @@ xfs_unmount_zones( { xfs_zone_gc_unmount(mp); xfs_free_zone_info(mp->m_zone_info); + xfs_mru_cache_destroy(mp->m_zone_cache); }
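A closing note on the 5000/10 values above: in the xfs_mru_cache design, entries are reaped a whole group at a time, so an idle association survives its nominal lifetime plus up to one group interval. A sketch of that bound, assuming the documented mru cache aging behaviour; the demo_* helper is illustrative only and not part of the patch:

/*
 * Worst-case idle time before a cached inode-to-zone association is
 * reaped, for the lifetime_ms and grp_count values handed to
 * xfs_mru_cache_create(). Entries age out one group interval
 * (lifetime_ms / grp_count) at a time after their nominal lifetime.
 */
static inline unsigned int
demo_zone_cache_max_idle_ms(
	unsigned int	lifetime_ms,
	unsigned int	grp_count)
{
	return lifetime_ms + lifetime_ms / grp_count;
}

For m_zone_cache this works out to 5000 + 5000 / 10 = 5500 ms, i.e. an inode that stops writing keeps its cached open zone reference for at most about 5.5 seconds before xfs_zone_cache_free_func drops it.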