
xfs: New code for 6.16

Signed-off-by: Carlos Maiolino <cem@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iJUEABMJAB0WIQSmtYVZ/MfVMGUq1GNcsMJ8RxYuYwUCaDQXTQAKCRBcsMJ8RxYu
 YwUHAYDYYm9oit6AIr0AgTXBMJ+DHyqaszBy0VT2jQUP+yXxyrQc46QExXKU9YQV
 ffmGRAsBgN7ZdDI8D5qWySyOynB3b1Jn3/0jY82GscFK0k0oX3EtxbN9MdrovbgK
 qyO66BVx7w==
 =pG5y
 -----END PGP SIGNATURE-----

Merge tag 'xfs-merge-6.16' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Carlos Maiolino:

 - Atomic writes for XFS

 - Remove experimental warnings for pNFS, scrub and parent pointers

* tag 'xfs-merge-6.16' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (26 commits)
  xfs: add inode to zone caching for data placement
  xfs: free the item in xfs_mru_cache_insert on failure
  xfs: remove the EXPERIMENTAL warning for pNFS
  xfs: remove some EXPERIMENTAL warnings
  xfs: Remove deprecated xfs_bufd sysctl parameters
  xfs: stop using set_blocksize
  xfs: allow sysadmins to specify a maximum atomic write limit at mount time
  xfs: update atomic write limits
  xfs: add xfs_calc_atomic_write_unit_max()
  xfs: add xfs_file_dio_write_atomic()
  xfs: commit CoW-based atomic writes atomically
  xfs: add large atomic writes checks in xfs_direct_write_iomap_begin()
  xfs: add xfs_atomic_write_cow_iomap_begin()
  xfs: refine atomic write size check in xfs_file_write_iter()
  xfs: refactor xfs_reflink_end_cow_extent()
  xfs: allow block allocator to take an alignment hint
  xfs: ignore HW which cannot atomic write a single block
  xfs: add helpers to compute transaction reservation for finishing intent items
  xfs: add helpers to compute log item overhead
  xfs: separate out setting buftarg atomic writes limits
  ...
commit f83fcb87f8
Linus Torvalds, 2025-05-26 12:56:01 -07:00
48 changed files with 1521 additions and 188 deletions

View File

@@ -151,6 +151,17 @@ When mounting an XFS filesystem, the following options are accepted.
optional, and the log section can be separate from the data
section or contained within it.
max_atomic_write=value
Set the maximum size of an atomic write. The size may be
specified in bytes, in kilobytes with a "k" suffix, in megabytes
with a "m" suffix, or in gigabytes with a "g" suffix. The size
cannot be larger than the maximum write size, larger than the
size of any allocation group, or larger than the size of a
remapping operation that the log can complete atomically.
The default value is to set the maximum I/O completion size
to allow each CPU to handle one at a time.
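For example (illustrative only, device path assumed), mounting with "mount -o max_atomic_write=256k /dev/sdc /mnt" would cap atomic write completion at 256 KiB; the value must be a power of two, a multiple of the filesystem block size, and within the limits described above.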
max_open_zones=value
Specify the max number of zones to keep open for writing on a
zoned rt device. Many open zones aids file data separation

View File

@@ -1335,7 +1335,8 @@ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
generic_fill_statx_atomic_writes(stat,
queue_atomic_write_unit_min_bytes(bd_queue),
-queue_atomic_write_unit_max_bytes(bd_queue));
+queue_atomic_write_unit_max_bytes(bd_queue),
+0);
}
stat->blksize = bdev_io_min(bdev);

View File

@@ -5692,7 +5692,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
awu_max = sbi->s_awu_max;
}
-generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;

View File

@@ -136,13 +136,15 @@ EXPORT_SYMBOL(generic_fill_statx_attr);
* @stat: Where to fill in the attribute flags
* @unit_min: Minimum supported atomic write length in bytes
* @unit_max: Maximum supported atomic write length in bytes
+* @unit_max_opt: Optimised maximum supported atomic write length in bytes
*
* Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
* atomic write unit_min and unit_max values.
*/
void generic_fill_statx_atomic_writes(struct kstat *stat,
unsigned int unit_min,
-unsigned int unit_max)
+unsigned int unit_max,
+unsigned int unit_max_opt)
{
/* Confirm that the request type is known */
stat->result_mask |= STATX_WRITE_ATOMIC;
@@ -153,6 +155,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat,
if (unit_min) {
stat->atomic_write_unit_min = unit_min;
stat->atomic_write_unit_max = unit_max;
+stat->atomic_write_unit_max_opt = unit_max_opt;
/* Initially only allow 1x segment */
stat->atomic_write_segments_max = 1;
@@ -741,6 +744,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
+tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
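Not part of the diff: a minimal userspace sketch of how the statx atomic-write fields exported above can be read. It assumes glibc 2.28+ for the statx() wrapper and uapi headers new enough to define STATX_WRITE_ATOMIC; the stx_atomic_write_unit_max_opt field only exists with the uapi change in this merge.

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2)
		return 1;
	/* Ask the kernel to fill in the atomic write geometry for this file. */
	if (statx(AT_FDCWD, argv[1], 0, STATX_WRITE_ATOMIC, &stx) != 0) {
		perror("statx");
		return 1;
	}
	if (!(stx.stx_attributes_mask & STATX_ATTR_WRITE_ATOMIC)) {
		printf("atomic writes not reported for %s\n", argv[1]);
		return 0;
	}
	/* unit_max_opt requires headers that carry this merge's uapi update. */
	printf("unit_min=%u unit_max=%u unit_max_opt=%u segments_max=%u\n",
	       stx.stx_atomic_write_unit_min,
	       stx.stx_atomic_write_unit_max,
	       stx.stx_atomic_write_unit_max_opt,
	       stx.stx_atomic_write_segments_max);
	return 0;
}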

View File

@@ -3312,6 +3312,11 @@ xfs_bmap_compute_alignments(
align = xfs_get_cowextsz_hint(ap->ip);
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
+
+/* Try to align start block to any minimum allocation alignment */
+if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
+args->alignment = align;
+
if (align) {
if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
ap->eof, 0, ap->conv, &ap->offset,

View File

@@ -87,6 +87,9 @@ struct xfs_bmalloca {
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
#define XFS_BMAPI_NORMAP (1u << 10)
+
+/* Try to align allocations to the extent size hint */
+#define XFS_BMAPI_EXTSZALIGN (1u << 11)
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
-{ XFS_BMAPI_NORMAP, "NORMAP" }
+{ XFS_BMAPI_NORMAP, "NORMAP" }, \
+{ XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" }
static inline int xfs_bmapi_aflag(int w)

View File

@@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks(
*/
if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
xfs_trans_resv_calc(mp, resv);
+resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
return;
}
@@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks(
xfs_trans_resv_calc(mp, resv);
+
+/* Copy the dynamic transaction reservation types from the running fs */
+resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
+
if (xfs_has_reflink(mp)) {
/*
* In the early days of reflink, typical log operation counts

View File

@@ -22,6 +22,12 @@
#include "xfs_rtbitmap.h"
#include "xfs_attr_item.h"
#include "xfs_log.h"
+#include "xfs_defer.h"
+#include "xfs_bmap_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_trace.h"
#define _ALLOC true
#define _FREE false
@@ -263,6 +269,42 @@ xfs_rtalloc_block_count(
* register overflow from temporaries in the calculations.
*/
/*
* Finishing data device refcount updates (t1):
* the agfs of the ags containing the blocks: nr_ops * sector size
* the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
*/
inline unsigned int
xfs_calc_finish_cui_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
if (!xfs_has_reflink(mp))
return 0;
return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
mp->m_sb.sb_blocksize);
}
/*
* Realtime refcount updates (t2):
* the rt refcount inode
* the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
*/
inline unsigned int
xfs_calc_finish_rt_cui_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
if (!xfs_has_rtreflink(mp))
return 0;
return xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
mp->m_sb.sb_blocksize);
}
/*
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
@@ -280,19 +322,10 @@ xfs_calc_refcountbt_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
-unsigned int blksz = XFS_FSB_TO_B(mp, 1);
-unsigned int t1, t2 = 0;
-
-if (!xfs_has_reflink(mp))
-return 0;
-
-t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
-xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
-
-if (xfs_has_realtime(mp))
-t2 = xfs_calc_inode_res(mp, 1) +
-xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
-blksz);
+unsigned int t1, t2;
+
+t1 = xfs_calc_finish_cui_reservation(mp, nr_ops);
+t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops);
return max(t1, t2);
}
@@ -379,6 +412,96 @@ xfs_calc_write_reservation_minlogsize(
return xfs_calc_write_reservation(mp, true);
}
/*
* Finishing an EFI can free the blocks and bmap blocks (t2):
* the agf for each of the ags: nr * sector size
* the agfl for each of the ags: nr * sector size
* the super block to reflect the freed blocks: sector size
* worst case split in allocation btrees per extent assuming nr extents:
* nr exts * 2 trees * (2 * max depth - 1) * block size
*/
inline unsigned int
xfs_calc_finish_efi_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
mp->m_sb.sb_blocksize);
}
/*
* Or, if it's a realtime file (t3):
* the agf for each of the ags: 2 * sector size
* the agfl for each of the ags: 2 * sector size
* the super block to reflect the freed blocks: sector size
* the realtime bitmap:
* 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 2 exts * 1 block
* worst case split in allocation btrees per extent assuming 2 extents:
* 2 exts * 2 trees * (2 * max depth - 1) * block size
*/
inline unsigned int
xfs_calc_finish_rt_efi_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
if (!xfs_has_realtime(mp))
return 0;
return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr),
mp->m_sb.sb_blocksize) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
mp->m_sb.sb_blocksize);
}
/*
* Finishing an RUI is the same as an EFI. We can split the rmap btree twice
* on each end of the record, and that can cause the AGFL to be refilled or
* emptied out.
*/
inline unsigned int
xfs_calc_finish_rui_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
if (!xfs_has_rmapbt(mp))
return 0;
return xfs_calc_finish_efi_reservation(mp, nr);
}
/*
* Finishing an RUI is the same as an EFI. We can split the rmap btree twice
* on each end of the record, and that can cause the AGFL to be refilled or
* emptied out.
*/
inline unsigned int
xfs_calc_finish_rt_rui_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
if (!xfs_has_rtrmapbt(mp))
return 0;
return xfs_calc_finish_rt_efi_reservation(mp, nr);
}
/*
* In finishing a BUI, we can modify:
* the inode being truncated: inode size
* dquots
* the inode's bmap btree: (max depth + 1) * block size
*/
inline unsigned int
xfs_calc_finish_bui_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
mp->m_sb.sb_blocksize);
}
/*
* In truncating a file we free up to two extents at once. We can modify (t1):
* the inode being truncated: inode size
@@ -411,16 +534,8 @@ xfs_calc_itruncate_reservation(
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
-t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
-
-if (xfs_has_realtime(mp)) {
-t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
-xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
-} else {
-t3 = 0;
-}
+t2 = xfs_calc_finish_efi_reservation(mp, 4);
+t3 = xfs_calc_finish_rt_efi_reservation(mp, 2);
/*
* In the early days of reflink, we included enough reservation to log
@@ -501,9 +616,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1));
-t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
-xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
-XFS_FSB_TO_B(mp, 1));
+t2 = xfs_calc_finish_efi_reservation(mp, 3);
if (xfs_has_parent(mp)) {
unsigned int rename_overhead, exchange_overhead;
@@ -611,9 +724,7 @@ xfs_calc_link_reservation(
overhead += xfs_calc_iunlink_remove_reservation(mp);
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
-t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
-XFS_FSB_TO_B(mp, 1));
+t2 = xfs_calc_finish_efi_reservation(mp, 1);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrsetm.tr_logres;
@@ -676,9 +787,7 @@ xfs_calc_remove_reservation(
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
-t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
-xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
-XFS_FSB_TO_B(mp, 1));
+t2 = xfs_calc_finish_efi_reservation(mp, 2);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrrm.tr_logres;
@@ -1181,6 +1290,15 @@ xfs_calc_namespace_reservations(
resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
}
STATIC void
xfs_calc_default_atomic_ioend_reservation(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
/* Pick a default that will scale reasonably for the log size. */
resp->tr_atomic_ioend = resp->tr_itruncate;
}
void
xfs_trans_resv_calc(
struct xfs_mount *mp,
@@ -1275,4 +1393,167 @@ xfs_trans_resv_calc(
resp->tr_itruncate.tr_logcount += logcount_adj;
resp->tr_write.tr_logcount += logcount_adj;
resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
/*
* Now that we've finished computing the static reservations, we can
* compute the dynamic reservation for atomic writes.
*/
xfs_calc_default_atomic_ioend_reservation(mp, resp);
}
/*
* Return the per-extent and fixed transaction reservation sizes needed to
* complete an atomic write.
*/
STATIC unsigned int
xfs_calc_atomic_write_ioend_geometry(
struct xfs_mount *mp,
unsigned int *step_size)
{
const unsigned int efi = xfs_efi_log_space(1);
const unsigned int efd = xfs_efd_log_space(1);
const unsigned int rui = xfs_rui_log_space(1);
const unsigned int rud = xfs_rud_log_space();
const unsigned int cui = xfs_cui_log_space(1);
const unsigned int cud = xfs_cud_log_space();
const unsigned int bui = xfs_bui_log_space(1);
const unsigned int bud = xfs_bud_log_space();
/*
* Maximum overhead to complete an atomic write ioend in software:
* remove data fork extent + remove cow fork extent + map extent into
* data fork.
*
* tx0: Creates a BUI and a CUI and that's all it needs.
*
* tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
* enough space to relog the CUI (== CUI + CUD).
*
* tx2: Roll again to finish the RUI. Need space for the RUD and space
* to relog the CUI.
*
* tx3: Roll again, need space for the CUD and possibly a new EFI.
*
* tx4: Roll again, need space for an EFD.
*
* If the extent referenced by the pair of BUI/CUI items is not the one
* being currently processed, then we need to reserve space to relog
* both items.
*/
const unsigned int tx0 = bui + cui;
const unsigned int tx1 = bud + rui + cui + cud;
const unsigned int tx2 = rud + cui + cud;
const unsigned int tx3 = cud + efi;
const unsigned int tx4 = efd;
const unsigned int relog = bui + bud + cui + cud;
const unsigned int per_intent = max(max3(tx0, tx1, tx2),
max3(tx3, tx4, relog));
/* Overhead to finish one step of each intent item type */
const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1);
/* We only finish one item per transaction in a chain */
*step_size = max(f4, max3(f1, f2, f3));
return per_intent;
}
/*
* Compute the maximum size (in fsblocks) of atomic writes that we can complete
* given the existing log reservations.
*/
xfs_extlen_t
xfs_calc_max_atomic_write_fsblocks(
struct xfs_mount *mp)
{
const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend;
unsigned int per_intent = 0;
unsigned int step_size = 0;
unsigned int ret = 0;
if (resv->tr_logres > 0) {
per_intent = xfs_calc_atomic_write_ioend_geometry(mp,
&step_size);
if (resv->tr_logres >= step_size)
ret = (resv->tr_logres - step_size) / per_intent;
}
trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size,
resv->tr_logres, ret);
return ret;
}
/*
* Compute the log blocks and transaction reservation needed to complete an
* atomic write of a given number of blocks. Worst case, each block requires
* separate handling. A return value of 0 means something went wrong.
*/
xfs_extlen_t
xfs_calc_atomic_write_log_geometry(
struct xfs_mount *mp,
xfs_extlen_t blockcount,
unsigned int *new_logres)
{
struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend;
uint old_logres = curr_res->tr_logres;
unsigned int per_intent, step_size;
unsigned int logres;
xfs_extlen_t min_logblocks;
ASSERT(blockcount > 0);
xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
/* Check for overflows */
if (check_mul_overflow(blockcount, per_intent, &logres) ||
check_add_overflow(logres, step_size, &logres))
return 0;
curr_res->tr_logres = logres;
min_logblocks = xfs_log_calc_minimum_size(mp);
curr_res->tr_logres = old_logres;
trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size,
blockcount, min_logblocks, logres);
*new_logres = logres;
return min_logblocks;
}
/*
* Compute the transaction reservation needed to complete an out of place
* atomic write of a given number of blocks.
*/
int
xfs_calc_atomic_write_reservation(
struct xfs_mount *mp,
xfs_extlen_t blockcount)
{
unsigned int new_logres;
xfs_extlen_t min_logblocks;
/*
* If the caller doesn't ask for a specific atomic write size, then
* use the defaults.
*/
if (blockcount == 0) {
xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
return 0;
}
min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount,
&new_logres);
if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks)
return -EINVAL;
M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres;
return 0;
}
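To make the sizing above concrete (numbers invented for illustration, not taken from the patch set): if tr_atomic_ioend.tr_logres were 300000 bytes while the geometry helper computed step_size = 25000 bytes and per_intent = 2750 bytes, xfs_calc_max_atomic_write_fsblocks() would allow (300000 - 25000) / 2750 = 100 fsblocks per atomic write ioend.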

View File

@@ -48,6 +48,7 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
struct xfs_trans_res tr_sb; /* modify superblock */
struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
+struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */
};
/* shorthand way of accessing reservation structure */
@@ -98,8 +99,32 @@ struct xfs_trans_resv {
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp,
xfs_extlen_t blockcount, unsigned int *new_logres);
int xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
xfs_extlen_t blockcount);
#endif /* __XFS_TRANS_RESV_H__ */

View File

@ -680,8 +680,6 @@ xfs_scrub_metadata(
if (error) if (error)
goto out; goto out;
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB);
sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) { if (!sc) {
error = -ENOMEM; error = -ENOMEM;

View File

@@ -77,6 +77,11 @@ xfs_bui_item_size(
*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
}
+unsigned int xfs_bui_log_space(unsigned int nr)
+{
+return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
*nbytes += sizeof(struct xfs_bud_log_format);
}
+unsigned int xfs_bud_log_space(void)
+{
+return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bud log item. We use only 1 iovec, and we point that

View File

@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+unsigned int xfs_bui_log_space(unsigned int nr);
+unsigned int xfs_bud_log_space(void);
+
#endif /* __XFS_BMAP_ITEM_H__ */

View File

@@ -1687,23 +1687,65 @@ xfs_free_buftarg(
kfree(btp);
}
/*
* Configure this buffer target for hardware-assisted atomic writes if the
* underlying block device's atomic write support is congruent with the
* filesystem geometry.
*/
static inline void
xfs_configure_buftarg_atomic_writes(
struct xfs_buftarg *btp)
{
struct xfs_mount *mp = btp->bt_mount;
unsigned int min_bytes, max_bytes;
min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev);
max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev);
/*
* Ignore atomic write geometry that is nonsense or doesn't even cover
* a single fsblock.
*/
if (min_bytes > max_bytes ||
min_bytes > mp->m_sb.sb_blocksize ||
max_bytes < mp->m_sb.sb_blocksize) {
min_bytes = 0;
max_bytes = 0;
}
btp->bt_bdev_awu_min = min_bytes;
btp->bt_bdev_awu_max = max_bytes;
}
/* Configure a buffer target that abstracts a block device. */
int
-xfs_setsize_buftarg(
+xfs_configure_buftarg(
struct xfs_buftarg *btp,
unsigned int sectorsize)
{
+int error;
+
+ASSERT(btp->bt_bdev != NULL);
+
/* Set up metadata sector size info */
btp->bt_meta_sectorsize = sectorsize;
btp->bt_meta_sectormask = sectorsize - 1;
-if (set_blocksize(btp->bt_bdev_file, sectorsize)) {
+error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
+if (error) {
xfs_warn(btp->bt_mount,
-"Cannot set_blocksize to %u on device %pg",
-sectorsize, btp->bt_bdev);
+"Cannot use blocksize %u on device %pg, err %d",
+sectorsize, btp->bt_bdev, error);
return -EINVAL;
}
-return 0;
+/*
+ * Flush the block device pagecache so our bios see anything dirtied
+ * before mount.
+ */
+if (bdev_can_atomic_write(btp->bt_bdev))
+xfs_configure_buftarg_atomic_writes(btp);
+
+return sync_blockdev(btp->bt_bdev);
}

int
@@ -1752,6 +1794,8 @@ xfs_alloc_buftarg(
{
struct xfs_buftarg *btp;
const struct dax_holder_operations *ops = NULL;
+int error;
#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
ops = &xfs_dax_holder_operations;
@@ -1765,28 +1809,31 @@ xfs_alloc_buftarg(
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
-if (bdev_can_atomic_write(btp->bt_bdev)) {
-btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
-btp->bt_bdev);
-btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
-btp->bt_bdev);
-}
+/*
+ * Flush and invalidate all devices' pagecaches before reading any
+ * metadata because XFS doesn't use the bdev pagecache.
+ */
+error = sync_blockdev(btp->bt_bdev);
+if (error)
+goto error_free;
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.
*/
-if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
-goto error_free;
-if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
-mp->m_super->s_id))
+btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1;
+
+error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize,
+mp->m_super->s_id);
+if (error)
goto error_free;
return btp;
error_free:
kfree(btp);
-return NULL;
+return ERR_PTR(error);
}
static inline void static inline void

View File

@@ -112,7 +112,7 @@ struct xfs_buftarg {
struct percpu_counter bt_readahead_count;
struct ratelimit_state bt_ioerror_rl;
-/* Atomic write unit values */
+/* Atomic write unit values, bytes */
unsigned int bt_bdev_awu_min;
unsigned int bt_bdev_awu_max;
@@ -374,7 +374,7 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
-extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
+int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)

View File

@@ -103,6 +103,25 @@ xfs_buf_item_size_segment(
return;
}
/*
* Compute the worst case log item overhead for an invalidated buffer with the
* given map count and block size.
*/
unsigned int
xfs_buf_inval_log_space(
unsigned int map_count,
unsigned int blocksize)
{
unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
unsigned int ret =
offsetof(struct xfs_buf_log_format, blf_data_map) +
(bitmap_size * sizeof_field(struct xfs_buf_log_format,
blf_data_map[0]));
return ret * map_count;
}
/*
* Return the number of log iovecs and space needed to log the given buf log
* item.

View File

@@ -64,6 +64,9 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
+unsigned int xfs_buf_inval_log_space(unsigned int map_count,
+unsigned int blocksize);
+
extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */

View File

@@ -167,6 +167,14 @@ xfs_discard_extents(
return error;
}
/*
* Care must be taken setting up the trim cursor as the perags may not have been
* initialised when the cursor is initialised. e.g. a clean mount which hasn't
* read in AGFs and the first operation run on the mounted fs is a trim. This
* can result in perag fields that aren't initialised until
* xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
* the free space search.
*/
struct xfs_trim_cur {
xfs_agblock_t start;
xfs_extlen_t count;
@@ -204,6 +212,14 @@ xfs_trim_gather_extents(
if (error)
goto out_trans_cancel;
/*
* First time through tcur->count will not have been initialised as
* pag->pagf_longest is not guaranteed to be valid before we read
* the AGF buffer above.
*/
if (!tcur->count)
tcur->count = pag->pagf_longest;
if (tcur->by_bno) {
/* sub-AG discard request always starts at tcur->start */
cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
@@ -350,7 +366,6 @@ xfs_trim_perag_extents(
{
struct xfs_trim_cur tcur = {
.start = start,
-.count = pag->pagf_longest,
.end = end,
.minlen = minlen,
};

View File

@@ -83,6 +83,11 @@ xfs_efi_item_size(
*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
+unsigned int xfs_efi_log_space(unsigned int nr)
+{
+return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efi log item. We use only 1 iovec, and we point that
@@ -254,6 +259,11 @@ xfs_efd_item_size(
*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
+unsigned int xfs_efd_log_space(unsigned int nr)
+{
+return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efd log item. We use only 1 iovec, and we point that

View File

@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
struct xfs_extent_free_item *xefi,
struct xfs_defer_pending **dfpp);
+unsigned int xfs_efi_log_space(unsigned int nr);
+unsigned int xfs_efd_log_space(unsigned int nr);
+
#endif /* __XFS_EXTFREE_ITEM_H__ */

View File

@@ -576,7 +576,10 @@ xfs_dio_write_end_io(
nofs_flag = memalloc_nofs_save();
if (flags & IOMAP_DIO_COW) {
-error = xfs_reflink_end_cow(ip, offset, size);
+if (iocb->ki_flags & IOCB_ATOMIC)
+error = xfs_reflink_end_atomic_cow(ip, offset, size);
+else
+error = xfs_reflink_end_cow(ip, offset, size);
if (error)
goto out;
}
@@ -725,6 +728,72 @@ xfs_file_dio_write_zoned(
return ret;
}
/*
* Handle block atomic writes
*
* Two methods of atomic writes are supported:
* - REQ_ATOMIC-based, which would typically use some form of HW offload in the
* disk
* - COW-based, which uses a COW fork as a staging extent for data updates
* before atomically updating extent mappings for the range being written
*
*/
static noinline ssize_t
xfs_file_dio_write_atomic(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret, ocount = iov_iter_count(from);
const struct iomap_ops *dops;
/*
* HW offload should be faster, so try that first if it is already
* known that the write length is not too large.
*/
if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
dops = &xfs_atomic_write_cow_iomap_ops;
else
dops = &xfs_direct_write_iomap_ops;
retry:
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
/* Demote similar to xfs_file_dio_write_aligned() */
if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
0, NULL, 0);
/*
* The retry mechanism is based on the ->iomap_begin method returning
* -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
* possible. The REQ_ATOMIC-based method would typically not be possible if
* the write spans multiple extents or the disk blocks are misaligned.
*/
if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
xfs_iunlock(ip, iolock);
dops = &xfs_atomic_write_cow_iomap_ops;
goto retry;
}
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
return ret;
}
/*
* Handle block unaligned direct I/O writes
*
@@ -840,6 +909,8 @@ xfs_file_dio_write(
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
+if (iocb->ki_flags & IOCB_ATOMIC)
+return xfs_file_dio_write_atomic(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
@@ -1032,14 +1103,12 @@ xfs_file_write_iter(
return xfs_file_dax_write(iocb, from);
if (iocb->ki_flags & IOCB_ATOMIC) {
-/*
- * Currently only atomic writing of a single FS block is
- * supported. It would be possible to atomic write smaller than
- * a FS block, but there is no requirement to support this.
- * Note that iomap also does not support this yet.
- */
-if (ocount != ip->i_mount->m_sb.sb_blocksize)
+if (ocount < xfs_get_atomic_write_min(ip))
return -EINVAL;
+if (ocount > xfs_get_atomic_write_max(ip))
+return -EINVAL;
+
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
@@ -1488,7 +1557,7 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
-if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
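Not part of the diff: a minimal userspace sketch of how the write path above is driven from an application, using pwritev2() with RWF_ATOMIC. The file name, the 16 KiB length, and the RWF_ATOMIC fallback define are assumptions for illustration; the length must be a power of two within the statx-advertised unit_min/unit_max and the buffer must satisfy O_DIRECT alignment.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040	/* value from recent <linux/fs.h>; verify against local headers */
#endif

static ssize_t atomic_write(int fd, const void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	/* Single iovec: statx reports atomic_write_segments_max == 1 here. */
	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}

int main(void)
{
	size_t len = 16384;			/* assumed to fit within the advertised limits */
	void *buf = aligned_alloc(len, len);	/* O_DIRECT-friendly alignment */
	int fd = open("testfile", O_RDWR | O_DIRECT);

	if (!buf || fd < 0)
		return 1;
	memset(buf, 0xab, len);
	if (atomic_write(fd, buf, len, 0) != (ssize_t)len)
		return 1;
	close(fd);
	free(buf);
	return 0;
}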

View File

@@ -304,11 +304,9 @@ xfs_filestream_create_association(
* for us, so all we need to do here is take another active reference to
* the perag for the cached association.
*
-* If we fail to store the association, we need to drop the fstrms
-* counter as well as drop the perag reference we take here for the
-* item. We do not need to return an error for this failure - as long as
-* we return a referenced AG, the allocation can still go ahead just
-* fine.
+* If we fail to store the association, we do not need to return an
+* error for this failure - as long as we return a referenced AG, the
+* allocation can still go ahead just fine.
*/
item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!item)
@@ -316,14 +314,9 @@ xfs_filestream_create_association(
atomic_inc(&pag_group(args->pag)->xg_active_ref);
item->pag = args->pag;
-error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
-if (error)
-goto out_free_item;
+xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
return 0;
-out_free_item:
-xfs_perag_rele(item->pag);
-kfree(item);
-
out_put_fstrms:
atomic_dec(&args->pag->pagf_fstrms);
return 0;

View File

@@ -23,8 +23,6 @@ xfs_param_t xfs_params = {
.inherit_sync = { 0, 1, 1 },
.inherit_nodump = { 0, 1, 1 },
.inherit_noatim = { 0, 1, 1 },
-.xfs_buf_timer = { 100/2, 1*100, 30*100 },
-.xfs_buf_age = { 1*100, 15*100, 7200*100},
.inherit_nosym = { 0, 0, 1 },
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },

View File

@@ -356,19 +356,9 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
-static inline bool
-xfs_inode_can_atomicwrite(
-struct xfs_inode *ip)
+static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip)
{
-struct xfs_mount *mp = ip->i_mount;
-struct xfs_buftarg *target = xfs_inode_buftarg(ip);
-
-if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
-return false;
-if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
-return false;
-
-return true;
+return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0;
}
/*

View File

@@ -798,6 +798,38 @@ imap_spans_range(
return true;
}
static bool
xfs_bmap_hw_atomic_write_possible(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
xfs_fileoff_t offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb);
/*
* atomic writes are required to be naturally aligned for disk blocks,
* which ensures that we adhere to block layer rules that we won't
* straddle any boundary or violate write alignment requirement.
*/
if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
return false;
/*
* Spanning multiple extents would mean that multiple BIOs would be
* issued, and so would lose atomicity required for REQ_ATOMIC-based
* atomics.
*/
if (!imap_spans_range(imap, offset_fsb, end_fsb))
return false;
/*
* The ->iomap_begin caller should ensure this, but check anyway.
*/
return len <= xfs_inode_buftarg(ip)->bt_bdev_awu_max;
}
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -812,9 +844,11 @@ xfs_direct_write_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+xfs_fileoff_t orig_end_fsb = end_fsb;
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
+bool needs_alloc;
unsigned int lockmode;
u64 seq;
@@ -875,13 +909,37 @@ relock:
(flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
-if (shared)
+if (shared) {
+if ((flags & IOMAP_ATOMIC) &&
+!xfs_bmap_hw_atomic_write_possible(ip, &cmap,
+offset_fsb, end_fsb)) {
+error = -ENOPROTOOPT;
+goto out_unlock;
+}
goto out_found_cow;
+}
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
-if (imap_needs_alloc(inode, flags, &imap, nimaps))
+needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
if (flags & IOMAP_ATOMIC) {
error = -ENOPROTOOPT;
/*
* If we allocate less than what is required for the write
* then we may end up with multiple extents, which means that
* REQ_ATOMIC-based cannot be used, so avoid this possibility.
*/
if (needs_alloc && orig_end_fsb - offset_fsb > 1)
goto out_unlock;
if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb,
orig_end_fsb))
goto out_unlock;
}
if (needs_alloc)
goto allocate_blocks;
/*
@@ -1022,6 +1080,134 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
static int
xfs_atomic_write_cow_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
int nmaps = 1;
xfs_filblks_t resaligned;
struct xfs_bmbt_irec cmap;
struct xfs_iext_cursor icur;
struct xfs_trans *tp;
unsigned int dblocks = 0, rblocks = 0;
int error;
u64 seq;
ASSERT(flags & IOMAP_WRITE);
ASSERT(flags & IOMAP_DIRECT);
if (xfs_is_shutdown(mp))
return -EIO;
if (!xfs_can_sw_atomic_write(mp)) {
ASSERT(xfs_can_sw_atomic_write(mp));
return -EINVAL;
}
/* blocks are always allocated in this path */
if (flags & IOMAP_NOWAIT)
return -EAGAIN;
trace_xfs_iomap_atomic_write_cow(ip, offset, length);
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
}
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
if (cmap.br_startoff <= offset_fsb) {
xfs_trim_extent(&cmap, offset_fsb, count_fsb);
goto found;
}
end_fsb = cmap.br_startoff;
count_fsb = end_fsb - offset_fsb;
resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
xfs_get_cowextsz_hint(ip));
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (XFS_IS_REALTIME_INODE(ip)) {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
rblocks = resaligned;
} else {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
rblocks = 0;
}
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
rblocks, false, &tp);
if (error)
return error;
/* extent layout could have changed since the unlock, so check again */
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
if (cmap.br_startoff <= offset_fsb) {
xfs_trim_extent(&cmap, offset_fsb, count_fsb);
xfs_trans_cancel(tp);
goto found;
}
/*
* Allocate the entire reservation as unwritten blocks.
*
* Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
* extszhint, such that there will be a greater chance that future
* atomic writes to that same range will be aligned (and don't require
* this COW-based method).
*/
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
}
xfs_inode_set_cowblocks_tag(ip);
error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
found:
if (cmap.br_state != XFS_EXT_NORM) {
error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
count_fsb);
if (error)
goto out_unlock;
cmap.br_state = XFS_EXT_NORM;
}
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
.iomap_begin = xfs_atomic_write_cow_iomap_begin,
};
static int
xfs_dax_write_iomap_end(
struct inode *inode,

View File

@@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
+extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
#endif /* __XFS_IOMAP_H__*/

View File

@@ -601,16 +601,82 @@ xfs_report_dioalign(
stat->dio_offset_align = stat->dio_read_offset_align;
}
unsigned int
xfs_get_atomic_write_min(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
/*
* If we can complete an atomic write via atomic out of place writes,
* then advertise a minimum size of one fsblock. Without this
* mechanism, we can only guarantee atomic writes up to a single LBA.
*
* If out of place writes are not available, we can guarantee an atomic
* write of exactly one single fsblock if the bdev will make that
* guarantee for us.
*/
if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp))
return mp->m_sb.sb_blocksize;
return 0;
}
unsigned int
xfs_get_atomic_write_max(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
/*
* If out of place writes are not available, we can guarantee an atomic
* write of exactly one single fsblock if the bdev will make that
* guarantee for us.
*/
if (!xfs_can_sw_atomic_write(mp)) {
if (xfs_inode_can_hw_atomic_write(ip))
return mp->m_sb.sb_blocksize;
return 0;
}
/*
* If we can complete an atomic write via atomic out of place writes,
* then advertise a maximum size of whatever we can complete through
* that means. Hardware support is reported via max_opt, not here.
*/
if (XFS_IS_REALTIME_INODE(ip))
return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max);
return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max);
}
unsigned int
xfs_get_atomic_write_max_opt(
struct xfs_inode *ip)
{
unsigned int awu_max = xfs_get_atomic_write_max(ip);
/* if the max is 1x block, then just keep behaviour that opt is 0 */
if (awu_max <= ip->i_mount->m_sb.sb_blocksize)
return 0;
/*
* Advertise the maximum size of an atomic write that we can tell the
* block device to perform for us. In general the bdev limit will be
* less than our out of place write limit, but we don't want to exceed
* the awu_max.
*/
return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max);
}
static void
xfs_report_atomic_write(
struct xfs_inode *ip,
struct kstat *stat)
{
-unsigned int unit_min = 0, unit_max = 0;
-
-if (xfs_inode_can_atomicwrite(ip))
-unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
-generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
+generic_fill_statx_atomic_writes(stat,
+xfs_get_atomic_write_min(ip),
+xfs_get_atomic_write_max(ip),
+xfs_get_atomic_write_max_opt(ip));
}
STATIC int
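As a concrete illustration of the three helpers above (numbers invented): on a 4 KiB-block filesystem whose data device advertises a 64 KiB hardware atomic write unit and whose software (CoW-based) completion limit works out to 16 MiB, statx would report unit_min = 4 KiB, unit_max = 16 MiB and unit_max_opt = 64 KiB.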

View File

@@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir,
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip);
+
#endif /* __XFS_IOPS_H__ */

View File

@@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs(
* Then round nbytes up to 64-bit alignment so that the initial
* buffer alignment is easy to calculate and verify.
*/
-nbytes += niovecs *
-(sizeof(uint64_t) + sizeof(struct xlog_op_header));
-nbytes = round_up(nbytes, sizeof(uint64_t));
+nbytes = xlog_item_space(niovecs, nbytes);
/*
* The data buffer needs to start 64-bit aligned, so round up

View File

@@ -698,4 +698,17 @@ xlog_kvmalloc(
return p;
}
/*
* Given a count of iovecs and space for a log item, compute the space we need
* in the log to store that data plus the log headers.
*/
static inline unsigned int
xlog_item_space(
unsigned int niovecs,
unsigned int nbytes)
{
nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
return round_up(nbytes, sizeof(uint64_t));
}
#endif /* __XFS_LOG_PRIV_H__ */
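A quick worked example of the helper above (illustrative only, assuming the usual 12-byte struct xlog_op_header): xlog_item_space(2, 100) = round_up(100 + 2 * (8 + 12), 8) = 144 bytes of log space.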

View File

@@ -141,14 +141,6 @@ xfs_warn_experimental(
const char *name;
long opstate;
} features[] = {
-[XFS_EXPERIMENTAL_PNFS] = {
-.opstate = XFS_OPSTATE_WARNED_PNFS,
-.name = "pNFS",
-},
-[XFS_EXPERIMENTAL_SCRUB] = {
-.opstate = XFS_OPSTATE_WARNED_SCRUB,
-.name = "online scrub",
-},
[XFS_EXPERIMENTAL_SHRINK] = {
.opstate = XFS_OPSTATE_WARNED_SHRINK,
.name = "online shrink",
@@ -161,14 +153,6 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_LBS,
.name = "large block size",
},
-[XFS_EXPERIMENTAL_EXCHRANGE] = {
-.opstate = XFS_OPSTATE_WARNED_EXCHRANGE,
-.name = "exchange range",
-},
-[XFS_EXPERIMENTAL_PPTR] = {
-.opstate = XFS_OPSTATE_WARNED_PPTR,
-.name = "parent pointer",
-},
[XFS_EXPERIMENTAL_METADIR] = {
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",

View File

@@ -91,13 +91,9 @@ void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg,
const char *fmt, ...);
enum xfs_experimental_feat {
-XFS_EXPERIMENTAL_PNFS,
-XFS_EXPERIMENTAL_SCRUB,
XFS_EXPERIMENTAL_SHRINK,
XFS_EXPERIMENTAL_LARP,
XFS_EXPERIMENTAL_LBS,
-XFS_EXPERIMENTAL_EXCHRANGE,
-XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
XFS_EXPERIMENTAL_ZONED,

View File

@@ -666,6 +666,158 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
/* Maximum atomic write IO size that the kernel allows. */
static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp)
{
return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
}
static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
{
return 1 << (ffs(nr) - 1);
}
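As an illustration with made-up numbers: for an AG of 1441792 blocks (11 * 2^17), max_pow_of_two_factor() returns 1 << (ffs(1441792) - 1) = 131072 blocks, i.e. 512 MiB with 4 KiB blocks, which is then clamped further by the other limits in xfs_calc_atomic_write_unit_max() below.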
/*
* If the data device advertises atomic write support, limit the size of data
* device atomic writes to the greatest power-of-two factor of the AG size so
* that every atomic write unit aligns with the start of every AG. This is
* required so that the per-AG allocations for an atomic write will always be
* aligned compatibly with the alignment requirements of the storage.
*
* If the data device doesn't advertise atomic writes, then there are no
* alignment restrictions and the largest out-of-place write we can do
* ourselves is the number of blocks that user files can allocate from any AG.
*/
static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp)
{
if (mp->m_ddev_targp->bt_bdev_awu_min > 0)
return max_pow_of_two_factor(mp->m_sb.sb_agblocks);
return rounddown_pow_of_two(mp->m_ag_max_usable);
}
/*
* Reflink on the realtime device requires rtgroups, and atomic writes require
* reflink.
*
* If the realtime device advertises atomic write support, limit the size of
* data device atomic writes to the greatest power-of-two factor of the rtgroup
* size so that every atomic write unit aligns with the start of every rtgroup.
* This is required so that the per-rtgroup allocations for an atomic write
* will always be aligned compatibly with the alignment requirements of the
* storage.
*
* If the rt device doesn't advertise atomic writes, then there are no
* alignment restrictions and the largest out-of-place write we can do
* ourselves is the number of blocks that user files can allocate from any
* rtgroup.
*/
static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp)
{
struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
if (rgs->blocks == 0)
return 0;
if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev_awu_min > 0)
return max_pow_of_two_factor(rgs->blocks);
return rounddown_pow_of_two(rgs->blocks);
}
/* Compute the maximum atomic write unit size for each section. */
static inline void
xfs_calc_atomic_write_unit_max(
struct xfs_mount *mp)
{
struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG];
struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp);
const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp);
const xfs_extlen_t max_rgsize = xfs_calc_rtgroup_awu_max(mp);
ags->awu_max = min3(max_write, max_ioend, max_agsize);
rgs->awu_max = min3(max_write, max_ioend, max_rgsize);
trace_xfs_calc_atomic_write_unit_max(mp, max_write, max_ioend,
max_agsize, max_rgsize);
}
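Put differently, each group type's awu_max is simply the smallest of the three constraints computed above. A standalone sketch with hypothetical numbers (4k filesystem blocks assumed; none of these values come from this patch):

#include <stdio.h>

static unsigned int min3u(unsigned int a, unsigned int b, unsigned int c)
{
	unsigned int m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	unsigned int max_write  = 4096;		/* 16 MiB in 4k blocks */
	unsigned int max_ioend  = 2048;		/*  8 MiB in 4k blocks */
	unsigned int max_agsize = 262144;	/*  1 GiB in 4k blocks */

	/* The smallest constraint wins, here the ioend limit. */
	printf("awu_max = %u fsblocks\n", min3u(max_write, max_ioend, max_agsize));
	return 0;
}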
/*
* Try to set the atomic write maximum to a new value that we got from
* userspace via mount option.
*/
int
xfs_set_max_atomic_write_opt(
struct xfs_mount *mp,
unsigned long long new_max_bytes)
{
const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);
const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
const xfs_extlen_t max_group =
max(mp->m_groups[XG_TYPE_AG].blocks,
mp->m_groups[XG_TYPE_RTG].blocks);
const xfs_extlen_t max_group_write =
max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp));
int error;
if (new_max_bytes == 0)
goto set_limit;
ASSERT(max_write <= U32_MAX);
/* generic_atomic_write_valid enforces power of two length */
if (!is_power_of_2(new_max_bytes)) {
xfs_warn(mp,
"max atomic write size of %llu bytes is not a power of 2",
new_max_bytes);
return -EINVAL;
}
if (new_max_bytes & mp->m_blockmask) {
xfs_warn(mp,
"max atomic write size of %llu bytes not aligned with fsblock",
new_max_bytes);
return -EINVAL;
}
if (new_max_fsbs > max_write) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than max write size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_write) >> 10);
return -EINVAL;
}
if (new_max_fsbs > max_group) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than allocation group size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_group) >> 10);
return -EINVAL;
}
if (new_max_fsbs > max_group_write) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than max allocation group write size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_group_write) >> 10);
return -EINVAL;
}
set_limit:
error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
if (error) {
xfs_warn(mp,
"cannot support completing atomic writes of %lluk",
new_max_bytes >> 10);
return error;
}
xfs_calc_atomic_write_unit_max(mp);
mp->m_awu_max_bytes = new_max_bytes;
return 0;
}
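The rejection order above is: not a power of two, not block-aligned, larger than the maximum write size, larger than any allocation group, larger than the largest aligned group write, and finally larger than what the log can reserve for. A hedged userspace sketch of just the first two checks (4096-byte blocks assumed, names hypothetical):

#include <assert.h>
#include <stdbool.h>

static bool valid_atomic_write_len(unsigned long long len,
				   unsigned long long blocksize)
{
	/* power-of-two length, as generic_atomic_write_valid requires */
	if (len == 0 || (len & (len - 1)))
		return false;
	/* must be a whole number of filesystem blocks */
	if (len & (blocksize - 1))
		return false;
	return true;
}

int main(void)
{
	assert(valid_atomic_write_len(65536, 4096));
	assert(!valid_atomic_write_len(6144, 4096)); /* not a power of two */
	assert(!valid_atomic_write_len(2048, 4096)); /* smaller than a block */
	return 0;
}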
/* Compute maximum possible height for realtime btree types for this fs. */
static inline void
xfs_rtbtree_compute_maxlevels(
@ -1082,6 +1234,15 @@ xfs_mountfs(
xfs_zone_gc_start(mp);
}
/*
* Pre-calculate atomic write unit max. This involves computations
* derived from transaction reservations, so we must do this after the
* log is fully initialized.
*/
error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes);
if (error)
goto out_agresv;
return 0;
out_agresv:

View File

@ -119,6 +119,12 @@ struct xfs_groups {
* SMR hard drives.
*/
xfs_fsblock_t start_fsb;
/*
* Maximum length of an atomic write for files stored in this
* collection of allocation groups, in fsblocks.
*/
xfs_extlen_t awu_max;
};
struct xfs_freecounter {
@ -230,6 +236,10 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@ -464,6 +474,11 @@ static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
return !xfs_has_zoned(mp);
}
static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp)
{
return xfs_has_reflink(mp);
}
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
@ -543,10 +558,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
*/
#define XFS_OPSTATE_BLOCKGC_ENABLED 6
/* Kernel has logged a warning about pNFS being used on this fs. */
#define XFS_OPSTATE_WARNED_PNFS 7
/* Kernel has logged a warning about online fsck being used on this fs. */
#define XFS_OPSTATE_WARNED_SCRUB 8
/* Kernel has logged a warning about shrink being used on this fs. */
#define XFS_OPSTATE_WARNED_SHRINK 9
/* Kernel has logged a warning about logged xattr updates being used. */
@ -559,10 +570,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_USE_LARP 13
/* Kernel has logged a warning about blocksize > pagesize on this fs. */
#define XFS_OPSTATE_WARNED_LBS 14
/* Kernel has logged a warning about exchange-range being used on this fs. */
#define XFS_OPSTATE_WARNED_EXCHRANGE 15
/* Kernel has logged a warning about parent pointers being used on this fs. */
#define XFS_OPSTATE_WARNED_PPTR 16
/* Kernel has logged a warning about metadata dirs being used on this fs. */
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
@ -631,7 +638,6 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
{ (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
{ (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
{ (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \
@ -793,4 +799,7 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
percpu_counter_add(&mp->m_delalloc_blks, delta);
}
int xfs_set_max_atomic_write_opt(struct xfs_mount *mp,
unsigned long long new_max_bytes);
#endif /* __XFS_MOUNT_H__ */

View File

@ -414,6 +414,8 @@ xfs_mru_cache_destroy(
* To insert an element, call xfs_mru_cache_insert() with the data store, the
* element's key and the client data pointer. This function returns 0 on
* success or ENOMEM if memory for the data element couldn't be allocated.
*
* The passed in elem is freed through the per-cache free_func on failure.
*/
int
xfs_mru_cache_insert(
@ -421,14 +423,15 @@ xfs_mru_cache_insert(
unsigned long key,
struct xfs_mru_cache_elem *elem)
{
int error;
int error = -EINVAL;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
return -EINVAL;
goto out_free;
error = -ENOMEM;
if (radix_tree_preload(GFP_KERNEL))
return -ENOMEM;
goto out_free;
INIT_LIST_HEAD(&elem->list_node);
elem->key = key;
@ -440,6 +443,12 @@ xfs_mru_cache_insert(
_xfs_mru_cache_list_insert(mru, elem);
spin_unlock(&mru->lock);
if (error)
goto out_free;
return 0;
out_free:
mru->free_func(mru->data, elem);
return error;
}
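The behavioural change here is an ownership transfer: xfs_mru_cache_insert() now disposes of the element through the cache's free_func on every failure path, so callers can use fire-and-forget insertion (the zone cache added later in this series relies on exactly that). A kernel-style caller sketch, illustrative only and with hypothetical names:

struct my_elem {
	struct xfs_mru_cache_elem	mru;
	/* payload ... */
};

static void my_cache_note(struct xfs_mru_cache *cache, unsigned long key)
{
	struct my_elem	*item;

	item = kmalloc(sizeof(*item), GFP_KERNEL);
	if (!item)
		return;
	/*
	 * No error handling needed: on failure the element has already been
	 * released through the cache's free_func.
	 */
	xfs_mru_cache_insert(cache, key, &item->mru);
}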

View File

@ -58,8 +58,6 @@ xfs_fs_get_uuid(
{
struct xfs_mount *mp = XFS_M(sb);
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS);
if (*len < sizeof(uuid_t))
return -EINVAL;

View File

@ -78,6 +78,11 @@ xfs_cui_item_size(
*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
}
unsigned int xfs_cui_log_space(unsigned int nr)
{
return xlog_item_space(1, xfs_cui_log_format_sizeof(nr));
}
/*
* This is called to fill in the vector of log iovecs for the
* given cui log item. We use only 1 iovec, and we point that
@ -179,6 +184,11 @@ xfs_cud_item_size(
*nbytes += sizeof(struct xfs_cud_log_format);
}
unsigned int xfs_cud_log_space(void)
{
return xlog_item_space(1, sizeof(struct xfs_cud_log_format));
}
/*
* This is called to fill in the vector of log iovecs for the
* given cud log item. We use only 1 iovec, and we point that

View File

@ -76,4 +76,7 @@ struct xfs_refcount_intent;
void xfs_refcount_defer_add(struct xfs_trans *tp,
struct xfs_refcount_intent *ri);
unsigned int xfs_cui_log_space(unsigned int nr);
unsigned int xfs_cud_log_space(void);
#endif /* __XFS_REFCOUNT_ITEM_H__ */

View File

@ -293,7 +293,7 @@ xfs_bmap_trim_cow(
return xfs_reflink_trim_around_shared(ip, imap, shared);
}
static int
int
xfs_reflink_convert_cow_locked(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
@ -786,35 +786,19 @@ xfs_reflink_update_quota(
* requirements as low as possible.
*/
STATIC int
xfs_reflink_end_cow_extent(
xfs_reflink_end_cow_extent_locked(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
int nmaps;
bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
/*
* Lock the inode. We have to ijoin without automatic unlock because
* the lead transaction is the refcountbt record deletion; the data
* fork update follows as a deferred log item.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
goto out_cancel;
return 0;
}
/*
@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_next_extent(ifp, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
goto out_cancel;
return 0;
}
}
del = got;
@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
goto out_cancel;
return error;
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
&nmaps, 0);
if (error)
goto out_cancel;
return error;
/* We can only remap the smaller of the two extent sizes. */
data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent(
error = xfs_bunmapi(NULL, ip, data.br_startoff,
data.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
return error;
ASSERT(done);
}
@ -899,17 +883,45 @@
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
/* Update the caller about how much progress we made. */
*offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
}
out_cancel:
xfs_trans_cancel(tp);
/*
* Remap part of the CoW fork into the data fork.
*
* We aim to remap the range starting at @offset_fsb and ending at @end_fsb
* into the data fork; this function will remap what it can (at the end of the
* range) and update @end_fsb appropriately. Each remap gets its own
* transaction because we can end up merging and splitting bmbt blocks for
* every remap operation and we'd like to keep the block reservation
* requirements as low as possible.
*/
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
unsigned int resblks;
int error;
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb);
if (error)
xfs_trans_cancel(tp);
else
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@ -972,6 +984,78 @@ xfs_reflink_end_cow(
return error;
}
/*
* Fully remap all of the file's data fork at once, which is the critical part
* in achieving atomic behaviour.
* The regular CoW end path does not use this function, so as to keep the block
* reservation per transaction as low as possible.
*/
int
xfs_reflink_end_atomic_cow(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t count)
{
xfs_fileoff_t offset_fsb;
xfs_fileoff_t end_fsb;
int error = 0;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
unsigned int resblks;
trace_xfs_reflink_end_cow(ip, offset, count);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
end_fsb = XFS_B_TO_FSB(mp, offset + count);
/*
* Each remapping operation could cause a btree split, so in the worst
* case that's one for each block.
*/
resblks = (end_fsb - offset_fsb) *
XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0,
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
while (end_fsb > offset_fsb && !error) {
error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
end_fsb);
}
if (error) {
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
goto out_cancel;
}
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
/* Compute the largest atomic write that we can complete through software. */
xfs_extlen_t
xfs_reflink_max_atomic_cow(
struct xfs_mount *mp)
{
/* We cannot do any atomic writes without out of place writes. */
if (!xfs_can_sw_atomic_write(mp))
return 0;
/*
* Atomic write limits must always be a power-of-2, according to
* generic_atomic_write_valid.
*/
return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp));
}
/*
* Free all CoW staging blocks that are still referenced by the ondisk refcount
* metadata. The ondisk metadata does not track which inode created the

View File

@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
int xfs_reflink_convert_cow_locked(struct xfs_inode *ip,
xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
@ -43,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count, bool cancel_real);
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
@ -64,4 +68,6 @@ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize);
xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp);
#endif /* __XFS_REFLINK_H */

View File

@ -77,6 +77,11 @@ xfs_rui_item_size(
*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
unsigned int xfs_rui_log_space(unsigned int nr)
{
return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
}
/*
* This is called to fill in the vector of log iovecs for the
* given rui log item. We use only 1 iovec, and we point that
@ -180,6 +185,11 @@ xfs_rud_item_size(
*nbytes += sizeof(struct xfs_rud_log_format);
}
unsigned int xfs_rud_log_space(void)
{
return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
}
/*
* This is called to fill in the vector of log iovecs for the
* given rud log item. We use only 1 iovec, and we point that

View File

@ -75,4 +75,7 @@ struct xfs_rmap_intent;
void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
unsigned int xfs_rui_log_space(unsigned int nr);
unsigned int xfs_rud_log_space(void);
#endif /* __XFS_RMAP_ITEM_H__ */

View File

@ -111,7 +111,7 @@ enum {
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
Opt_lifetime, Opt_nolifetime,
Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_u32("max_open_zones", Opt_max_open_zones),
fsparam_flag("lifetime", Opt_lifetime),
fsparam_flag("nolifetime", Opt_nolifetime),
fsparam_string("max_atomic_write", Opt_max_atomic_write),
{}
};
@ -241,6 +242,9 @@ xfs_fs_show_options(
if (mp->m_max_open_zones)
seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
if (mp->m_awu_max_bytes)
seq_printf(m, ",max_atomic_write=%lluk",
mp->m_awu_max_bytes >> 10);
return 0;
}
@ -380,10 +384,11 @@ xfs_blkdev_get(
struct file **bdev_filep)
{
int error = 0;
blk_mode_t mode;
*bdev_filep = bdev_file_open_by_path(name,
BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
mode = sb_open_mode(mp->m_super->s_flags);
*bdev_filep = bdev_file_open_by_path(name, mode,
mp->m_super, &fs_holder_ops);
if (IS_ERR(*bdev_filep)) {
error = PTR_ERR(*bdev_filep);
*bdev_filep = NULL;
@ -481,21 +486,29 @@ xfs_open_devices(
/*
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
if (!mp->m_ddev_targp)
if (IS_ERR(mp->m_ddev_targp)) {
error = PTR_ERR(mp->m_ddev_targp);
mp->m_ddev_targp = NULL;
goto out_close_rtdev;
}
if (rtdev_file) {
mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
if (!mp->m_rtdev_targp)
if (IS_ERR(mp->m_rtdev_targp)) {
error = PTR_ERR(mp->m_rtdev_targp);
mp->m_rtdev_targp = NULL;
goto out_free_ddev_targ;
}
}
if (logdev_file && file_bdev(logdev_file) != ddev) {
mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
if (!mp->m_logdev_targp)
if (IS_ERR(mp->m_logdev_targp)) {
error = PTR_ERR(mp->m_logdev_targp);
mp->m_logdev_targp = NULL;
goto out_free_rtdev_targ;
}
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
/* Handle won't be used, drop it */
@ -528,7 +541,7 @@ xfs_setup_devices(
{
int error;
error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
if (error)
return error;
@ -537,7 +550,7 @@ xfs_setup_devices(
if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_setsize_buftarg(mp->m_logdev_targp,
error = xfs_configure_buftarg(mp->m_logdev_targp,
log_sector_size);
if (error)
return error;
@ -551,7 +564,7 @@ xfs_setup_devices(
}
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
error = xfs_configure_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
return error;
@ -1334,6 +1347,42 @@ suffix_kstrtoint(
return ret;
}
static int
suffix_kstrtoull(
const char *s,
unsigned int base,
unsigned long long *res)
{
int last, shift_left_factor = 0;
unsigned long long _res;
char *value;
int ret = 0;
value = kstrdup(s, GFP_KERNEL);
if (!value)
return -ENOMEM;
last = strlen(value) - 1;
if (value[last] == 'K' || value[last] == 'k') {
shift_left_factor = 10;
value[last] = '\0';
}
if (value[last] == 'M' || value[last] == 'm') {
shift_left_factor = 20;
value[last] = '\0';
}
if (value[last] == 'G' || value[last] == 'g') {
shift_left_factor = 30;
value[last] = '\0';
}
if (kstrtoull(value, base, &_res))
ret = -EINVAL;
kfree(value);
*res = _res << shift_left_factor;
return ret;
}
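A userspace approximation of the suffix handling above (assumption: strtoull() standing in for kstrtoull(), base 10 only), so that "64k" parses to 65536 and "1g" to 1 << 30:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

static int parse_size(const char *s, unsigned long long *res)
{
	char buf[32];
	size_t last;
	int shift = 0;

	if (!*s || strlen(s) >= sizeof(buf))
		return -1;
	strcpy(buf, s);
	last = strlen(buf) - 1;
	switch (buf[last]) {
	case 'K': case 'k': shift = 10; buf[last] = '\0'; break;
	case 'M': case 'm': shift = 20; buf[last] = '\0'; break;
	case 'G': case 'g': shift = 30; buf[last] = '\0'; break;
	}
	*res = strtoull(buf, NULL, 10) << shift;
	return 0;
}

int main(void)
{
	unsigned long long v;

	assert(parse_size("64k", &v) == 0 && v == 64ULL << 10);
	assert(parse_size("16m", &v) == 0 && v == 16ULL << 20);
	return 0;
}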
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
@ -1518,6 +1567,14 @@ xfs_fs_parse_param(
case Opt_nolifetime:
parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
return 0;
case Opt_max_atomic_write:
if (suffix_kstrtoull(param->string, 10,
&parsing_mp->m_awu_max_bytes)) {
xfs_warn(parsing_mp,
"max atomic write size must be positive integer");
return -EINVAL;
}
return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@ -1897,13 +1954,6 @@ xfs_fs_fill_super(
}
}
if (xfs_has_exchange_range(mp))
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE);
if (xfs_has_parent(mp))
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR);
/*
* If no quota mount options were provided, maybe we'll try to pick
* up the quota accounting and enforcement flags from the ondisk sb.
@ -1969,6 +2019,20 @@ xfs_remount_rw(
struct xfs_sb *sbp = &mp->m_sb;
int error;
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp &&
bdev_read_only(mp->m_logdev_targp->bt_bdev)) {
xfs_warn(mp,
"ro->rw transition prohibited by read-only logdev");
return -EACCES;
}
if (mp->m_rtdev_targp &&
bdev_read_only(mp->m_rtdev_targp->bt_bdev)) {
xfs_warn(mp,
"ro->rw transition prohibited by read-only rtdev");
return -EACCES;
}
if (xfs_has_norecovery(mp)) {
xfs_warn(mp,
"ro->rw transition prohibited on norecovery mount");
@ -2129,6 +2193,14 @@ xfs_fs_reconfigure(
mp->m_features |= XFS_FEAT_ATTR2;
}
/* Validate new max_atomic_write option before making other changes */
if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
error = xfs_set_max_atomic_write_opt(mp,
new_mp->m_awu_max_bytes);
if (error)
return error;
}
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;

View File

@ -29,8 +29,6 @@ typedef struct xfs_param {
xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */
xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */
xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */

View File

@ -170,6 +170,99 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
TRACE_EVENT(xfs_calc_atomic_write_unit_max,
TP_PROTO(struct xfs_mount *mp, unsigned int max_write,
unsigned int max_ioend, unsigned int max_agsize,
unsigned int max_rgsize),
TP_ARGS(mp, max_write, max_ioend, max_agsize, max_rgsize),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, max_write)
__field(unsigned int, max_ioend)
__field(unsigned int, max_agsize)
__field(unsigned int, max_rgsize)
__field(unsigned int, data_awu_max)
__field(unsigned int, rt_awu_max)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->max_write = max_write;
__entry->max_ioend = max_ioend;
__entry->max_agsize = max_agsize;
__entry->max_rgsize = max_rgsize;
__entry->data_awu_max = mp->m_groups[XG_TYPE_AG].awu_max;
__entry->rt_awu_max = mp->m_groups[XG_TYPE_RTG].awu_max;
),
TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u max_rgsize %u data_awu_max %u rt_awu_max %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->max_write,
__entry->max_ioend,
__entry->max_agsize,
__entry->max_rgsize,
__entry->data_awu_max,
__entry->rt_awu_max)
);
TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
unsigned int step_size, unsigned int logres,
unsigned int blockcount),
TP_ARGS(mp, per_intent, step_size, logres, blockcount),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, per_intent)
__field(unsigned int, step_size)
__field(unsigned int, logres)
__field(unsigned int, blockcount)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->per_intent = per_intent;
__entry->step_size = step_size;
__entry->logres = logres;
__entry->blockcount = blockcount;
),
TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->per_intent,
__entry->step_size,
__entry->logres,
__entry->blockcount)
);
TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry,
TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
unsigned int step_size, unsigned int blockcount,
unsigned int min_logblocks, unsigned int logres),
TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, per_intent)
__field(unsigned int, step_size)
__field(unsigned int, blockcount)
__field(unsigned int, min_logblocks)
__field(unsigned int, cur_logblocks)
__field(unsigned int, logres)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->per_intent = per_intent;
__entry->step_size = step_size;
__entry->blockcount = blockcount;
__entry->min_logblocks = min_logblocks;
__entry->cur_logblocks = mp->m_sb.sb_logblocks;
__entry->logres = logres;
),
TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->per_intent,
__entry->step_size,
__entry->blockcount,
__entry->min_logblocks,
__entry->cur_logblocks,
__entry->logres)
);
TRACE_EVENT(xlog_intent_recovery_failed,
TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
int error),
@ -1657,6 +1750,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
TRACE_EVENT(xfs_iomap_atomic_write_cow,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
TP_ARGS(ip, offset, count),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_off_t, offset)
__field(ssize_t, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->offset = offset;
__entry->count = count;
),
TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->offset,
__entry->count)
)
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),

View File

@ -24,6 +24,7 @@
#include "xfs_zone_priv.h" #include "xfs_zone_priv.h"
#include "xfs_zones.h" #include "xfs_zones.h"
#include "xfs_trace.h" #include "xfs_trace.h"
#include "xfs_mru_cache.h"
void
xfs_open_zone_put(
@ -796,6 +797,100 @@ xfs_submit_zoned_bio(
submit_bio(&ioend->io_bio);
}
/*
* Cache the last zone written to for an inode so that it is considered first
* for subsequent writes.
*/
struct xfs_zone_cache_item {
struct xfs_mru_cache_elem mru;
struct xfs_open_zone *oz;
};
static inline struct xfs_zone_cache_item *
xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
{
return container_of(mru, struct xfs_zone_cache_item, mru);
}
static void
xfs_zone_cache_free_func(
void *data,
struct xfs_mru_cache_elem *mru)
{
struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
xfs_open_zone_put(item->oz);
kfree(item);
}
/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
xfs_cached_zone(
struct xfs_mount *mp,
struct xfs_inode *ip)
{
struct xfs_mru_cache_elem *mru;
struct xfs_open_zone *oz;
mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
if (!mru)
return NULL;
oz = xfs_zone_cache_item(mru)->oz;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
}
xfs_mru_cache_done(mp->m_zone_cache);
return oz;
}
/*
* Update the last used zone cache for a given inode.
*
* The caller must have a reference on the open zone.
*/
static void
xfs_zone_cache_create_association(
struct xfs_inode *ip,
struct xfs_open_zone *oz)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_cache_item *item = NULL;
struct xfs_mru_cache_elem *mru;
ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
if (mru) {
/*
* If we have an association already, update it to point to the
* new zone.
*/
item = xfs_zone_cache_item(mru);
xfs_open_zone_put(item->oz);
item->oz = oz;
xfs_mru_cache_done(mp->m_zone_cache);
return;
}
item = kmalloc(sizeof(*item), GFP_KERNEL);
if (!item) {
xfs_open_zone_put(oz);
return;
}
item->oz = oz;
xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
}
void
xfs_zone_alloc_and_submit(
struct iomap_ioend *ioend,
@ -819,11 +914,16 @@ xfs_zone_alloc_and_submit(
*/
if (!*oz && ioend->io_offset)
*oz = xfs_last_used_zone(ioend);
if (!*oz)
*oz = xfs_cached_zone(mp, ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
xfs_zone_cache_create_association(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@ -1211,6 +1311,14 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
/*
* Set up a mru cache to track inode to open zone for data placement
* purposes. The magic values for group count and life time is the
* same as the defaults for file streams, which seems sane enough.
*/
xfs_mru_cache_create(&mp->m_zone_cache, mp,
5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@ -1224,4 +1332,5 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
xfs_mru_cache_destroy(mp->m_zone_cache);
}

View File

@ -3502,7 +3502,8 @@ void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
unsigned int unit_min,
unsigned int unit_max);
unsigned int unit_max,
unsigned int unit_max_opt);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);

View File

@ -57,6 +57,7 @@ struct kstat {
u32 dio_read_offset_align;
u32 atomic_write_unit_min;
u32 atomic_write_unit_max;
u32 atomic_write_unit_max_opt;
u32 atomic_write_segments_max;
};

View File

@ -182,8 +182,12 @@ struct statx {
/* File offset alignment for direct I/O reads */
__u32 stx_dio_read_offset_align;
/* 0xb8 */
__u64 __spare3[9]; /* Spare space for future expansion */
/* Optimised max atomic write unit in bytes */
__u32 stx_atomic_write_unit_max_opt;
__u32 __spare2[1];
/* 0xc0 */
__u64 __spare3[8]; /* Spare space for future expansion */
/* 0x100 */
};
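The new stx_atomic_write_unit_max_opt field reports an "optimised" upper bound alongside the existing min/max pair. A hedged userspace sketch of querying the limits via statx(2) (assumes a libc and uapi headers new enough to provide STATX_WRITE_ATOMIC and the stx_atomic_write_unit_* fields; the _opt field additionally needs headers from this release):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	if (statx(AT_FDCWD, argv[1], 0, STATX_WRITE_ATOMIC, &stx) != 0) {
		perror("statx");
		return 1;
	}
	printf("atomic write unit min %u max %u\n",
	       stx.stx_atomic_write_unit_min,
	       stx.stx_atomic_write_unit_max);
	return 0;
}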