Merge tag 'xfs-merge-6.18' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Carlos Maiolino:
 "For this merge window, there are really no new features, but there are
  a few things worth emphasizing:

   - Deprecated for years already, the (no)attr2 and (no)ikeep mount
     options have been removed for good

   - Several cleanups (especially the removal of typedefs) and bug fixes

   - Improvements made in the online repair reap calculations

   - online fsck is now enabled by default"

* tag 'xfs-merge-6.18' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (53 commits)
  xfs: rework datasync tracking and execution
  xfs: rearrange code in xfs_inode_item_precommit
  xfs: scrub: use kstrdup_const() for metapath scan setups
  xfs: use bt_nr_sectors in xfs_dax_translate_range
  xfs: track the number of blocks in each buftarg
  xfs: constify xfs_errortag_random_default
  xfs: improve default maximum number of open zones
  xfs: improve zone statistics message
  xfs: centralize error tag definitions
  xfs: remove pointless externs in xfs_error.h
  xfs: remove the expr argument to XFS_TEST_ERROR
  xfs: remove xfs_errortag_set
  xfs: remove xfs_errortag_get
  xfs: move the XLOG_REG_ constants out of xfs_log_format.h
  xfs: adjust the hint based zone allocation policy
  xfs: refactor hint based zone allocation
  fs: add an enum for number of life time hints
  xfs: fix log CRC mismatches between i386 and other architectures
  xfs: rename the old_crc variable in xlog_recover_process
  xfs: remove the unused xfs_log_iovec_t typedef
  ...
This commit is contained in:
Linus Torvalds
2025-09-29 14:35:44 -07:00
65 changed files with 1251 additions and 1055 deletions


@@ -34,22 +34,6 @@ When mounting an XFS filesystem, the following options are accepted.
to the file. Specifying a fixed ``allocsize`` value turns off
the dynamic behaviour.
attr2 or noattr2
The options enable/disable an "opportunistic" improvement to
be made in the way inline extended attributes are stored
on-disk. When the new form is used for the first time when
``attr2`` is selected (either when setting or removing extended
attributes) the on-disk superblock feature bit field will be
updated to reflect this format being in use.
The default behaviour is determined by the on-disk feature
bit indicating that ``attr2`` behaviour is active. If either
mount option is set, then that becomes the new default used
by the filesystem.
CRC enabled filesystems always use the ``attr2`` format, and so
will reject the ``noattr2`` mount option if it is set.
discard or nodiscard (default)
Enable/disable the issuing of commands to let the block
device reclaim space freed by the filesystem. This is
@@ -75,12 +59,6 @@ When mounting an XFS filesystem, the following options are accepted.
across the entire filesystem rather than just on directories
configured to use it.
ikeep or noikeep (default)
When ``ikeep`` is specified, XFS does not delete empty inode
clusters and keeps them around on disk. When ``noikeep`` is
specified, empty inode clusters are returned to the free
space pool.
inode32 or inode64 (default)
When ``inode32`` is specified, it indicates that XFS limits
inode creation to locations which will not result in inode
@@ -253,9 +231,8 @@ latest version and try again.
The deprecation will take place in two parts. Support for mounting V4
filesystems can now be disabled at kernel build time via Kconfig option.
The option will default to yes until September 2025, at which time it
will be changed to default to no. In September 2030, support will be
removed from the codebase entirely.
These options were changed to default to no in September 2025. In
September 2030, support will be removed from the codebase entirely.
Note: Distributors may choose to withdraw V4 format support earlier than
the dates listed above.
@@ -268,8 +245,6 @@ Deprecated Mount Options
============================ ================
Mounting with V4 filesystem September 2030
Mounting ascii-ci filesystem September 2030
ikeep/noikeep September 2025
attr2/noattr2 September 2025
============================ ================
@@ -285,6 +260,8 @@ Removed Mount Options
osyncisdsync/osyncisosync v4.0
barrier v4.19
nobarrier v4.19
ikeep/noikeep v6.18
attr2/noattr2 v6.18
=========================== =======
sysctls
@@ -312,9 +289,6 @@ The following sysctls are available for the XFS filesystem:
removes unused preallocation from clean inodes and releases
the unused space back to the free pool.
fs.xfs.speculative_cow_prealloc_lifetime
This is an alias for speculative_prealloc_lifetime.
fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
A volume knob for error reporting when internal errors occur.
This will generate detailed messages & backtraces for filesystem
@@ -341,17 +315,6 @@ The following sysctls are available for the XFS filesystem:
This option is intended for debugging only.
fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1)
Controls whether symlinks are created with mode 0777 (default)
or whether their mode is affected by the umask (irix mode).
fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1)
Controls files created in SGID directories.
If the group ID of the new file does not match the effective group
ID or one of the supplementary group IDs of the parent dir, the
ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
is set.
fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1)
Setting this to "1" will cause the "sync" flag set
by the **xfs_io(8)** chattr command on a directory to be
@@ -387,24 +350,20 @@ The following sysctls are available for the XFS filesystem:
Deprecated Sysctls
==================
=========================================== ================
Name Removal Schedule
=========================================== ================
fs.xfs.irix_sgid_inherit September 2025
fs.xfs.irix_symlink_mode September 2025
fs.xfs.speculative_cow_prealloc_lifetime September 2025
=========================================== ================
None currently.
Removed Sysctls
===============
============================= =======
Name Removed
============================= =======
fs.xfs.xfsbufd_centisec v4.0
fs.xfs.age_buffer_centisecs v4.0
============================= =======
========================================== =======
Name Removed
========================================== =======
fs.xfs.xfsbufd_centisec v4.0
fs.xfs.age_buffer_centisecs v4.0
fs.xfs.irix_symlink_mode v6.18
fs.xfs.irix_sgid_inherit v6.18
fs.xfs.speculative_cow_prealloc_lifetime v6.18
========================================== =======
Error handling
==============


@@ -25,7 +25,7 @@ config XFS_FS
config XFS_SUPPORT_V4
bool "Support deprecated V4 (crc=0) format"
depends on XFS_FS
default y
default n
help
The V4 filesystem format lacks certain features that are supported
by the V5 format, such as metadata checksumming, strengthened
@@ -40,7 +40,7 @@ config XFS_SUPPORT_V4
filesystem is a V4 filesystem. If no such string is found, please
upgrade xfsprogs to the latest version and try again.
This option will become default N in September 2025. Support for the
This option became default N in September 2025. Support for the
V4 format will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -50,7 +50,7 @@ config XFS_SUPPORT_V4
config XFS_SUPPORT_ASCII_CI
bool "Support deprecated case-insensitive ascii (ascii-ci=1) format"
depends on XFS_FS
default y
default n
help
The ASCII case insensitivity filesystem feature only works correctly
on systems that have been coerced into using ISO 8859-1, and it does
@@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI
filesystem is a case-insensitive filesystem. If no such string is
found, please upgrade xfsprogs to the latest version and try again.
This option will become default N in September 2025. Support for the
This option became default N in September 2025. Support for the
feature will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -137,7 +137,7 @@ config XFS_BTREE_IN_MEM
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
default y
depends on XFS_FS
depends on TMPFS && SHMEM
select XFS_LIVE_HOOKS
@@ -150,12 +150,8 @@ config XFS_ONLINE_SCRUB
advantage here is to look for problems proactively so that
they can be dealt with in a controlled manner.
This feature is considered EXPERIMENTAL. Use with caution!
See the xfs_scrub man page in section 8 for additional information.
If unsure, say N.
config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
@@ -171,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS
Usage data are collected in /sys/kernel/debug/xfs/scrub.
If unsure, say N.
config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
default n
default y
depends on XFS_FS && XFS_ONLINE_SCRUB
select XFS_BTREE_IN_MEM
help
@@ -186,12 +180,8 @@ config XFS_ONLINE_REPAIR
formatted with secondary metadata, such as reverse mappings and inode
parent pointers.
This feature is considered EXPERIMENTAL. Use with caution!
See the xfs_scrub man page in section 8 for additional information.
If unsure, say N.
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG


@@ -92,9 +92,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
return XFS_TEST_ERROR(avail < orig / 10 ||
avail < mp->m_agbtree_maxlevels,
mp, XFS_ERRTAG_AG_RESV_CRITICAL);
return avail < orig / 10 || avail < mp->m_agbtree_maxlevels ||
XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL);
}
/*
@@ -203,7 +202,7 @@ __xfs_ag_resv_init(
return -EINVAL;
}
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL))
error = -ENOSPC;
else
error = xfs_dec_fdblocks(mp, hidden_space, true);


@@ -3321,7 +3321,7 @@ xfs_agf_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agf_verify(bp);
if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
@@ -4019,8 +4019,7 @@ __xfs_free_extent(
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
if (XFS_TEST_ERROR(false, mp,
XFS_ERRTAG_FREE_EXTENT))
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);


@@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit(
/*
* For attr2 we can try to move the forkoff if there is space in the
* literal area, but for the old format we are done if there is no
* space in the fixed attribute fork.
* literal area
*/
if (!xfs_has_attr2(mp))
return 0;
dsize = dp->i_df.if_bytes;
switch (dp->i_df.if_format) {
@@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit(
}
/*
* Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
* - noattr2 mount option is set,
* - on-disk version bit says it is already set, or
* - the attr2 mount option is not set to enable automatic upgrade from attr1.
* Switch on the ATTR2 superblock bit (implies also FEATURES2) unless
* on-disk version bit says it is already set
*/
STATIC void
xfs_sbversion_add_attr2(
struct xfs_mount *mp,
struct xfs_trans *tp)
{
if (xfs_has_noattr2(mp))
return;
if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
return;
if (!xfs_has_attr2(mp))
return;
spin_lock(&mp->m_sb_lock);
xfs_add_attr2(mp);
@@ -889,7 +879,7 @@ xfs_attr_sf_removename(
/*
* Fix up the start offset of the attribute fork
*/
if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
if (totsize == sizeof(struct xfs_attr_sf_hdr) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
!xfs_has_parent(mp)) {
@@ -900,7 +890,6 @@ xfs_attr_sf_removename(
ASSERT(dp->i_forkoff);
ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
xfs_has_parent(mp));
xfs_trans_log_inode(args->trans, dp,
@@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit(
bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
be16_to_cpu(name_loc->valuelen));
}
if (xfs_has_attr2(dp->i_mount) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
return -1;
return xfs_attr_shortform_bytesfit(dp, bytes);
@@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform(
* this case.
*/
if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
ASSERT(xfs_has_attr2(dp->i_mount));
ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
xfs_attr_fork_remove(dp, args->trans);
}
@@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
error = -EIO;
goto out;
}


@@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local(
static int
xfs_bmap_set_attrforkoff(
struct xfs_inode *ip,
int size,
int *version)
int size)
{
int default_size = xfs_default_attroffset(ip) >> 3;
@@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff(
ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
if (!ip->i_forkoff)
ip->i_forkoff = default_size;
else if (xfs_has_attr2(ip->i_mount) && version)
*version = 2;
break;
default:
ASSERT(0);
@@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork(
int rsvd) /* xact may use reserved blks */
{
struct xfs_mount *mp = tp->t_mountp;
int version = 1; /* superblock attr version */
int logflags; /* logging flags */
int error; /* error return value */
@@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork(
ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
error = xfs_bmap_set_attrforkoff(ip, size);
if (error)
return error;
@@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork(
xfs_trans_log_inode(tp, ip, logflags);
if (error)
return error;
if (!xfs_has_attr(mp) ||
(!xfs_has_attr2(mp) && version == 2)) {
if (!xfs_has_attr(mp)) {
bool log_sb = false;
spin_lock(&mp->m_sb_lock);
if (!xfs_has_attr(mp)) {
xfs_add_attr(mp);
log_sb = true;
}
if (!xfs_has_attr2(mp) && version == 2) {
xfs_add_attr2(mp);
log_sb = true;
}
@@ -3662,8 +3654,7 @@ xfs_bmap_btalloc(
/* Trim the allocation back to the maximum an AG can fit. */
args.maxlen = min(ap->length, mp->m_ag_max_usable);
if (unlikely(XFS_TEST_ERROR(false, mp,
XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
error = xfs_bmap_exact_minlen_extent_alloc(ap, &args);
else if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip))
@@ -3849,7 +3840,7 @@ xfs_bmapi_read(
}
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4200,7 +4191,7 @@ xfs_bmapi_write(
(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4545,7 +4536,7 @@ xfs_bmapi_remap(
(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5679,7 +5670,7 @@ xfs_bmap_collapse_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5795,7 +5786,7 @@ xfs_bmap_insert_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5900,7 +5891,7 @@ xfs_bmap_split_extent(
int i = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -6065,7 +6056,7 @@ xfs_bmap_finish_one(
trace_xfs_bmap_deferred(bi);
if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (bi->bi_type) {


@@ -306,7 +306,7 @@ xfs_btree_check_block(
fa = __xfs_btree_check_block(cur, block, level, bp);
if (XFS_IS_CORRUPT(mp, fa != NULL) ||
XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
xfs_btree_mark_sick(cur);


@@ -565,7 +565,7 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
return -EIO;
/*


@@ -223,7 +223,7 @@ xfs_dir_ino_validate(
bool ino_ok = xfs_verify_dir_ino(mp, ino);
if (XFS_IS_CORRUPT(mp, !ino_ok) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
return -EFSCORRUPTED;


@@ -4,14 +4,22 @@
* Copyright (C) 2017 Oracle.
* All Rights Reserved.
*/
#ifndef __XFS_ERRORTAG_H_
#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG)
#define __XFS_ERRORTAG_H_
/*
* error injection tags - the labels can be anything you want
* but each tag should have its own unique number
* There are two ways to use this header file. The first way is to #include it
* bare, which will define all the XFS_ERRTAG_* error injection knobs for use
* with the XFS_TEST_ERROR macro. The second way is to enclose the #include
* with a #define for an XFS_ERRTAG macro, in which case the header will define
" an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each
* defined error injection knob.
*/
/*
* These are the actual error injection tags. The numbers should be consecutive
* because arrays are sized based on the maximum.
*/
#define XFS_ERRTAG_NOERROR 0
#define XFS_ERRTAG_IFLUSH_1 1
#define XFS_ERRTAG_IFLUSH_2 2
@@ -71,49 +79,61 @@
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
*/
#define XFS_RANDOM_DEFAULT 100
#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
#define XFS_RANDOM_FREE_EXTENT 1
#define XFS_RANDOM_RMAP_FINISH_ONE 1
#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
#define XFS_RANDOM_BMAP_FINISH_ONE 1
#define XFS_RANDOM_AG_RESV_CRITICAL 4
#define XFS_RANDOM_LOG_BAD_CRC 1
#define XFS_RANDOM_LOG_ITEM_PIN 1
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#define XFS_RANDOM_AG_RESV_FAIL 1
#define XFS_RANDOM_LARP 1
#define XFS_RANDOM_DA_LEAF_SPLIT 1
#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#define XFS_RANDOM_WB_DELAY_MS 3000
#define XFS_RANDOM_WRITE_DELAY_MS 3000
#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4
/*
 * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are:
* 1. The XFS_ERRTAG_ flag but without the prefix;
* 2. The name of the sysfs knob; and
* 3. The default value for the knob.
*/
#ifdef XFS_ERRTAG
# undef XFS_ERRTAGS
# define XFS_ERRTAGS \
XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \
XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \
XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \
XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \
XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \
XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \
XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \
XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \
XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \
XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \
XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \
XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \
XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \
XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \
XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \
XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \
XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \
XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \
XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \
XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \
XFS_ERRTAG(LARP, larp, 1) \
XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \
XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */


@@ -616,7 +616,7 @@ xfs_exchmaps_finish_one(
return error;
}
if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
return -EIO;
/* If we still have work to do, ask for a new transaction. */
@@ -882,7 +882,7 @@ xmi_ensure_delta_nextents(
&new_nextents))
return -EFBIG;
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
new_nextents > 10)
return -EFBIG;


@@ -2140,7 +2140,7 @@ xfs_difree_inobt(
* remove the chunk if the block size is large enough for multiple inode
* chunks (that might not be free).
*/
if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
xic->deleted = true;
xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
@@ -2286,7 +2286,7 @@ xfs_difree_finobt(
* enough for multiple chunks. Leave the finobt record to remain in sync
* with the inobt.
*/
if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
error = xfs_btree_delete(cur, &i);
if (error)
@@ -2706,7 +2706,7 @@ xfs_agi_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agi_verify(bp);
if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}


@@ -61,8 +61,8 @@ xfs_inode_buf_verify(
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (unlikely(!di_ok ||
XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
xfs_buf_ioerror(bp, -EIO);


@@ -756,8 +756,7 @@ xfs_iext_count_extend(
if (nr_exts < ifp->if_nextents)
return -EFBIG;
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
nr_exts > 10)
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10)
return -EFBIG;
if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) {


@@ -299,17 +299,6 @@ xfs_inode_init(
} else {
inode_init_owner(args->idmap, inode, dir, args->mode);
}
/*
* If the group ID of the new file does not match the effective
* group ID or one of the supplementary group IDs, the S_ISGID
* bit is cleared (and only if the irix_sgid_inherit
* compatibility variable is set).
*/
if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
!vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
inode->i_mode &= ~S_ISGID;
ip->i_projid = xfs_get_initial_prid(pip);
}


@@ -86,43 +86,6 @@ struct xfs_unmount_log_format {
uint32_t pad2; /* may as well make it 64 bits */
};
/* Region types for iovec's i_type */
#define XLOG_REG_TYPE_BFORMAT 1
#define XLOG_REG_TYPE_BCHUNK 2
#define XLOG_REG_TYPE_EFI_FORMAT 3
#define XLOG_REG_TYPE_EFD_FORMAT 4
#define XLOG_REG_TYPE_IFORMAT 5
#define XLOG_REG_TYPE_ICORE 6
#define XLOG_REG_TYPE_IEXT 7
#define XLOG_REG_TYPE_IBROOT 8
#define XLOG_REG_TYPE_ILOCAL 9
#define XLOG_REG_TYPE_IATTR_EXT 10
#define XLOG_REG_TYPE_IATTR_BROOT 11
#define XLOG_REG_TYPE_IATTR_LOCAL 12
#define XLOG_REG_TYPE_QFORMAT 13
#define XLOG_REG_TYPE_DQUOT 14
#define XLOG_REG_TYPE_QUOTAOFF 15
#define XLOG_REG_TYPE_LRHEADER 16
#define XLOG_REG_TYPE_UNMOUNT 17
#define XLOG_REG_TYPE_COMMIT 18
#define XLOG_REG_TYPE_TRANSHDR 19
#define XLOG_REG_TYPE_ICREATE 20
#define XLOG_REG_TYPE_RUI_FORMAT 21
#define XLOG_REG_TYPE_RUD_FORMAT 22
#define XLOG_REG_TYPE_CUI_FORMAT 23
#define XLOG_REG_TYPE_CUD_FORMAT 24
#define XLOG_REG_TYPE_BUI_FORMAT 25
#define XLOG_REG_TYPE_BUD_FORMAT 26
#define XLOG_REG_TYPE_ATTRI_FORMAT 27
#define XLOG_REG_TYPE_ATTRD_FORMAT 28
#define XLOG_REG_TYPE_ATTR_NAME 29
#define XLOG_REG_TYPE_ATTR_VALUE 30
#define XLOG_REG_TYPE_XMI_FORMAT 31
#define XLOG_REG_TYPE_XMD_FORMAT 32
#define XLOG_REG_TYPE_ATTR_NEWNAME 33
#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
#define XLOG_REG_TYPE_MAX 34
/*
* Flags to log operation header
*
@@ -141,14 +104,13 @@ struct xfs_unmount_log_format {
#define XLOG_END_TRANS 0x10 /* End a continued transaction */
#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
typedef struct xlog_op_header {
struct xlog_op_header {
__be32 oh_tid; /* transaction id of operation : 4 b */
__be32 oh_len; /* bytes in data region : 4 b */
__u8 oh_clientid; /* who sent me this : 1 b */
__u8 oh_flags; /* : 1 b */
__u16 oh_res2; /* 32 bit align : 2 b */
} xlog_op_header_t;
};
/* valid values for h_fmt */
#define XLOG_FMT_UNKNOWN 0
@@ -174,12 +136,40 @@ typedef struct xlog_rec_header {
__be32 h_prev_block; /* block number to previous LR : 4 */
__be32 h_num_logops; /* number of log operations in this LR : 4 */
__be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
/* new fields */
/* fields added by the Linux port: */
__be32 h_fmt; /* format of log record : 4 */
uuid_t h_fs_uuid; /* uuid of FS : 16 */
/* fields added for log v2: */
__be32 h_size; /* iclog size : 4 */
/*
* When h_size was added for log v2 support, it caused the structure to
* have a different size on i386 vs all other architectures, because the
* sum of the sizes of the members is not a multiple of the alignment of
* the largest __be64-sized member, and i386 has really odd struct
* alignment rules.
*
* Due to the way the log headers are placed out on-disk that alone is
* not a problem because the xlog_rec_header always sits alone in a
* BBSIZE-sized area, and the rest of that area is padded with zeroes.
* But xlog_cksum used to calculate the checksum based on the structure
* size, and thus gives different checksums for i386 vs the rest.
* We now do two checksum validation passes for both sizes to allow
* moving v5 file systems with unclean logs between i386 and other
* (little-endian) architectures.
*/
__u32 h_pad0;
} xlog_rec_header_t;
#ifdef __i386__
#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size)
#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header)
#else
#define XLOG_REC_SIZE sizeof(struct xlog_rec_header)
#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size)
#endif /* __i386__ */
typedef struct xlog_rec_ext_header {
__be32 xh_cycle; /* write cycle of log : 4 */
__be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
@@ -195,12 +185,11 @@ typedef union xlog_in_core2 {
} xlog_in_core_2_t;
/* not an on-disk structure, but needed by log recovery in userspace */
typedef struct xfs_log_iovec {
struct xfs_log_iovec {
void *i_addr; /* beginning address of region */
int i_len; /* length in bytes of region */
uint i_type; /* type of region */
} xfs_log_iovec_t;
};
/*
* Transaction Header definitions.
@@ -213,12 +202,12 @@ typedef struct xfs_log_iovec {
* Do not change the below structure without redoing the code in
* xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
*/
typedef struct xfs_trans_header {
struct xfs_trans_header {
uint th_magic; /* magic number */
uint th_type; /* transaction type */
int32_t th_tid; /* transaction id (unused) */
uint th_num_items; /* num items logged by trans */
} xfs_trans_header_t;
};
#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
@@ -542,7 +531,7 @@ struct xfs_log_dinode {
#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
typedef struct xfs_buf_log_format {
struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
unsigned short blf_size; /* size of this item */
unsigned short blf_flags; /* misc state */
@@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format {
int64_t blf_blkno; /* starting blkno of this buf */
unsigned int blf_map_size; /* used size of data bitmap in words */
unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
} xfs_buf_log_format_t;
};
/*
* All buffers now need to tell recovery where the magic number
@@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
/*
* EFI/EFD log format definitions
*/
typedef struct xfs_extent {
struct xfs_extent {
xfs_fsblock_t ext_start;
xfs_extlen_t ext_len;
} xfs_extent_t;
};
/*
* Since an xfs_extent_t has types (start:64, len: 32)
* there are different alignments on 32 bit and 64 bit kernels.
* So we provide the different variants for use by a
* conversion routine.
* Since the members of struct xfs_extent add up to 96 bits, it has
* different alignments on i386 vs all other architectures, because i386
* does not pad structures to their natural alignment.
*
* Provide the different variants for use by a conversion routine.
*/
typedef struct xfs_extent_32 {
struct xfs_extent_32 {
uint64_t ext_start;
uint32_t ext_len;
} __attribute__((packed)) xfs_extent_32_t;
} __attribute__((packed));
typedef struct xfs_extent_64 {
struct xfs_extent_64 {
uint64_t ext_start;
uint32_t ext_len;
uint32_t ext_pad;
} xfs_extent_64_t;
};
/*
* This is the structure used to lay out an efi log item in the
* log. The efi_extents field is a variable size array whose
* size is given by efi_nextents.
*/
typedef struct xfs_efi_log_format {
struct xfs_efi_log_format {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
xfs_extent_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_t;
struct xfs_extent efi_extents[]; /* array of extents to free */
};
static inline size_t
xfs_efi_log_format_sizeof(
@@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
typedef struct xfs_efi_log_format_32 {
struct xfs_efi_log_format_32 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
xfs_extent_32_t efi_extents[]; /* array of extents to free */
} __attribute__((packed)) xfs_efi_log_format_32_t;
struct xfs_extent_32 efi_extents[]; /* array of extents to free */
} __attribute__((packed));
static inline size_t
xfs_efi_log_format32_sizeof(
@@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
typedef struct xfs_efi_log_format_64 {
struct xfs_efi_log_format_64 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
xfs_extent_64_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_64_t;
struct xfs_extent_64 efi_extents[]; /* array of extents to free */
};
static inline size_t
xfs_efi_log_format64_sizeof(
@@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof(
* log. The efd_extents array is a variable size array whose
* size is given by efd_nextents;
*/
typedef struct xfs_efd_log_format {
struct xfs_efd_log_format {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
xfs_extent_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_t;
struct xfs_extent efd_extents[]; /* array of extents freed */
};
static inline size_t
xfs_efd_log_format_sizeof(
@@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
typedef struct xfs_efd_log_format_32 {
struct xfs_efd_log_format_32 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
xfs_extent_32_t efd_extents[]; /* array of extents freed */
} __attribute__((packed)) xfs_efd_log_format_32_t;
struct xfs_extent_32 efd_extents[]; /* array of extents freed */
} __attribute__((packed));
static inline size_t
xfs_efd_log_format32_sizeof(
@@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
typedef struct xfs_efd_log_format_64 {
struct xfs_efd_log_format_64 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
xfs_extent_64_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_64_t;
struct xfs_extent_64 efd_extents[]; /* array of extents freed */
};
static inline size_t
xfs_efd_log_format64_sizeof(
@@ -957,14 +947,14 @@ struct xfs_xmd_log_format {
* The first two fields must be the type and size fitting into
* 32 bits : log_recovery code assumes that.
*/
typedef struct xfs_dq_logformat {
struct xfs_dq_logformat {
uint16_t qlf_type; /* dquot log item type */
uint16_t qlf_size; /* size of this item */
xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
int64_t qlf_blkno; /* blkno of dquot buffer */
int32_t qlf_len; /* len of dquot buffer */
uint32_t qlf_boffset; /* off of dquot in buffer */
} xfs_dq_logformat_t;
};
/*
* log format struct for QUOTAOFF records.
@@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat {
* to the first and ensures that the first logitem is taken out of the AIL
* only when the last one is securely committed.
*/
typedef struct xfs_qoff_logformat {
struct xfs_qoff_logformat {
unsigned short qf_type; /* quotaoff log item type */
unsigned short qf_size; /* size of this item */
unsigned int qf_flags; /* USR and/or GRP */
char qf_pad[12]; /* padding for future */
} xfs_qoff_logformat_t;
};
/*
* Disk quotas status in m_qflags, and also sb_qflags. 16 bits.


@@ -111,7 +111,7 @@ struct xlog_recover_item {
struct xlog_recover {
struct hlist_node r_list;
xlog_tid_t r_log_tid; /* log's transaction id */
xfs_trans_header_t r_theader; /* trans header for partial */
struct xfs_trans_header r_theader; /* trans header for partial */
int r_state; /* not needed */
xfs_lsn_t r_lsn; /* xact lsn */
struct list_head r_itemq; /* q for items */


@@ -121,7 +121,7 @@ xfs_metafile_resv_critical(
div_u64(mp->m_metafile_resv_target, 10)))
return true;
return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */


@@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328);
XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260);
XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);


@@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space(
* refcount continue update "error" has been injected.
*/
if (cur->bc_refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
if (cur->bc_refc.nr_ops == 0)
@@ -1398,7 +1397,7 @@ xfs_refcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
@@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*


@@ -2690,7 +2690,7 @@ xfs_rmap_finish_one(
trace_xfs_rmap_deferred(mp, ri);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE))
return -EIO;
/*


@@ -1067,7 +1067,7 @@ xfs_rtfree_extent(
ASSERT(rbmip->i_itemp != NULL);
xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_rtcheck_alloc_range(&args, start, len);


@@ -142,8 +142,6 @@ xfs_sb_version_to_features(
if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
features |= XFS_FEAT_LAZYSBCOUNT;
if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
features |= XFS_FEAT_ATTR2;
if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
features |= XFS_FEAT_PROJID32;
if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
@@ -155,7 +153,7 @@ xfs_sb_version_to_features(
/* Always on V5 features */
features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 |
XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
/* Optional V5 features */
@@ -1524,7 +1522,8 @@ xfs_fs_geometry(
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
XFS_FSOP_GEOM_FLAGS_DIRV2 |
XFS_FSOP_GEOM_FLAGS_EXTFLG;
XFS_FSOP_GEOM_FLAGS_EXTFLG |
XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_attr(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
if (xfs_has_quota(mp))
@@ -1537,8 +1536,6 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
if (xfs_has_lazysbcount(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
if (xfs_has_attr2(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_projid32(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
if (xfs_has_crc(mp))


@@ -29,6 +29,13 @@ struct xfs_rtgroup;
#define XFS_OPEN_GC_ZONES 1U
#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
/*
* For zoned devices that do not have a limit on the number of open zones, and
* for regular devices using the zoned allocator, use the most common SMR disks
* limit (128) as the default limit on the number of open zones.
*/
#define XFS_DEFAULT_MAX_OPEN_ZONES 128
bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer);


@@ -300,7 +300,7 @@ xrep_cow_find_bad(
* on the debugging knob, replace everything in the CoW fork.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
@@ -385,7 +385,7 @@ xrep_cow_find_bad_rt(
* CoW fork and then scan for staging extents in the refcountbt.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)


@@ -79,7 +79,7 @@ xchk_metapath_cleanup(
if (mpath->dp_ilock_flags)
xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);
kfree(mpath->path);
kfree_const(mpath->path);
}
/* Set up a metadir path scan. @path must be dynamically allocated. */
@@ -98,13 +98,13 @@ xchk_setup_metapath_scan(
error = xchk_install_live_inode(sc, ip);
if (error) {
kfree(path);
kfree_const(path);
return error;
}
mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
if (!mpath) {
kfree(path);
kfree_const(path);
return -ENOMEM;
}
@@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip);
kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip);
}
/* Scan a rtgroup inode under the /rtgroups directory. */
@@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
kstrdup("quota", GFP_KERNEL), qi->qi_dirip);
kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip);
}
/* Scan a quota inode under the /quota directory. */
@@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode(
return -ENOENT;
return xchk_setup_metapath_scan(sc, qi->qi_dirip,
kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip);
kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip);
}
#else
# define xchk_setup_metapath_quotadir(...) (-ENOENT)


@@ -27,6 +27,15 @@
#include "scrub/repair.h"
#include "scrub/newbt.h"
/*
* This is the maximum number of deferred extent freeing items (EFIs)
* that we'll attach to a transaction without rolling the transaction to avoid
* overrunning a tr_itruncate reservation. The newbt code should reserve
* exactly the correct number of blocks to rebuild the btree, so there should
* not be any excess blocks to free when committing a new btree.
*/
#define XREP_MAX_ITRUNCATE_EFIS (128)
/*
* Estimate proper slack values for a btree that's being reloaded.
*


@@ -36,6 +36,12 @@
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_extfree_item.h"
#include "xfs_rmap_item.h"
#include "xfs_refcount_item.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_item.h"
#include "xfs_bmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -91,21 +97,33 @@
struct xreap_state {
struct xfs_scrub *sc;
/* Reverse mapping owner and metadata reservation type. */
const struct xfs_owner_info *oinfo;
enum xfs_ag_resv_type resv;
/* If true, roll the transaction before reaping the next extent. */
bool force_roll;
/* Number of deferred reaps attached to the current transaction. */
unsigned int deferred;
union {
struct {
/*
* For AG blocks, this is reverse mapping owner and
* metadata reservation type.
*/
const struct xfs_owner_info *oinfo;
enum xfs_ag_resv_type resv;
};
struct {
/* For file blocks, this is the inode and fork. */
struct xfs_inode *ip;
int whichfork;
};
};
/* Number of invalidated buffers logged to the current transaction. */
unsigned int invalidated;
unsigned int nr_binval;
/* Number of deferred reaps queued during the whole reap sequence. */
unsigned long long total_deferred;
/* Maximum number of buffers we can invalidate in a single tx. */
unsigned int max_binval;
/* Number of deferred reaps attached to the current transaction. */
unsigned int nr_deferred;
/* Maximum number of intents we can reap in a single transaction. */
unsigned int max_deferred;
};
/* Put a block back on the AGFL. */
@@ -148,71 +166,79 @@ xreap_put_freelist(
}
/* Are there any uncommitted reap operations? */
static inline bool xreap_dirty(const struct xreap_state *rs)
static inline bool xreap_is_dirty(const struct xreap_state *rs)
{
if (rs->force_roll)
return true;
if (rs->deferred)
return true;
if (rs->invalidated)
return true;
if (rs->total_deferred)
return true;
return false;
return rs->nr_binval > 0 || rs->nr_deferred > 0;
}
#define XREAP_MAX_BINVAL (2048)
/*
* Decide if we want to roll the transaction after reaping an extent. We don't
* want to overrun the transaction reservation, so we prohibit more than
* 128 EFIs per transaction. For the same reason, we limit the number
* of buffer invalidations to 2048.
* Decide if we need to roll the transaction to clear out the log
* reservation that we allocated to buffer invalidations.
*/
static inline bool xreap_want_roll(const struct xreap_state *rs)
static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
{
if (rs->force_roll)
return true;
if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
return true;
if (rs->invalidated > XREAP_MAX_BINVAL)
return true;
return false;
return rs->nr_binval >= rs->max_binval;
}
static inline void xreap_reset(struct xreap_state *rs)
/* Reset the buffer invalidation count after rolling. */
static inline void xreap_binval_reset(struct xreap_state *rs)
{
rs->total_deferred += rs->deferred;
rs->deferred = 0;
rs->invalidated = 0;
rs->force_roll = false;
rs->nr_binval = 0;
}
#define XREAP_MAX_DEFER_CHAIN (2048)
/*
* Bump the number of invalidated buffers, and return true if we can continue,
* or false if we need to roll the transaction.
*/
static inline bool xreap_inc_binval(struct xreap_state *rs)
{
rs->nr_binval++;
return rs->nr_binval < rs->max_binval;
}
/*
* Decide if we want to finish the deferred ops that are attached to the scrub
* transaction. We don't want to queue huge chains of deferred ops because
* that can consume a lot of log space and kernel memory. Hence we trigger a
* xfs_defer_finish if there are more than 2048 deferred reap operations or the
* caller did some real work.
* xfs_defer_finish if there are too many deferred reap operations or we've run
* out of space for invalidations.
*/
static inline bool
xreap_want_defer_finish(const struct xreap_state *rs)
static inline bool xreap_want_defer_finish(const struct xreap_state *rs)
{
if (rs->force_roll)
return true;
if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
return true;
return false;
return rs->nr_deferred >= rs->max_deferred;
}
/*
* Reset the defer chain length and buffer invalidation count after finishing
* items.
*/
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
rs->total_deferred = 0;
rs->deferred = 0;
rs->invalidated = 0;
rs->force_roll = false;
rs->nr_deferred = 0;
rs->nr_binval = 0;
}
/*
* Bump the number of deferred extent reaps.
*/
static inline void xreap_inc_defer(struct xreap_state *rs)
{
rs->nr_deferred++;
}
/* Force the caller to finish a deferred item chain. */
static inline void xreap_force_defer_finish(struct xreap_state *rs)
{
rs->nr_deferred = rs->max_deferred;
}
/* Maximum number of fsblocks that we might find in a buffer to invalidate. */
static inline unsigned int
xrep_binval_max_fsblocks(
struct xfs_mount *mp)
{
/* Remote xattr values are the largest buffers that we support. */
return xfs_attr3_max_rmt_blocks(mp);
}
/*
@@ -224,12 +250,8 @@ xrep_bufscan_max_sectors(
struct xfs_mount *mp,
xfs_extlen_t fsblocks)
{
int max_fsbs;
/* Remote xattr values are the largest buffers that we support. */
max_fsbs = xfs_attr3_max_rmt_blocks(mp);
return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks,
xrep_binval_max_fsblocks(mp)));
}
/*
@@ -297,14 +319,13 @@ xreap_agextent_binval(
while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
rs->invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
* far we've gotten.
*/
if (rs->invalidated > XREAP_MAX_BINVAL) {
if (!xreap_inc_binval(rs)) {
*aglenp -= agbno_next - bno;
goto out;
}
@@ -416,21 +437,23 @@ xreap_agextent_iter(
trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
*aglenp);
rs->force_roll = true;
if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
/*
* If we're unmapping CoW staging extents, remove the
* t0: Unmapping CoW staging extents, remove the
* records from the refcountbt, which will remove the
* rmap record as well.
*/
xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
*aglenp);
xreap_inc_defer(rs);
return 0;
}
return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
*aglenp, rs->oinfo);
/* t1: unmap crosslinked metadata blocks */
xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp,
rs->oinfo->oi_owner);
xreap_inc_defer(rs);
return 0;
}
trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
@@ -443,12 +466,12 @@ xreap_agextent_iter(
*/
xreap_agextent_binval(rs, agbno, aglenp);
if (*aglenp == 0) {
ASSERT(xreap_want_roll(rs));
ASSERT(xreap_want_binval_roll(rs));
return 0;
}
/*
* If we're getting rid of CoW staging extents, use deferred work items
* t2: To get rid of CoW staging extents, use deferred work items
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -463,23 +486,23 @@ xreap_agextent_iter(
if (error)
return error;
rs->force_roll = true;
xreap_inc_defer(rs);
return 0;
}
/* Put blocks back on the AGFL one at a time. */
/* t3: Put blocks back on the AGFL one at a time. */
if (rs->resv == XFS_AG_RESV_AGFL) {
ASSERT(*aglenp == 1);
error = xreap_put_freelist(sc, agbno);
if (error)
return error;
rs->force_roll = true;
xreap_force_defer_finish(rs);
return 0;
}
/*
* Use deferred frees to get rid of the old btree blocks to try to
* t4: Use deferred frees to get rid of the old btree blocks to try to
* minimize the window in which we could crash and lose the old blocks.
* Add a defer ops barrier every other extent to avoid stressing the
* system with large EFIs.
@@ -489,12 +512,194 @@ xreap_agextent_iter(
if (error)
return error;
rs->deferred++;
if (rs->deferred % 2 == 0)
xreap_inc_defer(rs);
if (rs->nr_deferred % 2 == 0)
xfs_defer_add_barrier(sc->tp);
return 0;
}
/* Configure the deferral and invalidation limits */
static inline void
xreap_configure_limits(
struct xreap_state *rs,
unsigned int fixed_overhead,
unsigned int variable_overhead,
unsigned int per_intent,
unsigned int per_binval)
{
struct xfs_scrub *sc = rs->sc;
unsigned int res = sc->tp->t_log_res - fixed_overhead;
/* Don't underflow the reservation */
if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) {
ASSERT(sc->tp->t_log_res >=
(fixed_overhead + variable_overhead));
xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE);
return;
}
rs->max_deferred = per_intent ? res / variable_overhead : 0;
res -= rs->max_deferred * per_intent;
rs->max_binval = per_binval ? res / per_binval : 0;
}
/*
* Compute the maximum number of intent items that reaping can attach to the
* scrub transaction given the worst case log overhead of the intent items
* needed to reap a single per-AG space extent. This is not for freeing CoW
* staging extents.
*/
STATIC void
xreap_configure_agextent_limits(
struct xreap_state *rs)
{
struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
/*
* In the worst case, relogging an intent item causes both an intent
* item and a done item to be attached to a transaction for each extent
* that we'd like to process.
*/
const unsigned int efi = xfs_efi_log_space(1) +
xfs_efd_log_space(1);
const unsigned int rui = xfs_rui_log_space(1) +
xfs_rud_log_space();
/*
* Various things can happen when reaping non-CoW metadata blocks:
*
* t1: Unmapping crosslinked metadata blocks: deferred removal of rmap
* record.
*
* t3: Freeing to AGFL: roll and finish deferred items for every block.
* Limits here do not matter.
*
* t4: Freeing metadata blocks: deferred freeing of the space, which
* also removes the rmap record.
*
* For simplicity, we'll use the worst-case intents size to determine
* the maximum number of deferred extents before we have to finish the
* whole chain. If we're trying to reap a btree larger than this size,
* a crash midway through reaping can result in leaked blocks.
*/
const unsigned int t1 = rui;
const unsigned int t4 = rui + efi;
const unsigned int per_intent = max(t1, t4);
/*
* For each transaction in a reap chain, we must be able to take one
* step in the defer item chain, which should only consist of EFI or
* RUI items.
*/
const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
const unsigned int step_size = max(f1, f2);
/* Largest buffer size (in fsblocks) that can be invalidated. */
const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
/* Maximum overhead of invalidating one buffer. */
const unsigned int per_binval =
xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
/*
* For each transaction in a reap chain, we can delete some number of
* extents and invalidate some number of blocks. We assume that btree
* blocks aren't usually contiguous; and that scrub likely pulled all
* the buffers into memory. From these assumptions, set the maximum
* number of deferrals we can queue before flushing the defer chain,
* and the number of invalidations we can queue before rolling to a
* clean transaction (and possibly relogging some of the deferrals) to
* the same quantity.
*/
const unsigned int variable_overhead = per_intent + per_binval;
xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
per_binval);
trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
step_size, per_intent, rs->max_deferred);
}
/*
* Compute the maximum number of intent items that reaping can attach to the
* scrub transaction given the worst case log overhead of the intent items
* needed to reap a single CoW staging extent. This is not for freeing
* metadata blocks.
*/
STATIC void
xreap_configure_agcow_limits(
struct xreap_state *rs)
{
struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
/*
* In the worst case, relogging an intent item causes both an intent
* item and a done item to be attached to a transaction for each extent
* that we'd like to process.
*/
const unsigned int efi = xfs_efi_log_space(1) +
xfs_efd_log_space(1);
const unsigned int rui = xfs_rui_log_space(1) +
xfs_rud_log_space();
const unsigned int cui = xfs_cui_log_space(1) +
xfs_cud_log_space();
/*
* Various things can happen when reaping CoW staging blocks:
*
* t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
* record, which defers removal of rmap record
*
* t2: Freeing CoW blocks: deferred removal of refcount record, which
* defers removal of rmap record; and deferred removal of the space
*
* For simplicity, we'll use the worst-case intents size to determine
* the maximum number of deferred extents before we have to finish the
* whole chain. If we're trying to reap a btree larger than this size,
* a crash midway through reaping can result in leaked blocks.
*/
const unsigned int t0 = cui + rui;
const unsigned int t2 = cui + rui + efi;
const unsigned int per_intent = max(t0, t2);
/*
* For each transaction in a reap chain, we must be able to take one
* step in the defer item chain, which should only consist of CUI, EFI,
* or RUI items.
*/
const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
const unsigned int step_size = max3(f1, f2, f3);
/* Largest buffer size (in fsblocks) that can be invalidated. */
const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
/* Overhead of invalidating one buffer */
const unsigned int per_binval =
xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
/*
* For each transaction in a reap chain, we can delete some number of
* extents and invalidate some number of blocks. We assume that CoW
* staging extents are usually more than 1 fsblock, and that there
* shouldn't be any buffers for those blocks. From these assumptions,
* set the number of deferrals to use as much of the reservation as
* possible, but leave space to invalidate 1/8th that number of buffers.
*/
const unsigned int variable_overhead = per_intent +
(per_binval / 8);
xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
per_binval);
trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size,
per_intent, rs->max_deferred);
}
/*
* Break an AG metadata extent into sub-extents by fate (crosslinked, not
* crosslinked), and dispose of each sub-extent separately.
@@ -531,11 +736,11 @@ xreap_agmeta_extent(
if (error)
return error;
xreap_defer_finish_reset(rs);
} else if (xreap_want_roll(rs)) {
} else if (xreap_want_binval_roll(rs)) {
error = xrep_roll_ag_trans(sc);
if (error)
return error;
xreap_reset(rs);
xreap_binval_reset(rs);
}
agbno += aglen;
@@ -562,11 +767,12 @@ xrep_reap_agblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip == NULL);
xreap_configure_agextent_limits(&rs);
error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
if (error)
return error;
if (xreap_dirty(&rs))
if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -628,7 +834,7 @@ xreap_fsmeta_extent(
if (error)
goto out_agf;
xreap_defer_finish_reset(rs);
} else if (xreap_want_roll(rs)) {
} else if (xreap_want_binval_roll(rs)) {
/*
* Hold the AGF buffer across the transaction roll so
* that we don't have to reattach it to the scrub
@@ -639,7 +845,7 @@ xreap_fsmeta_extent(
xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
if (error)
goto out_agf;
xreap_reset(rs);
xreap_binval_reset(rs);
}
agbno += aglen;
@@ -674,11 +880,15 @@ xrep_reap_fsblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
if (oinfo == &XFS_RMAP_OINFO_COW)
xreap_configure_agcow_limits(&rs);
else
xreap_configure_agextent_limits(&rs);
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
if (xreap_dirty(&rs))
if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -770,7 +980,7 @@ xreap_rgextent_iter(
rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
/*
* If there are other rmappings, this block is cross linked and must
* t1: There are other rmappings; this block is cross linked and must
* not be freed. Remove the forward and reverse mapping and move on.
*/
if (crosslinked) {
@@ -778,14 +988,14 @@ xreap_rgextent_iter(
*rglenp);
xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
rs->deferred++;
xreap_inc_defer(rs);
return 0;
}
trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
/*
* The CoW staging extent is not crosslinked. Use deferred work items
* t2: The CoW staging extent is not crosslinked. Use deferred work
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -799,10 +1009,73 @@ xreap_rgextent_iter(
if (error)
return error;
rs->deferred++;
xreap_inc_defer(rs);
return 0;
}
/*
* Compute the maximum number of intent items that reaping can attach to the
* scrub transaction given the worst case log overhead of the intent items
* needed to reap a single CoW staging extent. This is not for freeing
* metadata blocks.
*/
STATIC void
xreap_configure_rgcow_limits(
struct xreap_state *rs)
{
struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
/*
* In the worst case, relogging an intent item causes both an intent
* item and a done item to be attached to a transaction for each extent
* that we'd like to process.
*/
const unsigned int efi = xfs_efi_log_space(1) +
xfs_efd_log_space(1);
const unsigned int rui = xfs_rui_log_space(1) +
xfs_rud_log_space();
const unsigned int cui = xfs_cui_log_space(1) +
xfs_cud_log_space();
/*
* Various things can happen when reaping CoW staging blocks:
*
* t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
* record, which defers removal of rmap record
*
* t2: Freeing CoW blocks: deferred removal of refcount record, which
* defers removal of rmap record; and deferred removal of the space
*
* For simplicity, we'll use the worst-case intents size to determine
* the maximum number of deferred extents before we have to finish the
* whole chain. If we're trying to reap a btree larger than this size,
* a crash midway through reaping can result in leaked blocks.
*/
const unsigned int t1 = cui + rui;
const unsigned int t2 = cui + rui + efi;
const unsigned int per_intent = max(t1, t2);
/*
* For each transaction in a reap chain, we must be able to take one
* step in the defer item chain, which should only consist of CUI, EFI,
* or RUI items.
*/
const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
const unsigned int step_size = max3(f1, f2, f3);
/*
* The only buffer for the rt device is the rtgroup super, so we don't
* need to save space for buffer invalidations.
*/
xreap_configure_limits(rs, step_size, per_intent, per_intent, 0);
trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent,
rs->max_deferred);
}
#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
XFS_RTGLOCK_RMAP | \
XFS_RTGLOCK_REFCOUNT)
@@ -855,11 +1128,11 @@ xreap_rtmeta_extent(
if (error)
goto out_unlock;
xreap_defer_finish_reset(rs);
} else if (xreap_want_roll(rs)) {
} else if (xreap_want_binval_roll(rs)) {
error = xfs_trans_roll_inode(&sc->tp, sc->ip);
if (error)
goto out_unlock;
xreap_reset(rs);
xreap_binval_reset(rs);
}
rgbno += rglen;
@@ -891,12 +1164,14 @@ xrep_reap_rtblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
ASSERT(oinfo == &XFS_RMAP_OINFO_COW);
xreap_configure_rgcow_limits(&rs);
error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
if (error)
return error;
if (xreap_dirty(&rs))
if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks(
ASSERT(sc->ip != NULL);
ASSERT(xfs_is_metadir_inode(sc->ip));
xreap_configure_agextent_limits(&rs);
xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
if (xreap_dirty(&rs)) {
if (xreap_is_dirty(&rs)) {
error = xrep_defer_finish(sc);
if (error)
return error;
@@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks(
*/
STATIC int
xreap_bmapi_select(
struct xfs_scrub *sc,
struct xfs_inode *ip,
int whichfork,
struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool *crosslinked)
{
struct xfs_owner_info oinfo;
struct xfs_scrub *sc = rs->sc;
struct xfs_btree_cur *cur;
xfs_filblks_t len = 1;
xfs_agblock_t bno;
@@ -975,7 +1249,8 @@ xreap_bmapi_select(
cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
sc->sa.pag);
xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork,
imap->br_startoff);
error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
if (error)
goto out_cur;
@@ -1038,21 +1313,19 @@ xreap_buf_loggable(
*/
STATIC int
xreap_bmapi_binval(
struct xfs_scrub *sc,
struct xfs_inode *ip,
int whichfork,
struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_perag *pag = sc->sa.pag;
int bmap_flags = xfs_bmapi_aflag(whichfork);
int bmap_flags = xfs_bmapi_aflag(rs->whichfork);
xfs_fileoff_t off;
xfs_fileoff_t max_off;
xfs_extlen_t scan_blocks;
xfs_agblock_t bno;
xfs_agblock_t agbno;
xfs_agblock_t agbno_next;
unsigned int invalidated = 0;
int error;
/*
@@ -1079,7 +1352,7 @@ xreap_bmapi_binval(
struct xfs_bmbt_irec hmap;
int nhmaps = 1;
error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap,
&nhmaps, bmap_flags);
if (error)
return error;
@@ -1120,14 +1393,13 @@ xreap_bmapi_binval(
xfs_buf_stale(bp);
xfs_buf_relse(bp);
}
invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
* much of the mapping we've seen so far.
* far we've gotten.
*/
if (invalidated > XREAP_MAX_BINVAL) {
if (!xreap_inc_binval(rs)) {
imap->br_blockcount = agbno_next - bno;
goto out;
}
@@ -1149,12 +1421,11 @@ out:
*/
STATIC int
xrep_reap_bmapi_iter(
struct xfs_scrub *sc,
struct xfs_inode *ip,
int whichfork,
struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool crosslinked)
{
struct xfs_scrub *sc = rs->sc;
int error;
if (crosslinked) {
@@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter(
imap->br_blockcount);
/*
* Schedule removal of the mapping from the fork. We use
* t0: Schedule removal of the mapping from the fork. We use
* deferred log intents in this function to control the exact
* sequence of metadata updates.
*/
xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
return 0;
}
@@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter(
* transaction is full of logged buffer invalidations, so we need to
* return early so that we can roll and retry.
*/
error = xreap_bmapi_binval(sc, ip, whichfork, imap);
error = xreap_bmapi_binval(rs, imap);
if (error || imap->br_blockcount == 0)
return error;
/*
* Schedule removal of the mapping from the fork. We use deferred log
* intents in this function to control the exact sequence of metadata
* t1: Schedule removal of the mapping from the fork. We use deferred
* work in this function to control the exact sequence of metadata
* updates.
*/
xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
return xfs_free_extent_later(sc->tp, imap->br_startblock,
imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
XFS_FREE_EXTENT_SKIP_DISCARD);
}
/* Compute the maximum mapcount of a file buffer. */
static unsigned int
xreap_bmapi_binval_mapcount(
struct xfs_scrub *sc)
{
/* directory blocks can span multiple fsblocks and be discontiguous */
if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR)
return sc->mp->m_dir_geo->fsbcount;
/* all other file xattr/symlink blocks must be contiguous */
return 1;
}
/* Compute the maximum block size of a file buffer. */
static unsigned int
xreap_bmapi_binval_blocksize(
struct xfs_scrub *sc)
{
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_DIR:
return sc->mp->m_dir_geo->blksize;
case XFS_SCRUB_TYPE_XATTR:
case XFS_SCRUB_TYPE_PARENT:
/*
* The xattr structure itself consists of single fsblocks, but
* there could be remote xattr blocks to invalidate.
*/
return XFS_XATTR_SIZE_MAX;
}
/* everything else is a single block */
return sc->mp->m_sb.sb_blocksize;
}
/*
* Compute the maximum number of buffer invalidations that we can do while
* reaping a single extent from a file fork.
*/
STATIC void
xreap_configure_bmapi_limits(
struct xreap_state *rs)
{
struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
/* overhead of invalidating a buffer */
const unsigned int per_binval =
xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
xreap_bmapi_binval_blocksize(sc));
/*
* In the worst case, relogging an intent item causes both an intent
* item and a done item to be attached to a transaction for each extent
* that we'd like to process.
*/
const unsigned int efi = xfs_efi_log_space(1) +
xfs_efd_log_space(1);
const unsigned int rui = xfs_rui_log_space(1) +
xfs_rud_log_space();
const unsigned int bui = xfs_bui_log_space(1) +
xfs_bud_log_space();
/*
* t1: Unmapping crosslinked file data blocks: one bmap deletion,
* possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
*
 * t2: Freeing file data blocks: one bmap deletion, possibly an
* EFI for underfilled bmbt blocks, and another EFI for the space
* itself.
*/
const unsigned int t1 = (bui + efi) + rui;
const unsigned int t2 = (bui + efi) + efi;
const unsigned int per_intent = max(t1, t2);
/*
* For each transaction in a reap chain, we must be able to take one
* step in the defer item chain, which should only consist of CUI, EFI,
* or RUI items.
*/
const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1);
const unsigned int step_size = max3(f1, f2, f3);
/*
* Each call to xreap_ifork_extent starts with a clean transaction and
* operates on a single mapping by creating a chain of log intent items
* for that mapping. We need to leave enough reservation in the
* transaction to log btree buffer and inode updates for each step in
* the chain, and to relog the log intents.
*/
const unsigned int per_extent_res = per_intent + step_size;
xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);
trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
step_size, per_intent, 1);
}
/*
* Dispose of as much of this file extent as we can. Upon successful return,
* the imap will reflect the mapping that was removed from the fork.
*/
STATIC int
xreap_ifork_extent(
struct xfs_scrub *sc,
struct xfs_inode *ip,
int whichfork,
struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
struct xfs_scrub *sc = rs->sc;
xfs_agnumber_t agno;
bool crosslinked;
int error;
ASSERT(sc->sa.pag == NULL);
trace_xreap_ifork_extent(sc, ip, whichfork, imap);
trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);
agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
sc->sa.pag = xfs_perag_get(sc->mp, agno);
@@ -1248,11 +1617,11 @@ xreap_ifork_extent(
* Decide the fate of the blocks at the beginning of the mapping, then
* update the mapping to use it with the unmap calls.
*/
error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
error = xreap_bmapi_select(rs, imap, &crosslinked);
if (error)
goto out_agf;
error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
error = xrep_reap_bmapi_iter(rs, imap, crosslinked);
if (error)
goto out_agf;
@@ -1276,6 +1645,11 @@ xrep_reap_ifork(
struct xfs_inode *ip,
int whichfork)
{
struct xreap_state rs = {
.sc = sc,
.ip = ip,
.whichfork = whichfork,
};
xfs_fileoff_t off = 0;
int bmap_flags = xfs_bmapi_aflag(whichfork);
int error;
@@ -1284,6 +1658,7 @@ xrep_reap_ifork(
ASSERT(ip == sc->ip || ip == sc->tempip);
ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
xreap_configure_bmapi_limits(&rs);
while (off < XFS_MAX_FILEOFF) {
struct xfs_bmbt_irec imap;
int nimaps = 1;
@@ -1303,13 +1678,14 @@ xrep_reap_ifork(
* can in a single transaction.
*/
if (xfs_bmap_is_real_extent(&imap)) {
error = xreap_ifork_extent(sc, ip, whichfork, &imap);
error = xreap_ifork_extent(&rs, &imap);
if (error)
return error;
error = xfs_defer_finish(&sc->tp);
if (error)
return error;
xreap_defer_finish_reset(&rs);
}
off = imap.br_startoff + imap.br_blockcount;

View File

@@ -1110,7 +1110,7 @@ xrep_will_attempt(
return true;
/* Let debug users force us into the repair routines. */
if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
return true;
/* Metadata is corrupt or failed cross-referencing. */

View File

@@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* This is the maximum number of deferred extent freeing item extents (EFIs)
* that we'll attach to a transaction without rolling the transaction to avoid
* overrunning a tr_itruncate reservation.
*/
#define XREP_MAX_ITRUNCATE_EFIS (128)
/* Repair helpers */
int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);

View File

@@ -185,7 +185,7 @@ xrep_symlink_salvage_inline(
return 0;
nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
strncpy(target_buf, ifp->if_data, nr);
memcpy(target_buf, ifp->if_data, nr);
return nr;
}

View File

@@ -22,6 +22,7 @@
#include "xfs_parent.h"
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_trans.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"

View File

@@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_limits_class,
TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval,
unsigned int max_binval, unsigned int step_size,
unsigned int per_intent,
unsigned int max_deferred),
TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, log_res)
__field(unsigned int, per_binval)
__field(unsigned int, max_binval)
__field(unsigned int, step_size)
__field(unsigned int, per_intent)
__field(unsigned int, max_deferred)
),
TP_fast_assign(
__entry->dev = tp->t_mountp->m_super->s_dev;
__entry->log_res = tp->t_log_res;
__entry->per_binval = per_binval;
__entry->max_binval = max_binval;
__entry->step_size = step_size;
__entry->per_intent = per_intent;
__entry->max_deferred = max_deferred;
),
TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->log_res,
__entry->per_binval,
__entry->max_binval,
__entry->step_size,
__entry->per_intent,
__entry->max_deferred)
);
#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \
DEFINE_EVENT(xrep_reap_limits_class, name, \
TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \
unsigned int max_binval, unsigned int step_size, \
unsigned int per_intent, \
unsigned int max_deferred), \
TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred))
DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits);
DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits);
DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits);
DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len, bool crosslinked),

View File

@@ -491,7 +491,7 @@ xfs_attr_finish_item(
/* Reset trans after EAGAIN cycle since the transaction is new */
args->trans = tp;
if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) {
error = -EIO;
goto out;
}

View File

@@ -387,8 +387,6 @@ xfs_buf_map_verify(
struct xfs_buftarg *btp,
struct xfs_buf_map *map)
{
xfs_daddr_t eofs;
/* Check for IOs smaller than the sector size / not sector aligned */
ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
@@ -397,11 +395,10 @@ xfs_buf_map_verify(
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
if (map->bm_bn < 0 || map->bm_bn >= eofs) {
if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
__func__, map->bm_bn, eofs);
__func__, map->bm_bn, btp->bt_nr_sectors);
WARN_ON(1);
return -EFSCORRUPTED;
}
@@ -1299,7 +1296,7 @@ xfs_buf_bio_end_io(
if (bio->bi_status)
xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
xfs_buf_ioerror(bp, -EIO);
if (bp->b_flags & XBF_ASYNC) {
@@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes(
int
xfs_configure_buftarg(
struct xfs_buftarg *btp,
unsigned int sectorsize)
unsigned int sectorsize,
xfs_rfsblock_t nr_blocks)
{
int error;
struct xfs_mount *mp = btp->bt_mount;
ASSERT(btp->bt_bdev != NULL);
if (btp->bt_bdev) {
int error;
/* Set up metadata sector size info */
btp->bt_meta_sectorsize = sectorsize;
btp->bt_meta_sectormask = sectorsize - 1;
error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
if (error) {
xfs_warn(mp,
"Cannot use blocksize %u on device %pg, err %d",
sectorsize, btp->bt_bdev, error);
return -EINVAL;
}
error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
if (error) {
xfs_warn(btp->bt_mount,
"Cannot use blocksize %u on device %pg, err %d",
sectorsize, btp->bt_bdev, error);
return -EINVAL;
if (bdev_can_atomic_write(btp->bt_bdev))
xfs_configure_buftarg_atomic_writes(btp);
}
if (bdev_can_atomic_write(btp->bt_bdev))
xfs_configure_buftarg_atomic_writes(btp);
btp->bt_meta_sectorsize = sectorsize;
btp->bt_meta_sectormask = sectorsize - 1;
/* m_blkbb_log is not set up yet */
btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT);
return 0;
}
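The bt_nr_sectors computation above shifts a filesystem-block count into 512-byte basic blocks because m_blkbb_log is not yet available at this point in mount. A minimal sketch of that conversion (names here are illustrative, not the kernel's):

```c
#include <assert.h>
#include <stdint.h>

#define DEMO_BBSHIFT 9	/* XFS basic blocks are 512 bytes */

/*
 * Convert a filesystem block count to basic blocks by shifting with the
 * difference between the fs block size log and the basic block log, as
 * the patched xfs_configure_buftarg() does before m_blkbb_log is set up.
 */
static uint64_t demo_fsb_to_bb(uint64_t nr_blocks, unsigned int blocklog)
{
	return nr_blocks << (blocklog - DEMO_BBSHIFT);
}
```

For a 4096-byte block size (blocklog 12), each fs block is 8 sectors, so 100 blocks map to 800 basic blocks.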
@@ -1749,6 +1750,9 @@ xfs_init_buftarg(
size_t logical_sectorsize,
const char *descr)
{
/* The maximum size of the buftarg is only known once the sb is read. */
btp->bt_nr_sectors = (xfs_daddr_t)-1;
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
btp->bt_logical_sectormask = logical_sectorsize - 1;
@@ -2084,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
* This allows userspace to disrupt buffer caching for debug/testing
* purposes.
*/
if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
lru_ref = 0;
atomic_set(&bp->b_lru_ref, lru_ref);

View File

@@ -103,6 +103,7 @@ struct xfs_buftarg {
size_t bt_meta_sectormask;
size_t bt_logical_sectorsize;
size_t bt_logical_sectormask;
xfs_daddr_t bt_nr_sectors;
/* LRU control structures */
struct shrinker *bt_shrinker;
@@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize);
int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize,
xfs_fsblock_t nr_blocks);
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)

View File

@@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer(
*/
xfs_sb_from_disk(&mp->m_sb, dsb);
/*
* Grow can change the device size. Mirror that into the buftarg.
*/
mp->m_ddev_targp->bt_nr_sectors =
XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) {
mp->m_rtdev_targp->bt_nr_sectors =
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
}
if (mp->m_sb.sb_agcount < orig_agcount) {
xfs_alert(mp, "Shrinking AG count in log recovery not supported");
return -EFSCORRUPTED;

View File

@@ -10,61 +10,17 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_sysfs.h"
#include "xfs_inode.h"
#ifdef DEBUG
static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_DEFAULT,
XFS_RANDOM_IFLUSH_1,
XFS_RANDOM_IFLUSH_2,
XFS_RANDOM_IFLUSH_3,
XFS_RANDOM_IFLUSH_4,
XFS_RANDOM_IFLUSH_5,
XFS_RANDOM_IFLUSH_6,
XFS_RANDOM_DA_READ_BUF,
XFS_RANDOM_BTREE_CHECK_LBLOCK,
XFS_RANDOM_BTREE_CHECK_SBLOCK,
XFS_RANDOM_ALLOC_READ_AGF,
XFS_RANDOM_IALLOC_READ_AGI,
XFS_RANDOM_ITOBP_INOTOBP,
XFS_RANDOM_IUNLINK,
XFS_RANDOM_IUNLINK_REMOVE,
XFS_RANDOM_DIR_INO_VALIDATE,
XFS_RANDOM_BULKSTAT_READ_CHUNK,
XFS_RANDOM_IODONE_IOERR,
XFS_RANDOM_STRATREAD_IOERR,
XFS_RANDOM_STRATCMPL_IOERR,
XFS_RANDOM_DIOWRITE_IOERR,
XFS_RANDOM_BMAPIFORMAT,
XFS_RANDOM_FREE_EXTENT,
XFS_RANDOM_RMAP_FINISH_ONE,
XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE,
XFS_RANDOM_REFCOUNT_FINISH_ONE,
XFS_RANDOM_BMAP_FINISH_ONE,
XFS_RANDOM_AG_RESV_CRITICAL,
0, /* XFS_RANDOM_DROP_WRITES has been removed */
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
XFS_RANDOM_IUNLINK_FALLBACK,
XFS_RANDOM_BUF_IOERROR,
XFS_RANDOM_REDUCE_MAX_IEXTENTS,
XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
XFS_RANDOM_AG_RESV_FAIL,
XFS_RANDOM_LARP,
XFS_RANDOM_DA_LEAF_SPLIT,
XFS_RANDOM_ATTR_LEAF_TO_NODE,
XFS_RANDOM_WB_DELAY_MS,
XFS_RANDOM_WRITE_DELAY_MS,
XFS_RANDOM_EXCHMAPS_FINISH_ONE,
XFS_RANDOM_METAFILE_RESV_CRITICAL,
};
#define XFS_ERRTAG(_tag, _name, _default) \
[XFS_ERRTAG_##_tag] = (_default),
#include "xfs_errortag.h"
static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS };
#undef XFS_ERRTAG
struct xfs_errortag_attr {
struct attribute attr;
@@ -93,21 +49,18 @@ xfs_errortag_attr_store(
size_t count)
{
struct xfs_mount *mp = to_mp(kobject);
struct xfs_errortag_attr *xfs_attr = to_attr(attr);
unsigned int error_tag = to_attr(attr)->tag;
int ret;
unsigned int val;
if (strcmp(buf, "default") == 0) {
val = xfs_errortag_random_default[xfs_attr->tag];
mp->m_errortag[error_tag] =
xfs_errortag_random_default[error_tag];
} else {
ret = kstrtouint(buf, 0, &val);
ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]);
if (ret)
return ret;
}
ret = xfs_errortag_set(mp, xfs_attr->tag, val);
if (ret)
return ret;
return count;
}
@@ -118,10 +71,9 @@ xfs_errortag_attr_show(
char *buf)
{
struct xfs_mount *mp = to_mp(kobject);
struct xfs_errortag_attr *xfs_attr = to_attr(attr);
unsigned int error_tag = to_attr(attr)->tag;
return snprintf(buf, PAGE_SIZE, "%u\n",
xfs_errortag_get(mp, xfs_attr->tag));
return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]);
}
static const struct sysfs_ops xfs_errortag_sysfs_ops = {
@@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = {
.store = xfs_errortag_attr_store,
};
#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \
#define XFS_ERRTAG(_tag, _name, _default) \
static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \
.attr = {.name = __stringify(_name), \
.mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \
.tag = (_tag), \
}
#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr
XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR);
XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1);
XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2);
XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3);
XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4);
XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5);
XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6);
XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF);
XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK);
XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK);
XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF);
XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI);
XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP);
XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK);
XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE);
XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE);
XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK);
XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR);
XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR);
XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR);
XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR);
XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT);
XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT);
XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE);
XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
.tag = XFS_ERRTAG_##_tag, \
};
#include "xfs_errortag.h"
XFS_ERRTAGS
#undef XFS_ERRTAG
#define XFS_ERRTAG(_tag, _name, _default) \
&xfs_errortag_attr_##_name.attr,
#include "xfs_errortag.h"
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
XFS_ERRORTAG_ATTR_LIST(iflush1),
XFS_ERRORTAG_ATTR_LIST(iflush2),
XFS_ERRORTAG_ATTR_LIST(iflush3),
XFS_ERRORTAG_ATTR_LIST(iflush4),
XFS_ERRORTAG_ATTR_LIST(iflush5),
XFS_ERRORTAG_ATTR_LIST(iflush6),
XFS_ERRORTAG_ATTR_LIST(dareadbuf),
XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk),
XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk),
XFS_ERRORTAG_ATTR_LIST(readagf),
XFS_ERRORTAG_ATTR_LIST(readagi),
XFS_ERRORTAG_ATTR_LIST(itobp),
XFS_ERRORTAG_ATTR_LIST(iunlink),
XFS_ERRORTAG_ATTR_LIST(iunlinkrm),
XFS_ERRORTAG_ATTR_LIST(dirinovalid),
XFS_ERRORTAG_ATTR_LIST(bulkstat),
XFS_ERRORTAG_ATTR_LIST(logiodone),
XFS_ERRORTAG_ATTR_LIST(stratread),
XFS_ERRORTAG_ATTR_LIST(stratcmpl),
XFS_ERRORTAG_ATTR_LIST(diowrite),
XFS_ERRORTAG_ATTR_LIST(bmapifmt),
XFS_ERRORTAG_ATTR_LIST(free_extent),
XFS_ERRORTAG_ATTR_LIST(rmap_finish_one),
XFS_ERRORTAG_ATTR_LIST(refcount_continue_update),
XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
XFS_ERRORTAG_ATTR_LIST(larp),
XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit),
NULL,
XFS_ERRTAGS
NULL
};
ATTRIBUTE_GROUPS(xfs_errortag);
#undef XFS_ERRTAG
/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */
static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX);
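The centralized error-tag table above is the classic X-macro pattern: one list expands into the enum, the defaults array, and the sysfs attribute definitions, so they can never drift apart. A self-contained miniature (with invented tag names) shows the mechanics:

```c
#include <assert.h>
#include <string.h>

/* One master list; each entry is (TAG, name, default). */
#define DEMO_TAGS \
	DEMO_TAG(NOERROR, noerror, 0) \
	DEMO_TAG(IFLUSH,  iflush,  100) \
	DEMO_TAG(IOERR,   ioerr,   250)

/* Expansion 1: the enum of tag indices, terminated by a MAX sentinel. */
#define DEMO_TAG(_tag, _name, _default) DEMO_TAG_##_tag,
enum { DEMO_TAGS DEMO_TAG_MAX };
#undef DEMO_TAG

/* Expansion 2: the table of default injection frequencies. */
#define DEMO_TAG(_tag, _name, _default) [DEMO_TAG_##_tag] = (_default),
static const unsigned int demo_defaults[] = { DEMO_TAGS };
#undef DEMO_TAG

/* Expansion 3: the table of sysfs-visible names. */
#define DEMO_TAG(_tag, _name, _default) #_name,
static const char *demo_names[] = { DEMO_TAGS };
#undef DEMO_TAG
```

Adding a new tag means editing exactly one line of the list, and a static_assert on the array size (as in the patch) catches any expansion that falls out of sync.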
static const struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
@@ -295,7 +165,6 @@ xfs_errortag_enabled(
bool
xfs_errortag_test(
struct xfs_mount *mp,
const char *expression,
const char *file,
int line,
unsigned int error_tag)
@@ -321,35 +190,11 @@ xfs_errortag_test(
return false;
xfs_warn_ratelimited(mp,
"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
expression, file, line, mp->m_super->s_id);
"Injecting error at file %s, line %d, on filesystem \"%s\"",
file, line, mp->m_super->s_id);
return true;
}
int
xfs_errortag_get(
struct xfs_mount *mp,
unsigned int error_tag)
{
if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return mp->m_errortag[error_tag];
}
int
xfs_errortag_set(
struct xfs_mount *mp,
unsigned int error_tag,
unsigned int tag_value)
{
if (!xfs_errortag_valid(error_tag))
return -EINVAL;
mp->m_errortag[error_tag] = tag_value;
return 0;
}
int
xfs_errortag_add(
struct xfs_mount *mp,
@@ -359,9 +204,8 @@ xfs_errortag_add(
if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return xfs_errortag_set(mp, error_tag,
xfs_errortag_random_default[error_tag]);
mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag];
return 0;
}
int

View File

@@ -8,22 +8,17 @@
struct xfs_mount;
extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, const void *buf, size_t bufsize,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
const char *filename, int linenum, xfs_failaddr_t failaddr);
void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp,
const void *buf, size_t bufsize, const char *filename,
int linenum, xfs_failaddr_t failaddr);
void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
extern void xfs_verifier_error(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name,
const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr);
void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name,
const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
@@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
#define XFS_CORRUPTION_DUMP_LEN (128)
#ifdef DEBUG
extern int xfs_errortag_init(struct xfs_mount *mp);
extern void xfs_errortag_del(struct xfs_mount *mp);
extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
const char *file, int line, unsigned int error_tag);
#define XFS_TEST_ERROR(expr, mp, tag) \
((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
int xfs_errortag_init(struct xfs_mount *mp);
void xfs_errortag_del(struct xfs_mount *mp);
bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line,
unsigned int error_tag);
#define XFS_TEST_ERROR(mp, tag) \
xfs_errortag_test((mp), __FILE__, __LINE__, (tag))
bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
#define XFS_ERRORTAG_DELAY(mp, tag) \
do { \
@@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
mdelay((mp)->m_errortag[(tag)]); \
} while (0)
extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
unsigned int tag_value);
extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
extern int xfs_errortag_clearall(struct xfs_mount *mp);
int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
int xfs_errortag_clearall(struct xfs_mount *mp);
#else
#define xfs_errortag_init(mp) (0)
#define xfs_errortag_del(mp)
#define XFS_TEST_ERROR(expr, mp, tag) (expr)
#define XFS_TEST_ERROR(mp, tag) (false)
#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0)
#define xfs_errortag_set(mp, tag, val) (ENOSYS)
#define xfs_errortag_add(mp, tag) (ENOSYS)
#define xfs_errortag_clearall(mp) (ENOSYS)
#endif /* DEBUG */

View File

@@ -202,7 +202,7 @@ xfs_efi_copy_format(
sizeof(struct xfs_extent));
return 0;
} else if (buf->iov_len == len32) {
xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base;
struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -216,7 +216,7 @@ xfs_efi_copy_format(
}
return 0;
} else if (buf->iov_len == len64) {
xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base;
struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;

View File

@@ -49,7 +49,7 @@ struct xfs_efi_log_item {
struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
xfs_efi_log_format_t efi_format;
struct xfs_efi_log_format efi_format;
};
static inline size_t
@@ -69,7 +69,7 @@ struct xfs_efd_log_item {
struct xfs_log_item efd_item;
struct xfs_efi_log_item *efd_efip;
uint efd_next_extent;
xfs_efd_log_format_t efd_format;
struct xfs_efd_log_format efd_format;
};
static inline size_t

View File

@@ -75,52 +75,47 @@ xfs_dir_fsync(
return xfs_log_force_inode(ip);
}
static xfs_csn_t
xfs_fsync_seq(
struct xfs_inode *ip,
bool datasync)
{
if (!xfs_ipincount(ip))
return 0;
if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
return 0;
return ip->i_itemp->ili_commit_seq;
}
/*
* All metadata updates are logged, which means that we just have to flush the
* log up to the latest LSN that touched the inode.
* All metadata updates are logged, which means that we just have to push the
 * journal to the required sequence number that holds the updates. We track
 * datasync commits separately from full sync commits, and hence only need to
* select the correct sequence number for the log force here.
*
* If we have concurrent fsync/fdatasync() calls, we need them to all block on
* the log force before we clear the ili_fsync_fields field. This ensures that
* we don't get a racing sync operation that does not wait for the metadata to
* hit the journal before returning. If we race with clearing ili_fsync_fields,
* then all that will happen is the log force will do nothing as the lsn will
* already be on disk. We can't race with setting ili_fsync_fields because that
* is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
* shared until after the ili_fsync_fields is cleared.
* We don't have to serialise against concurrent modifications, as we do not
* have to wait for modifications that have not yet completed. We define a
* transaction commit as completing when the commit sequence number is updated,
 * hence if the sequence number has not been updated, the sync operation has been
* run before the commit completed and we don't have to wait for it.
*
* If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
* set on the log item until - at least - the journal flush completes. In
* reality, they are only cleared when the inode is fully unpinned (i.e.
* persistent in the journal and not dirty in the CIL), and so we rely on
* xfs_log_force_seq() either skipping sequences that have been persisted or
* waiting on sequences that are still in flight to correctly order concurrent
* sync operations.
*/
static int
static int
xfs_fsync_flush_log(
struct xfs_inode *ip,
bool datasync,
int *log_flushed)
{
int error = 0;
xfs_csn_t seq;
struct xfs_inode_log_item *iip = ip->i_itemp;
xfs_csn_t seq = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
seq = xfs_fsync_seq(ip, datasync);
if (seq) {
error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
spin_lock(&iip->ili_lock);
if (datasync)
seq = iip->ili_datasync_seq;
else
seq = iip->ili_commit_seq;
spin_unlock(&iip->ili_lock);
if (!seq)
return 0;
return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
log_flushed);
spin_lock(&ip->i_itemp->ili_lock);
ip->i_itemp->ili_fsync_fields = 0;
spin_unlock(&ip->i_itemp->ili_lock);
}
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return error;
}
STATIC int
@@ -158,12 +153,10 @@ xfs_file_fsync(
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
* Any inode that has dirty modifications in the log is pinned. The
* racy check here for a pinned inode will not catch modifications
* that happen concurrently to the fsync call, but fsync semantics
* only require to sync previously completed I/O.
* If the inode has an inode log item attached, it may need the journal
* flushed to persist any changes the log item might be tracking.
*/
if (xfs_ipincount(ip)) {
if (ip->i_itemp) {
err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
if (err2 && !error)
error = err2;

View File

@@ -14,8 +14,6 @@
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
.panic_mask = { 0, 0, XFS_PTAG_MASK},
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},

View File

@@ -646,8 +646,7 @@ xfs_iget_cache_miss(
goto out_destroy;
/*
* For version 5 superblocks, if we are initialising a new inode and we
* are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
* For version 5 superblocks, if we are initialising a new inode, we
* simply build the new inode core with a random generation number.
*
* For version 4 (and older) superblocks, log recovery is dependent on
@@ -655,8 +654,7 @@ xfs_iget_cache_miss(
* value and hence we must also read the inode off disk even when
* initializing new inodes.
*/
if (xfs_has_v3inodes(mp) &&
(flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) {
VFS_I(ip)->i_generation = get_random_u32();
} else {
struct xfs_buf *bp;

View File

@@ -877,6 +877,35 @@ xfs_create_tmpfile(
return error;
}
static inline int
xfs_projid_differ(
struct xfs_inode *tdp,
struct xfs_inode *sip)
{
/*
* If we are using project inheritance, we only allow hard link/renames
* creation in our tree when the project IDs are the same; else
* the tree quota mechanism could be circumvented.
*/
if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
tdp->i_projid != sip->i_projid)) {
/*
* Project quota setup skips special files which can
* leave inodes in a PROJINHERIT directory without a
* project ID set. We need to allow links to be made
* to these "project-less" inodes because userspace
* expects them to succeed after project ID setup,
* but everything else should be rejected.
*/
if (!special_file(VFS_I(sip)->i_mode) ||
sip->i_projid != 0) {
return -EXDEV;
}
}
return 0;
}
int
xfs_link(
struct xfs_inode *tdp,
@@ -930,27 +959,9 @@ xfs_link(
goto error_return;
}
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
* the tree quota mechanism could be circumvented.
*/
if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
tdp->i_projid != sip->i_projid)) {
/*
* Project quota setup skips special files which can
* leave inodes in a PROJINHERIT directory without a
* project ID set. We need to allow links to be made
* to these "project-less" inodes because userspace
* expects them to succeed after project ID setup,
* but everything else should be rejected.
*/
if (!special_file(VFS_I(sip)->i_mode) ||
sip->i_projid != 0) {
error = -EXDEV;
goto error_return;
}
}
error = xfs_projid_differ(tdp, sip);
if (error)
goto error_return;
error = xfs_dir_add_child(tp, resblks, &du);
if (error)
@@ -1656,7 +1667,6 @@ retry:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
spin_unlock(&iip->ili_lock);
ASSERT(iip->ili_last_fields);
@@ -1821,12 +1831,20 @@ static void
xfs_iunpin(
struct xfs_inode *ip)
{
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
struct xfs_inode_log_item *iip = ip->i_itemp;
xfs_csn_t seq = 0;
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
spin_lock(&iip->ili_lock);
seq = iip->ili_commit_seq;
spin_unlock(&iip->ili_lock);
if (!seq)
return;
/* Give the log a push to start the unpinning I/O */
xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
xfs_log_force_seq(ip->i_mount, seq, 0, NULL);
}
@@ -2227,16 +2245,9 @@ retry:
if (du_wip.ip)
xfs_trans_ijoin(tp, du_wip.ip, 0);
/*
* If we are using project inheritance, we only allow renames
* into our tree when the project IDs are the same; else the
* tree quota mechanism would be circumvented.
*/
if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
target_dp->i_projid != src_ip->i_projid)) {
error = -EXDEV;
error = xfs_projid_differ(target_dp, src_ip);
if (error)
goto out_trans_cancel;
}
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE) {
@@ -2377,8 +2388,8 @@ xfs_iflush(
* error handling as the caller will shutdown and fail the buffer.
*/
error = -EFSCORRUPTED;
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1)) {
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) ||
XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
@@ -2394,29 +2405,27 @@ xfs_iflush(
goto flush_out;
}
} else if (S_ISREG(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
mp, XFS_ERRTAG_IFLUSH_3)) {
if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_df.if_format != XFS_DINODE_FMT_BTREE) ||
XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad regular inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
mp, XFS_ERRTAG_IFLUSH_4)) {
if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) ||
XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad directory inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
}
if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %llu, "
"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
@@ -2425,8 +2434,8 @@ xfs_iflush(
ip->i_nblocks, ip);
goto flush_out;
}
if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
mp, XFS_ERRTAG_IFLUSH_6)) {
if (ip->i_forkoff > mp->m_sb.sb_inodesize ||
XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_forkoff, ip);
@@ -2502,7 +2511,6 @@ flush_out:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
spin_unlock(&iip->ili_lock);
@@ -2661,12 +2669,15 @@ int
xfs_log_force_inode(
struct xfs_inode *ip)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
xfs_csn_t seq = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
seq = ip->i_itemp->ili_commit_seq;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!iip)
return 0;
spin_lock(&iip->ili_lock);
seq = iip->ili_commit_seq;
spin_unlock(&iip->ili_lock);
if (!seq)
return 0;

View File

@@ -131,46 +131,28 @@ xfs_inode_item_precommit(
}
/*
* Inode verifiers do not check that the extent size hint is an integer
* multiple of the rt extent size on a directory with both rtinherit
* and extszinherit flags set. If we're logging a directory that is
* misconfigured in this way, clear the hint.
* Inode verifiers do not check that the extent size hints are an
* integer multiple of the rt extent size on a directory with
* rtinherit flags set. If we're logging a directory that is
* misconfigured in this way, clear the bad hints.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
flags |= XFS_ILOG_CORE;
if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) {
if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
flags |= XFS_ILOG_CORE;
}
if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
ip->i_cowextsize = 0;
flags |= XFS_ILOG_CORE;
}
}
/*
* Record the specific change for fdatasync optimisation. This allows
* fdatasync to skip log forces for inodes that are only timestamp
* dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
* to XFS_ILOG_CORE so that the actual on-disk dirty tracking
* (ili_fields) correctly tracks that the version has changed.
*/
spin_lock(&iip->ili_lock);
iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
if (flags & XFS_ILOG_IVERSION)
flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
/*
* Inode verifiers do not check that the CoW extent size hint is an
* integer multiple of the rt extent size on a directory with both
* rtinherit and cowextsize flags set. If we're logging a directory
* that is misconfigured in this way, clear the hint.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
ip->i_cowextsize = 0;
flags |= XFS_ILOG_CORE;
}
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
@@ -204,6 +186,20 @@ xfs_inode_item_precommit(
xfs_trans_brelse(tp, bp);
}
/*
* Store the dirty flags back into the inode item as this state is used
* later on in xfs_inode_item_committing() to determine whether the
* transaction is relevant to fsync state or not.
*/
iip->ili_dirty_flags = flags;
/*
* Convert the flags to on-disk fields that have been modified in the
* transaction so that ili_fields tracks the changes correctly.
*/
if (flags & XFS_ILOG_IVERSION)
flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
/*
* Always OR in the bits from the ili_last_fields field. This is to
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
@@ -214,12 +210,6 @@ xfs_inode_item_precommit(
spin_unlock(&iip->ili_lock);
xfs_inode_item_precommit_check(ip);
/*
* We are done with the log item transaction dirty state, so clear it so
* that it doesn't pollute future transactions.
*/
iip->ili_dirty_flags = 0;
return 0;
}
@@ -729,13 +719,24 @@ xfs_inode_item_unpin(
struct xfs_log_item *lip,
int remove)
{
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
/*
* If this is the last unpin, then the inode no longer needs a journal
* flush to persist it. Hence we can clear the commit sequence numbers
* as an fsync/fdatasync operation on the inode at this point is a no-op.
*/
if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
iip->ili_commit_seq = 0;
iip->ili_datasync_seq = 0;
spin_unlock(&iip->ili_lock);
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
}
STATIC uint
@@ -858,12 +859,45 @@ xfs_inode_item_committed(
return lsn;
}
/*
* The modification is now complete, so before we unlock the inode we need to
* update the commit sequence numbers for data integrity journal flushes. We
* always record the commit sequence number (ili_commit_seq) so that anything
* that needs a full journal sync will capture all of this modification.
*
* We then check if the changes will impact a datasync (O_DSYNC) journal
* flush. If the
* changes will require a datasync flush, then we also record the sequence in
* ili_datasync_seq.
*
* These commit sequence numbers will get cleared atomically with the inode being
* unpinned (i.e. pin count goes to zero), and so they will only be set when the
* inode is dirty in the journal. This removes the need for checking if the
* inode is pinned to determine if a journal flush is necessary, and hence
* removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to
* serialise pin counts against commit sequence number updates.
*/
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
xfs_csn_t seq)
{
INODE_ITEM(lip)->ili_commit_seq = seq;
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
spin_lock(&iip->ili_lock);
iip->ili_commit_seq = seq;
if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
iip->ili_datasync_seq = seq;
spin_unlock(&iip->ili_lock);
/*
* Clear the per-transaction dirty flags now that we have finished
* recording the transaction's inode modifications in the CIL and are
* about to release and (maybe) unlock the inode.
*/
iip->ili_dirty_flags = 0;
return xfs_inode_item_release(lip);
}
@@ -1055,7 +1089,6 @@ xfs_iflush_abort_clean(
{
iip->ili_last_fields = 0;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);

View File

@@ -32,9 +32,17 @@ struct xfs_inode_log_item {
spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
unsigned int ili_fsync_fields; /* logged since last fsync */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
/*
* We record the sequence number for every inode modification, as
* well as those that only require fdatasync operations for data
* integrity. This allows optimisation of the O_DSYNC/fdatasync path
* without needing to track what modifications the journal is currently
* carrying for the inode. These are protected by the above ili_lock.
*/
xfs_csn_t ili_commit_seq; /* last transaction commit */
xfs_csn_t ili_datasync_seq; /* for datasync optimisation */
};
static inline int xfs_inode_clean(struct xfs_inode *ip)

View File

@@ -512,9 +512,6 @@ xfs_fileattr_get(
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
if (d_is_special(dentry))
return -ENOTTY;
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -736,9 +733,6 @@ xfs_fileattr_set(
trace_xfs_ioctl_setattr(ip);
if (d_is_special(dentry))
return -ENOTTY;
if (!fa->fsx_valid) {
if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
FS_NOATIME_FL | FS_NODUMP_FL |
@@ -1209,21 +1203,21 @@ xfs_file_ioctl(
current->comm);
return -ENOTTY;
case XFS_IOC_DIOINFO: {
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
struct kstat st;
struct dioattr da;
da.d_mem = target->bt_logical_sectorsize;
error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0);
if (error)
return error;
/*
* See xfs_report_dioalign() for an explanation about why this
* reports a value larger than the sector size for COW inodes.
* Some userspace directly feeds the return value to
* posix_memalign, which fails for values that are smaller than
* the pointer size. Round up the value to not break userspace.
*/
if (xfs_is_cow_inode(ip))
da.d_miniosz = xfs_inode_alloc_unitsize(ip);
else
da.d_miniosz = target->bt_logical_sectorsize;
da.d_mem = roundup(st.dio_mem_align, sizeof(void *));
da.d_miniosz = st.dio_offset_align;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
if (copy_to_user(arg, &da, sizeof(da)))
return -EFAULT;
return 0;

View File

@@ -149,9 +149,18 @@ xfs_bmbt_to_iomap(
iomap->bdev = target->bt_bdev;
iomap->flags = iomap_flags;
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
/*
* If the inode is dirty for datasync purposes, let iomap know so it
* doesn't elide the IO completion journal flushes on O_DSYNC IO.
*/
if (ip->i_itemp) {
struct xfs_inode_log_item *iip = ip->i_itemp;
spin_lock(&iip->ili_lock);
if (iip->ili_datasync_seq)
iomap->flags |= IOMAP_F_DIRTY;
spin_unlock(&iip->ili_lock);
}
iomap->validity_cookie = sequence_cookie;
return 0;
@@ -1554,7 +1563,7 @@ xfs_zoned_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
@@ -1728,7 +1737,7 @@ xfs_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;

View File

@@ -431,14 +431,12 @@ xfs_vn_symlink(
struct dentry *dentry,
const char *symname)
{
struct inode *inode;
struct xfs_inode *cip = NULL;
struct xfs_name name;
int error;
umode_t mode;
struct inode *inode;
struct xfs_inode *cip = NULL;
struct xfs_name name;
int error;
umode_t mode = S_IFLNK | S_IRWXUGO;
mode = S_IFLNK |
(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
error = xfs_dentry_mode_to_name(&name, dentry, mode);
if (unlikely(error))
goto out;
@@ -1335,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.setattr = xfs_vn_setattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
.fileattr_get = xfs_fileattr_get,
.fileattr_set = xfs_fileattr_set,
};
/* Figure out if this file actually supports DAX. */

View File

@@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t;
#undef XFS_NATIVE_HOST
#endif
#define irix_sgid_inherit xfs_params.sgid_inherit.val
#define irix_symlink_mode xfs_params.symlink_mode.val
#define xfs_panic_mask xfs_params.panic_mask.val
#define xfs_error_level xfs_params.error_level.val
#define xfs_syncd_centisecs xfs_params.syncd_timer.val

View File

@@ -969,8 +969,8 @@ xfs_log_unmount_write(
* counters will be recalculated. Refer to xlog_check_unmount_rec for
* more details.
*/
if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) ||
XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
return;
@@ -1240,7 +1240,7 @@ xlog_ioend_work(
/*
* Race to shutdown the filesystem if we see an error.
*/
if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
@@ -1567,13 +1567,13 @@ xlog_cksum(
struct xlog *log,
struct xlog_rec_header *rhead,
char *dp,
int size)
unsigned int hdrsize,
unsigned int size)
{
uint32_t crc;
/* first generate the crc for the record header ... */
crc = xfs_start_cksum_update((char *)rhead,
sizeof(struct xlog_rec_header),
crc = xfs_start_cksum_update((char *)rhead, hdrsize,
offsetof(struct xlog_rec_header, h_crc));
/* ... then for additional cycle data for v2 logs ... */
@@ -1817,7 +1817,7 @@ xlog_sync(
/* calculate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
iclog->ic_datap, size);
iclog->ic_datap, XLOG_REC_SIZE, size);
/*
* Intentionally corrupt the log record CRC based on the error injection
* frequency, if defined. This facilitates testing log recovery in the
@@ -1826,7 +1826,7 @@ xlog_sync(
* detects the bad CRC and attempts to recover.
*/
#ifdef DEBUG
if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_fail_crc = true;
xfs_warn(log->l_mp,
@@ -2655,10 +2655,11 @@ restart:
* until you know exactly how many bytes get copied. Therefore, wait
* until later to update ic_offset.
*
* xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
* xlog_write() algorithm assumes that at least 2 xlog_op_header's
* can fit into remaining data section.
*/
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
if (iclog->ic_size - iclog->ic_offset <
2 * sizeof(struct xlog_op_header)) {
int error = 0;
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
@@ -3152,11 +3153,11 @@ xlog_calc_unit_res(
*/
/* for trans header */
unit_bytes += sizeof(xlog_op_header_t);
unit_bytes += sizeof(xfs_trans_header_t);
unit_bytes += sizeof(struct xlog_op_header);
unit_bytes += sizeof(struct xfs_trans_header);
/* for start-rec */
unit_bytes += sizeof(xlog_op_header_t);
unit_bytes += sizeof(struct xlog_op_header);
/*
* for LR headers - the space for data in an iclog is the size minus
@@ -3179,12 +3180,12 @@ xlog_calc_unit_res(
num_headers = howmany(unit_bytes, iclog_space);
/* for split-recs - ophdrs added when data split over LRs */
unit_bytes += sizeof(xlog_op_header_t) * num_headers;
unit_bytes += sizeof(struct xlog_op_header) * num_headers;
/* add extra header reservations if we overrun */
while (!num_headers ||
howmany(unit_bytes, iclog_space) > num_headers) {
unit_bytes += sizeof(xlog_op_header_t);
unit_bytes += sizeof(struct xlog_op_header);
num_headers++;
}
unit_bytes += log->l_iclog_hsize * num_headers;
@@ -3321,7 +3322,7 @@ xlog_verify_iclog(
struct xlog_in_core *iclog,
int count)
{
xlog_op_header_t *ophead;
struct xlog_op_header *ophead;
xlog_in_core_t *icptr;
xlog_in_core_2_t *xhdr;
void *base_ptr, *ptr, *p;
@@ -3399,7 +3400,7 @@ xlog_verify_iclog(
op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
}
}
ptr += sizeof(xlog_op_header_t) + op_len;
ptr += sizeof(struct xlog_op_header) + op_len;
}
}
#endif

View File

@@ -20,6 +20,43 @@ struct xfs_log_vec {
int lv_alloc_size; /* size of allocated lv */
};
/* Region types for iovec's i_type */
#define XLOG_REG_TYPE_BFORMAT 1
#define XLOG_REG_TYPE_BCHUNK 2
#define XLOG_REG_TYPE_EFI_FORMAT 3
#define XLOG_REG_TYPE_EFD_FORMAT 4
#define XLOG_REG_TYPE_IFORMAT 5
#define XLOG_REG_TYPE_ICORE 6
#define XLOG_REG_TYPE_IEXT 7
#define XLOG_REG_TYPE_IBROOT 8
#define XLOG_REG_TYPE_ILOCAL 9
#define XLOG_REG_TYPE_IATTR_EXT 10
#define XLOG_REG_TYPE_IATTR_BROOT 11
#define XLOG_REG_TYPE_IATTR_LOCAL 12
#define XLOG_REG_TYPE_QFORMAT 13
#define XLOG_REG_TYPE_DQUOT 14
#define XLOG_REG_TYPE_QUOTAOFF 15
#define XLOG_REG_TYPE_LRHEADER 16
#define XLOG_REG_TYPE_UNMOUNT 17
#define XLOG_REG_TYPE_COMMIT 18
#define XLOG_REG_TYPE_TRANSHDR 19
#define XLOG_REG_TYPE_ICREATE 20
#define XLOG_REG_TYPE_RUI_FORMAT 21
#define XLOG_REG_TYPE_RUD_FORMAT 22
#define XLOG_REG_TYPE_CUI_FORMAT 23
#define XLOG_REG_TYPE_CUD_FORMAT 24
#define XLOG_REG_TYPE_BUI_FORMAT 25
#define XLOG_REG_TYPE_BUD_FORMAT 26
#define XLOG_REG_TYPE_ATTRI_FORMAT 27
#define XLOG_REG_TYPE_ATTRD_FORMAT 28
#define XLOG_REG_TYPE_ATTR_NAME 29
#define XLOG_REG_TYPE_ATTR_VALUE 30
#define XLOG_REG_TYPE_XMI_FORMAT 31
#define XLOG_REG_TYPE_XMD_FORMAT 32
#define XLOG_REG_TYPE_ATTR_NEWNAME 33
#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
#define XLOG_REG_TYPE_MAX 34
#define XFS_LOG_VEC_ORDERED (-1)
/*

View File

@@ -499,8 +499,8 @@ xlog_recover_finish(
extern void
xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, unsigned int hdrsize, unsigned int size);
extern struct kmem_cache *xfs_log_ticket_cache;
struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,

View File

@@ -2894,20 +2894,34 @@ xlog_recover_process(
int pass,
struct list_head *buffer_list)
{
__le32 old_crc = rhead->h_crc;
__le32 crc;
__le32 expected_crc = rhead->h_crc, crc, other_crc;
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE,
be32_to_cpu(rhead->h_len));
/*
* Look at the end of the struct xlog_rec_header definition in
* xfs_log_format.h for the gory details.
*/
if (expected_crc && crc != expected_crc) {
other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER,
be32_to_cpu(rhead->h_len));
if (other_crc == expected_crc) {
xfs_notice_once(log->l_mp,
"Fixing up incorrect CRC due to padding.");
crc = other_crc;
}
}
/*
* Nothing else to do if this is a CRC verification pass. Just return
* if this is a record with a non-zero crc. Unfortunately, mkfs always
* sets old_crc to 0 so we must consider this valid even on v5 supers.
* Otherwise, return EFSBADCRC on failure so the callers up the stack
* know precisely what failed.
* sets expected_crc to 0 so we must consider this valid even on v5
* supers. Otherwise, return EFSBADCRC on failure so the callers up the
* stack know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
if (old_crc && crc != old_crc)
if (expected_crc && crc != expected_crc)
return -EFSBADCRC;
return 0;
}
@@ -2918,11 +2932,11 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
if (crc != old_crc) {
if (old_crc || xfs_has_crc(log->l_mp)) {
if (crc != expected_crc) {
if (expected_crc || xfs_has_crc(log->l_mp)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
le32_to_cpu(old_crc),
le32_to_cpu(expected_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
}

View File

@@ -1057,19 +1057,6 @@ xfs_mountfs(
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
/*
* Now that we've recovered any pending superblock feature bit
* additions, we can finish setting up the attr2 behaviour for the
* mount. The noattr2 option overrides the superblock flag, so only
* check the superblock feature flag if the mount option is not set.
*/
if (xfs_has_noattr2(mp)) {
mp->m_features &= ~XFS_FEAT_ATTR2;
} else if (!xfs_has_attr2(mp) &&
(mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
mp->m_features |= XFS_FEAT_ATTR2;
}
if (xfs_has_metadir(mp)) {
error = xfs_mount_setup_metadir(mp);
if (error)

View File

@@ -363,7 +363,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */
#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */
#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */
#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */
#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */
#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */
#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */
@@ -386,7 +385,6 @@ typedef struct xfs_mount {
/* Mount features */
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred
@@ -396,7 +394,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */
#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */
#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */
#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/
#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */
#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */
#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */
@@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN)
__XFS_HAS_V4_FEAT(logv2, LOGV2)
__XFS_HAS_V4_FEAT(extflg, EXTFLG)
__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT)
__XFS_ADD_V4_FEAT(attr2, ATTR2)
__XFS_ADD_V4_FEAT(projid32, PROJID32)
__XFS_HAS_V4_FEAT(v3inodes, V3INODES)
__XFS_HAS_V4_FEAT(crc, CRC)
__XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
static inline void xfs_add_attr2(struct xfs_mount *mp)
{
if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4))
xfs_sb_version_addattr2(&mp->m_sb);
}
/*
* Mount features
*
@@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
* bit inodes and read-only state, are kept as operational state rather than
* features.
*/
__XFS_HAS_FEAT(noattr2, NOATTR2)
__XFS_HAS_FEAT(noalign, NOALIGN)
__XFS_HAS_FEAT(allocsize, ALLOCSIZE)
__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE)
@@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC)
__XFS_HAS_FEAT(discard, DISCARD)
__XFS_HAS_FEAT(grpid, GRPID)
__XFS_HAS_FEAT(small_inums, SMALL_INUMS)
__XFS_HAS_FEAT(ikeep, IKEEP)
__XFS_HAS_FEAT(swalloc, SWALLOC)
__XFS_HAS_FEAT(filestreams, FILESTREAMS)
__XFS_HAS_FEAT(dax_always, DAX_ALWAYS)

View File

@@ -165,7 +165,7 @@ xfs_dax_translate_range(
uint64_t *bblen)
{
u64 dev_start = btp->bt_dax_part_off;
u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
u64 dev_len = BBTOB(btp->bt_nr_sectors);
u64 dev_end = dev_start + dev_len - 1;
/* Notify failure on the whole device. */

View File

@@ -105,8 +105,8 @@ enum {
Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
Opt_largeio, Opt_nolargeio,
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
@@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("norecovery", Opt_norecovery),
fsparam_flag("inode64", Opt_inode64),
fsparam_flag("inode32", Opt_inode32),
fsparam_flag("ikeep", Opt_ikeep),
fsparam_flag("noikeep", Opt_noikeep),
fsparam_flag("largeio", Opt_largeio),
fsparam_flag("nolargeio", Opt_nolargeio),
fsparam_flag("attr2", Opt_attr2),
fsparam_flag("noattr2", Opt_noattr2),
fsparam_flag("filestreams", Opt_filestreams),
fsparam_flag("quota", Opt_quota),
fsparam_flag("noquota", Opt_noquota),
@@ -175,13 +171,11 @@ xfs_fs_show_options(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
{ XFS_FEAT_IKEEP, ",ikeep" },
{ XFS_FEAT_WSYNC, ",wsync" },
{ XFS_FEAT_NOALIGN, ",noalign" },
{ XFS_FEAT_SWALLOC, ",swalloc" },
{ XFS_FEAT_NOUUID, ",nouuid" },
{ XFS_FEAT_NORECOVERY, ",norecovery" },
{ XFS_FEAT_ATTR2, ",attr2" },
{ XFS_FEAT_FILESTREAMS, ",filestreams" },
{ XFS_FEAT_GRPID, ",grpid" },
{ XFS_FEAT_DISCARD, ",discard" },
@@ -541,7 +535,8 @@ xfs_setup_devices(
{
int error;
error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize,
mp->m_sb.sb_dblocks);
if (error)
return error;
@@ -551,7 +546,7 @@ xfs_setup_devices(
if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_configure_buftarg(mp->m_logdev_targp,
log_sector_size);
log_sector_size, mp->m_sb.sb_logblocks);
if (error)
return error;
}
@@ -565,7 +560,7 @@ xfs_setup_devices(
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_configure_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks);
if (error)
return error;
}
@@ -1088,15 +1083,6 @@ xfs_finish_flags(
}
}
/*
* V5 filesystems always use attr2 format for attributes.
*/
if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
"attr2 is always enabled for V5 filesystems.");
return -EINVAL;
}
/*
* prohibit r/w mounts of read-only filesystems
*/
@@ -1543,22 +1529,6 @@ xfs_fs_parse_param(
return 0;
#endif
/* Following mount options will be removed in September 2025 */
case Opt_ikeep:
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
parsing_mp->m_features |= XFS_FEAT_IKEEP;
return 0;
case Opt_noikeep:
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
return 0;
case Opt_attr2:
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
parsing_mp->m_features |= XFS_FEAT_ATTR2;
return 0;
case Opt_noattr2:
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@@ -1594,16 +1564,6 @@ xfs_fs_validate_params(
return -EINVAL;
}
/*
* We have not read the superblock at this point, so only the attr2
* mount option can set the attr2 feature by this stage.
*/
if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
return -EINVAL;
}
if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
xfs_warn(mp,
"sunit and swidth options incompatible with the noalign option");
@@ -2178,21 +2138,6 @@ xfs_fs_reconfigure(
if (error)
return error;
/* attr2 -> noattr2 */
if (xfs_has_noattr2(new_mp)) {
if (xfs_has_crc(mp)) {
xfs_warn(mp,
"attr2 is always enabled for a V5 filesystem - can't be changed.");
return -EINVAL;
}
mp->m_features &= ~XFS_FEAT_ATTR2;
mp->m_features |= XFS_FEAT_NOATTR2;
} else if (xfs_has_attr2(new_mp)) {
/* noattr2 -> attr2 */
mp->m_features &= ~XFS_FEAT_NOATTR2;
mp->m_features |= XFS_FEAT_ATTR2;
}
/* Validate new max_atomic_write option before making other changes */
if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
error = xfs_set_max_atomic_write_opt(mp,


@@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
STATIC int
static inline int
xfs_deprecated_dointvec_minmax(
const struct ctl_table *ctl,
int write,
@@ -67,24 +67,6 @@ xfs_deprecated_dointvec_minmax(
}
static const struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
{
.procname = "irix_symlink_mode",
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
{
.procname = "panic_mask",
.data = &xfs_params.panic_mask.val,
@@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = {
.extra1 = &xfs_params.blockgc_timer.min,
.extra2 = &xfs_params.blockgc_timer.max,
},
{
.procname = "speculative_cow_prealloc_lifetime",
.data = &xfs_params.blockgc_timer.val,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.blockgc_timer.min,
.extra2 = &xfs_params.blockgc_timer.max,
},
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{


@@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val {
} xfs_sysctl_val_t;
typedef struct xfs_param {
xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
* not a member of parent dir GID. */
xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */


@@ -452,19 +452,17 @@ xfs_trans_mod_sb(
*/
STATIC void
xfs_trans_apply_sb_deltas(
xfs_trans_t *tp)
struct xfs_trans *tp)
{
struct xfs_dsb *sbp;
struct xfs_buf *bp;
int whole = 0;
bp = xfs_trans_getsb(tp);
sbp = bp->b_addr;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *bp = xfs_trans_getsb(tp);
struct xfs_dsb *sbp = bp->b_addr;
int whole = 0;
/*
* Only update the superblock counters if we are logging them
*/
if (!xfs_has_lazysbcount((tp->t_mountp))) {
if (!xfs_has_lazysbcount(mp)) {
if (tp->t_icount_delta)
be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
if (tp->t_ifree_delta)
@@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas(
* write the correct value ondisk.
*/
if ((tp->t_frextents_delta || tp->t_res_frextents_delta) &&
!xfs_has_rtgroups(tp->t_mountp)) {
struct xfs_mount *mp = tp->t_mountp;
!xfs_has_rtgroups(mp)) {
int64_t rtxdelta;
rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
@@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas(
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
mp->m_ddev_targp->bt_nr_sectors +=
XFS_FSB_TO_BB(mp, tp->t_dblocks_delta);
whole = 1;
}
if (tp->t_agcount_delta) {
@@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas(
* recompute the ondisk rtgroup block log. The incore values
* will be recomputed in xfs_trans_unreserve_and_mod_sb.
*/
if (xfs_has_rtgroups(tp->t_mountp)) {
if (xfs_has_rtgroups(mp)) {
sbp->sb_rgblklog = xfs_compute_rgblklog(
be32_to_cpu(sbp->sb_rgextents),
be32_to_cpu(sbp->sb_rextsize));
@@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas(
}
if (tp->t_rblocks_delta) {
be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
mp->m_rtdev_targp->bt_nr_sectors +=
XFS_FSB_TO_BB(mp, tp->t_rblocks_delta);
whole = 1;
}
if (tp->t_rextents_delta) {


@@ -374,7 +374,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*


@@ -493,64 +493,58 @@ xfs_try_open_zone(
return oz;
}
/*
* For data with short or medium lifetime, try to colocate it into an
* already open zone with a matching temperature.
*/
static bool
xfs_colocate_eagerly(
enum rw_hint file_hint)
{
switch (file_hint) {
case WRITE_LIFE_MEDIUM:
case WRITE_LIFE_SHORT:
case WRITE_LIFE_NONE:
return true;
default:
return false;
}
}
enum xfs_zone_alloc_score {
/* Any open zone will do it, we're desperate */
XFS_ZONE_ALLOC_ANY = 0,
static bool
xfs_good_hint_match(
struct xfs_open_zone *oz,
enum rw_hint file_hint)
{
switch (oz->oz_write_hint) {
case WRITE_LIFE_LONG:
case WRITE_LIFE_EXTREME:
/* colocate long and extreme */
if (file_hint == WRITE_LIFE_LONG ||
file_hint == WRITE_LIFE_EXTREME)
return true;
break;
case WRITE_LIFE_MEDIUM:
/* colocate medium with medium */
if (file_hint == WRITE_LIFE_MEDIUM)
return true;
break;
case WRITE_LIFE_SHORT:
case WRITE_LIFE_NONE:
case WRITE_LIFE_NOT_SET:
/* colocate short and none */
if (file_hint <= WRITE_LIFE_SHORT)
return true;
break;
}
return false;
}
/* It better fit somehow */
XFS_ZONE_ALLOC_OK = 1,
/* Only reuse a zone if it fits really well. */
XFS_ZONE_ALLOC_GOOD = 2,
};
/*
* Life time hint co-location matrix. Fields not set default to 0
* aka XFS_ZONE_ALLOC_ANY.
*/
static const unsigned int
xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
[WRITE_LIFE_NOT_SET] = {
[WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK,
},
[WRITE_LIFE_NONE] = {
[WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK,
},
[WRITE_LIFE_SHORT] = {
[WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD,
},
[WRITE_LIFE_MEDIUM] = {
[WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD,
},
[WRITE_LIFE_LONG] = {
[WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
[WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
},
[WRITE_LIFE_EXTREME] = {
[WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
[WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
},
};
static bool
xfs_try_use_zone(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
struct xfs_open_zone *oz,
bool lowspace)
unsigned int goodness)
{
if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
if (!lowspace && !xfs_good_hint_match(oz, file_hint))
if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
return false;
if (!atomic_inc_not_zero(&oz->oz_ref))
return false;
@@ -581,14 +575,14 @@ static struct xfs_open_zone *
xfs_select_open_zone_lru(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
bool lowspace)
unsigned int goodness)
{
struct xfs_open_zone *oz;
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
if (xfs_try_use_zone(zi, file_hint, oz, goodness))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -651,9 +645,11 @@ xfs_select_zone_nowait(
* data.
*/
spin_lock(&zi->zi_open_zones_lock);
if (xfs_colocate_eagerly(write_hint))
oz = xfs_select_open_zone_lru(zi, write_hint, false);
else if (pack_tight)
oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
if (oz)
goto out_unlock;
if (pack_tight)
oz = xfs_select_open_zone_mru(zi, write_hint);
if (oz)
goto out_unlock;
@@ -667,16 +663,16 @@ xfs_select_zone_nowait(
goto out_unlock;
/*
* Try to colocate cold data with other cold data if we failed to open a
* new zone for it.
* Try to find a zone that is an OK match to colocate data with.
*/
if (write_hint != WRITE_LIFE_NOT_SET &&
!xfs_colocate_eagerly(write_hint))
oz = xfs_select_open_zone_lru(zi, write_hint, false);
if (!oz)
oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
if (!oz)
oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
if (oz)
goto out_unlock;
/*
* Pick the least recently used zone, regardless of hint match
*/
oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
spin_unlock(&zi->zi_open_zones_lock);
return oz;
@@ -1135,7 +1131,7 @@ xfs_calc_open_zones(
if (bdev_open_zones)
mp->m_max_open_zones = bdev_open_zones;
else
mp->m_max_open_zones = xfs_max_open_zones(mp);
mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
}
if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
@@ -1248,7 +1244,7 @@ xfs_mount_zones(
if (!mp->m_zone_info)
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks size (%u max open)",
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
mp->m_max_open_zones);
trace_xfs_zones_mount(mp);


@@ -14,6 +14,7 @@ enum rw_hint {
WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
WRITE_LIFE_HINT_NR,
} __packed;
/* Sparse ignores __packed annotations on enums, hence the #ifndef below. */