diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 1b7385e23b34..9f7133e02576 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -106,6 +106,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_symlink.o \
 				   xfs_sysfs.o \
 				   xfs_trans.o \
+				   xfs_verify_media.o \
 				   xfs_xattr.o
 
 # low-level transaction/log code
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index a01303c5de6c..d165de607d17 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1160,6 +1160,34 @@ struct xfs_health_file_on_monitored_fs {
 	__u32		flags;	/* zero for now */
 };
 
+/* Verify the media of the underlying devices */
+struct xfs_verify_media {
+	__u32	me_dev;		/* I: XFS_DEV_{DATA,LOG,RT} */
+	__u32	me_flags;	/* I: XFS_VERIFY_MEDIA_* */
+
+	/*
+	 * IO: inclusive start of disk range to verify, in 512b blocks.
+	 * Will be adjusted upwards as media verification succeeds.
+	 */
+	__u64	me_start_daddr;
+
+	/*
+	 * IO: exclusive end of the disk range to verify, in 512b blocks.
+	 * Can be adjusted downwards to match device size.
+	 */
+	__u64	me_end_daddr;
+
+	__u32	me_ioerror;	/* O: I/O error (positive) */
+	__u32	me_max_io_size;	/* I: maximum IO size in bytes */
+
+	__u32	me_rest_us;	/* I: rest time between IOs, usecs */
+	__u32	me_pad;		/* zero */
+};
+
+#define XFS_VERIFY_MEDIA_REPORT	(1 << 0)	/* report to fsnotify */
+
+#define XFS_VERIFY_MEDIA_FLAGS	(XFS_VERIFY_MEDIA_REPORT)
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1202,6 +1230,8 @@ struct xfs_health_file_on_monitored_fs {
 #define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 #define XFS_IOC_HEALTH_FD_ON_MONITORED_FS \
 				_IOW ('X', 69, struct xfs_health_file_on_monitored_fs)
+#define XFS_IOC_VERIFY_MEDIA	_IOWR('X', 70, struct xfs_verify_media)
+
 /*
  * ioctl commands that replace IRIX syssgi()'s
  */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c04c41ca924e..80a005999d2d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
 #include "xfs_healthmon.h"
+#include "xfs_verify_media.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1422,6 +1423,8 @@ xfs_file_ioctl(
 
 	case XFS_IOC_HEALTH_MONITOR:
 		return xfs_ioc_health_monitor(filp, arg);
+	case XFS_IOC_VERIFY_MEDIA:
+		return xfs_ioc_verify_media(filp, arg);
 
 	default:
 		return -ENOTTY;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0cf487775358..3483461cf462 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -6320,6 +6320,104 @@ TRACE_EVENT(xfs_healthmon_report_file_ioerror,
 		  __entry->error)
 );
 
+TRACE_EVENT(xfs_verify_media,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+		 const struct folio *folio),
+	TP_ARGS(mp, me, fdev, daddr, bbcount, folio),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(unsigned int, flags)
+		__field(xfs_daddr_t, daddr)
+		__field(uint64_t, bbcount)
+		__field(unsigned int, bufsize)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->flags = me->me_flags;
+		__entry->daddr = daddr;
+		__entry->bbcount = bbcount;
+		__entry->bufsize = folio_size(folio);
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx bufsize 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->flags,
+		  __entry->daddr,
+		  __entry->bbcount,
+		  __entry->bufsize)
+);
+
+TRACE_EVENT(xfs_verify_media_end,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev),
+	TP_ARGS(mp, me, fdev),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(int, ioerror)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->ioerror = me->me_ioerror;
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx ioerror %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->ioerror)
+);
+
+TRACE_EVENT(xfs_verify_media_error,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+		 blk_status_t status),
+	TP_ARGS(mp, me, fdev, daddr, bbcount, status),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(unsigned int, flags)
+		__field(xfs_daddr_t, daddr)
+		__field(uint64_t, bbcount)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->flags = me->me_flags;
+		__entry->daddr = daddr;
+		__entry->bbcount = bbcount;
+		__entry->error = blk_status_to_errno(status);
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx error %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->flags,
+		  __entry->daddr,
+		  __entry->bbcount,
+		  __entry->error)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c
new file mode 100644
index 000000000000..f4f620c98d92
--- /dev/null
+++ b/fs/xfs/xfs_verify_media.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_ag.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_health.h"
+#include "xfs_healthmon.h"
+#include "xfs_trace.h"
+#include "xfs_verify_media.h"
+
+#include <linux/fserror.h>
+
+/* Range of group blocks covered by a failed read, passed to the rmap walk */
+struct xfs_group_data_lost {
+	xfs_agblock_t		startblock;
+	xfs_extlen_t		blockcount;
+};
+
+/* Report lost file data from rmap records */
+static int
+xfs_verify_report_data_lost(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*data)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_inode		*ip;
+	struct xfs_group_data_lost	*lost = data;
+	xfs_fileoff_t			fileoff = rec->rm_offset;
+	xfs_extlen_t			blocks = rec->rm_blockcount;
+	const bool			is_attr =
+			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
+	const xfs_agblock_t		lost_end =
+			lost->startblock + lost->blockcount;
+	const xfs_agblock_t		rmap_end =
+			rec->rm_startblock + rec->rm_blockcount;
+	int				error = 0;
+
+	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+		return 0;
+
+	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
+	if (error)
+		return 0;
+
+	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
+		goto out_rele;
+	}
+
+	if (is_attr) {
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
+		goto out_rele;
+	}
+
+	if (lost->startblock > rec->rm_startblock) {
+		fileoff += lost->startblock - rec->rm_startblock;
+		blocks -= lost->startblock - rec->rm_startblock;
+	}
+	if (rmap_end > lost_end)
+		blocks -= rmap_end - lost_end;
+
+	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
+			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
+
+out_rele:
+	xfs_irele(ip);
+	return 0;
+}
+
+/* Walk reverse mappings to look for all file data loss */
+static int
+xfs_verify_report_losses(
+	struct xfs_mount	*mp,
+	enum xfs_group_type	type,
+	xfs_daddr_t		daddr,
+	u64			bblen)
+{
+	struct xfs_group	*xg = NULL;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		start_bno, end_bno;
+	uint32_t		start_gno, end_gno;
+	int			error;
+
+	if (type == XG_TYPE_RTG) {
+		start_bno = xfs_daddr_to_rtb(mp, daddr);
+		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+	} else {
+		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
+		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
+	}
+
+	tp = xfs_trans_alloc_empty(mp);
+	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
+	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
+	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
+		struct xfs_buf		*agf_bp = NULL;
+		struct xfs_rtgroup	*rtg = NULL;
+		struct xfs_btree_cur	*cur;
+		struct xfs_rmap_irec	ri_low = { };
+		struct xfs_rmap_irec	ri_high;
+		struct xfs_group_data_lost lost;
+
+		if (type == XG_TYPE_AG) {
+			struct xfs_perag	*pag = to_perag(xg);
+
+			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+			if (error) {
+				xfs_perag_put(pag);
+				break;
+			}
+
+			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+		} else {
+			rtg = to_rtg(xg);
+			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+		}
+
+		/*
+		 * Set the rmap range from ri_low to ri_high, which represents
+		 * a [start, end] where we are looking for files or metadata.
+		 */
+		memset(&ri_high, 0xFF, sizeof(ri_high));
+		if (xg->xg_gno == start_gno)
+			ri_low.rm_startblock =
+				xfs_fsb_to_gbno(mp, start_bno, type);
+		if (xg->xg_gno == end_gno)
+			ri_high.rm_startblock =
+				xfs_fsb_to_gbno(mp, end_bno, type);
+
+		/*
+		 * ri_high.rm_startblock stays all ones unless this is the last
+		 * group, so widen it before adding one; a 32-bit sum would
+		 * wrap to zero and corrupt the length of the lost range.
+		 */
+		lost.startblock = ri_low.rm_startblock;
+		lost.blockcount = min_t(u64, xg->xg_block_count,
+					(u64)ri_high.rm_startblock + 1) -
+							ri_low.rm_startblock;
+
+		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+				xfs_verify_report_data_lost, &lost);
+		xfs_btree_del_cursor(cur, error);
+		if (agf_bp)
+			xfs_trans_brelse(tp, agf_bp);
+		if (rtg)
+			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+		if (error) {
+			xfs_group_put(xg);
+			break;
+		}
+	}
+
+	xfs_trans_cancel(tp);
+	return 0;
+}
+
+/*
+ * Compute the desired verify IO size.
+ *
+ * To minimize command overhead, we'd like to create bios that are 1MB, though
+ * we allow the user to ask for a smaller size.
+ */
+static unsigned int
+xfs_verify_iosize(
+	const struct xfs_verify_media	*me,
+	struct xfs_buftarg		*btp,
+	uint64_t			bbcount)
+{
+	unsigned int			iosize =
+			min_not_zero(SZ_1M, me->me_max_io_size);
+
+	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
+	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
+
+	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
+			BBTOB(bbcount));
+}
+
+/*
+ * Allocate as much memory as we can get for the verification buffer, falling
+ * back to progressively smaller orders if the larger ones are unavailable.
+ */
+static struct folio *
+xfs_verify_alloc_folio(
+	const unsigned int	iosize)
+{
+	unsigned int		order = get_order(iosize);
+
+	while (order > 0) {
+		struct folio	*folio =
+			folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+
+		if (folio)
+			return folio;
+		order--;
+	}
+
+	return folio_alloc(GFP_KERNEL, 0);
+}
+
+/* Report any kind of problem verifying media */
+static void
+xfs_verify_media_error(
+	struct xfs_mount	*mp,
+	struct xfs_verify_media	*me,
+	struct xfs_buftarg	*btp,
+	xfs_daddr_t		daddr,
+	unsigned int		bio_bbcount,
+	blk_status_t		bio_status)
+{
+	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
+			bio_bbcount, bio_status);
+
+	/*
+	 * Pass any error, I/O or otherwise, up to the caller if we didn't
+	 * successfully verify any bytes at all.
+	 */
+	if (me->me_start_daddr == daddr)
+		me->me_ioerror = -blk_status_to_errno(bio_status);
+
+	/*
+	 * PI validation failures, medium errors, or general IO errors are
+	 * treated as indicators of data loss.  Everything else is (hopefully)
+	 * a transient error and is not reported to healthmon or fsnotify.
+	 */
+	switch (bio_status) {
+	case BLK_STS_PROTECTION:
+	case BLK_STS_IOERR:
+	case BLK_STS_MEDIUM:
+		break;
+	default:
+		return;
+	}
+
+	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
+		return;
+
+	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
+
+	if (!xfs_has_rmapbt(mp))
+		return;
+
+	switch (me->me_dev) {
+	case XFS_DEV_DATA:
+		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
+		break;
+	case XFS_DEV_RT:
+		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
+		break;
+	}
+}
+
+/* Verify the media of an xfs device by submitting read requests to the disk. */
+static int
+xfs_verify_media(
+	struct xfs_mount	*mp,
+	struct xfs_verify_media	*me)
+{
+	struct xfs_buftarg	*btp = NULL;
+	struct bio		*bio;
+	struct folio		*folio;
+	xfs_daddr_t		daddr;
+	uint64_t		bbcount;
+	int			error = 0;
+
+	me->me_ioerror = 0;
+
+	switch (me->me_dev) {
+	case XFS_DEV_DATA:
+		btp = mp->m_ddev_targp;
+		break;
+	case XFS_DEV_LOG:
+		if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
+			btp = mp->m_logdev_targp;
+		break;
+	case XFS_DEV_RT:
+		btp = mp->m_rtdev_targp;
+		break;
+	}
+	if (!btp)
+		return -ENODEV;
+
+	/*
+	 * If the caller told us to verify beyond the end of the disk, tell the
+	 * user exactly where that was.
+	 */
+	if (me->me_end_daddr > btp->bt_nr_sectors)
+		me->me_end_daddr = btp->bt_nr_sectors;
+
+	/* start and end have to be aligned to the lba size */
+	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
+			bdev_logical_block_size(btp->bt_bdev)))
+		return -EINVAL;
+
+	/*
+	 * end_daddr is the exclusive end of the range, so if start_daddr
+	 * reaches there (or beyond), there's no work to be done.
+	 */
+	if (me->me_start_daddr >= me->me_end_daddr)
+		return 0;
+
+	/*
+	 * There are three ranges involved here:
+	 *
+	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
+	 *    user wants to verify.  end_daddr can be beyond the end of the
+	 *    disk; we'll constrain it to the end if necessary.
+	 *
+	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
+	 *    verified.  We update daddr after each successful read.
+	 *    me->me_start_daddr is set to daddr before returning.
+	 *
+	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
+	 *    verifying.
+	 */
+	daddr = me->me_start_daddr;
+	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
+			me->me_start_daddr;
+
+	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
+	if (!folio)
+		return -ENOMEM;
+
+	trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
+			folio);
+
+	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
+	if (!bio) {
+		error = -ENOMEM;
+		goto out_folio;
+	}
+
+	while (bbcount > 0) {
+		unsigned int	bio_bbcount;
+		blk_status_t	bio_status;
+
+		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
+		bio->bi_iter.bi_sector = daddr;
+		bio_add_folio_nofail(bio, folio,
+				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
+				0);
+
+		/*
+		 * Save the length of the bio before we submit it, because we
+		 * need the original daddr and length for reporting IO errors
+		 * if the bio fails.
+		 */
+		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+		submit_bio_wait(bio);
+		bio_status = bio->bi_status;
+		if (bio_status != BLK_STS_OK) {
+			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
+					bio_status);
+			error = 0;
+			break;
+		}
+
+		daddr += bio_bbcount;
+		bbcount -= bio_bbcount;
+
+		if (bbcount == 0)
+			break;
+
+		if (me->me_rest_us) {
+			ktime_t	expires;
+
+			/*
+			 * Widen before scaling to nanoseconds; the u32 product
+			 * would wrap for rest times longer than ~4.29 seconds.
+			 */
+			expires = ktime_add_ns(ktime_get(),
+					(u64)me->me_rest_us * NSEC_PER_USEC);
+			set_current_state(TASK_KILLABLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
+
+		if (fatal_signal_pending(current)) {
+			error = -EINTR;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	bio_put(bio);
+out_folio:
+	folio_put(folio);
+
+	if (error)
+		return error;
+
+	/*
+	 * Advance start_daddr to the end of what we verified if there wasn't
+	 * an operational error.
+	 */
+	me->me_start_daddr = daddr;
+	trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
+	return 0;
+}
+
+/*
+ * XFS_IOC_VERIFY_MEDIA handler: validate the request, run the verification,
+ * and copy the updated range and I/O error status back to userspace.
+ */
+int
+xfs_ioc_verify_media(
+	struct file			*file,
+	struct xfs_verify_media __user	*arg)
+{
+	struct xfs_verify_media		me;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&me, arg, sizeof(me)))
+		return -EFAULT;
+
+	if (me.me_pad)
+		return -EINVAL;
+	if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
+		return -EINVAL;
+
+	switch (me.me_dev) {
+	case XFS_DEV_DATA:
+	case XFS_DEV_LOG:
+	case XFS_DEV_RT:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	error = xfs_verify_media(mp, &me);
+	if (error)
+		return error;
+
+	if (copy_to_user(arg, &me, sizeof(me)))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/fs/xfs/xfs_verify_media.h b/fs/xfs/xfs_verify_media.h
new file mode 100644
index 000000000000..dc6eee9c8863
--- /dev/null
+++ b/fs/xfs/xfs_verify_media.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong
+ */
+#ifndef __XFS_VERIFY_MEDIA_H__
+#define __XFS_VERIFY_MEDIA_H__
+
+struct xfs_verify_media;
+int xfs_ioc_verify_media(struct file *file,
+		struct xfs_verify_media __user *arg);
+
+#endif /* __XFS_VERIFY_MEDIA_H__ */