diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 0ed10aeff86b..09a9d4aca0fd 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -609,6 +609,51 @@ Description: enabled, and whether tags are shared. +What: /sys/block/<disk>/queue/async_depth +Date: August 2025 +Contact: linux-block@vger.kernel.org +Description: + [RW] Controls how many asynchronous requests may be allocated + in the block layer. The value is always capped at nr_requests. + + When no elevator is active (none): + + - async_depth is always equal to nr_requests. + + For the bfq scheduler: + + - By default, async_depth is set to 75% of nr_requests. + Internal limits are then derived from this value: + + * Sync writes: limited to async_depth (≈75% of nr_requests). + * Async I/O: limited to ~2/3 of async_depth (≈50% of + nr_requests). + + If a bfq_queue is weight-raised: + + * Sync writes: limited to ~1/2 of async_depth (≈37% of + nr_requests). + * Async I/O: limited to ~1/4 of async_depth (≈18% of + nr_requests). + + - If the user writes a custom value to async_depth, BFQ will + recompute these limits proportionally based on the new value. + + For the kyber scheduler: + + - By default, async_depth is set to 75% of nr_requests. + - If the user writes a custom value to async_depth, then it + overrides the default and directly controls the limit for + writes and async I/O. + + For the mq-deadline scheduler: + + - By default, async_depth is set to nr_requests. + - If the user writes a custom value to async_depth, then it + overrides the default and directly controls the limit for + writes and async I/O. + + What: /sys/block/<disk>/queue/nr_zones Date: November 2018 Contact: Damien Le Moal diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index b9dc0c9dbee4..11126ed6f40f 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -135,7 +135,6 @@ Usage of helpers: bio_first_bvec_all() bio_first_page_all() bio_first_folio_all() - bio_last_bvec_all() * The following helpers iterate over single-page segment. The passed 'struct bio_vec' will contain a single-page IO vector during the iteration:: diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 6380e6ab492b..7e0703a12dfb 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -206,6 +206,12 @@ it to a bio, given the blk_crypto_key and the data unit number that will be used for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx later, as that happens automatically when the bio is freed or reset. +To submit a bio that uses inline encryption, users must call +``blk_crypto_submit_bio()`` instead of the usual ``submit_bio()``. This will +submit the bio to the underlying driver if it supports inline crypto, or else +call the blk-crypto fallback routines before submitting normal bios to the +underlying drivers. + Finally, when done using inline encryption with a blk_crypto_key on a block_device, users must call ``blk_crypto_evict_key()``.
This ensures that the key is evicted from all keyslots it may be programmed into and unlinked from diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 8c4030bcabb6..6ad28039663d 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -260,9 +260,12 @@ The following IO commands are communicated via io_uring passthrough command, and each command is only for forwarding the IO and committing the result with specified IO tag in the command data: -- ``UBLK_IO_FETCH_REQ`` +Traditional Per-I/O Commands +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Sent from the server IO pthread for fetching future incoming IO requests +- ``UBLK_U_IO_FETCH_REQ`` + + Sent from the server I/O pthread for fetching future incoming I/O requests destined to ``/dev/ublkb*``. This command is sent only once from the server IO pthread for ublk driver to setup IO forward environment. @@ -278,7 +281,7 @@ with specified IO tag in the command data: supported by the driver, daemons must be per-queue instead - i.e. all I/Os associated to a single qid must be handled by the same task. -- ``UBLK_IO_COMMIT_AND_FETCH_REQ`` +- ``UBLK_U_IO_COMMIT_AND_FETCH_REQ`` When an IO request is destined to ``/dev/ublkb*``, the driver stores the IO's ``ublksrv_io_desc`` to the specified mapped area; then the @@ -293,7 +296,7 @@ with specified IO tag in the command data: requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ`` is reused for both fetching request and committing back IO result. -- ``UBLK_IO_NEED_GET_DATA`` +- ``UBLK_U_IO_NEED_GET_DATA`` With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly issued to ublk server without data copy. Then, IO backend of ublk server @@ -322,6 +325,59 @@ with specified IO tag in the command data: ``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy the server buffer (pages) read to the IO request pages. +Batch I/O Commands (UBLK_F_BATCH_IO) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``UBLK_F_BATCH_IO`` feature provides an alternative high-performance +I/O handling model that replaces the traditional per-I/O commands with +per-queue batch commands. This significantly reduces communication overhead +and enables better load balancing across multiple server tasks. + +Key differences from traditional mode: + +- **Per-queue vs Per-I/O**: Commands operate on queues rather than individual I/Os +- **Batch processing**: Multiple I/Os are handled in single operations +- **Multishot commands**: Use io_uring multishot for reduced submission overhead +- **Flexible task assignment**: Any task can handle any I/O (no per-I/O daemons) +- **Better load balancing**: Tasks can adjust their workload dynamically + +Batch I/O Commands: + +- ``UBLK_U_IO_PREP_IO_CMDS`` + + Prepares multiple I/O commands in batch. The server provides a buffer + containing multiple I/O descriptors that will be processed together. + This reduces the number of individual command submissions required. + +- ``UBLK_U_IO_COMMIT_IO_CMDS`` + + Commits results for multiple I/O operations in batch, and prepares the + I/O descriptors to accept new requests. The server provides a buffer + containing the results of multiple completed I/Os, allowing efficient + bulk completion of requests. + +- ``UBLK_U_IO_FETCH_IO_CMDS`` + + **Multishot command** for fetching I/O commands in batch. 
This is the key + command that enables high-performance batch processing: + + * Uses io_uring multishot capability for reduced submission overhead + * Single command can fetch multiple I/O requests over time + * Buffer size determines maximum batch size per operation + * Multiple fetch commands can be submitted for load balancing + * Only one fetch command is active at any time per queue + * Supports dynamic load balancing across multiple server tasks + + It is a typical multishot io_uring request with a provided buffer, and it + is not completed until a failure is triggered. + + Each task can submit ``UBLK_U_IO_FETCH_IO_CMDS`` with different buffer + sizes to control how much work it handles. This enables sophisticated + load balancing strategies in multi-threaded servers. + +Migration: Applications using the traditional per-I/O commands (``UBLK_U_IO_FETCH_REQ``, +``UBLK_U_IO_COMMIT_AND_FETCH_REQ``) cannot use batch mode at the same time. + Zero copy --------- diff --git a/MAINTAINERS b/MAINTAINERS index e08767323763..b27a2f5d36f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24276,6 +24276,7 @@ F: include/linux/property.h SOFTWARE RAID (Multiple Disks) SUPPORT M: Song Liu M: Yu Kuai +R: Li Nan L: linux-raid@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-raid/list/ diff --git a/block/bdev.c b/block/bdev.c index b8fbb9576110..ed022f8c48c7 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -208,7 +208,6 @@ int set_blocksize(struct file *file, int size) inode->i_blkbits = blksize_bits(size); mapping_set_folio_min_order(inode->i_mapping, get_order(size)); - kill_bdev(bdev); filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6e54b1d3d8bc..b180ce583951 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -231,7 +231,7 @@ static struct kmem_cache *bfq_pool; #define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ (get_sdist(last_pos, rq) > \ BFQQ_SEEK_THR && \ - (!blk_queue_nonrot(bfqd->queue) || \ + (blk_queue_rot(bfqd->queue) || \ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) @@ -697,7 +697,7 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) unsigned int limit, act_idx; /* Sync reads have full depth available */ - if (op_is_sync(opf) && !op_is_write(opf)) + if (blk_mq_is_sync_read(opf)) limit = data->q->nr_requests; else limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; @@ -4165,7 +4165,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* don't use too short time intervals */ if (delta_usecs < 1000) { - if (blk_queue_nonrot(bfqd->queue)) + if (!blk_queue_rot(bfqd->queue)) /* * give same worst-case guarantees as idling * for seeky @@ -4487,7 +4487,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq) { bool rot_without_queueing = - !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + blk_queue_rot(bfqd->queue) && !bfqd->hw_tag, bfqq_sequential_and_IO_bound, idling_boosts_thr; @@ -4521,7 +4521,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, * flash-based device.
*/ idling_boosts_thr = rot_without_queueing || - ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + ((blk_queue_rot(bfqd->queue) || !bfqd->hw_tag) && bfqq_sequential_and_IO_bound); /* @@ -4722,7 +4722,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) * there is only one in-flight large request * at a time. */ - if (blk_queue_nonrot(bfqd->queue) && + if (!blk_queue_rot(bfqd->queue) && blk_rq_sectors(bfqq->next_rq) >= BFQQ_SECT_THR_NONROT && bfqd->tot_rq_in_driver >= 1) @@ -6340,7 +6340,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) bfqd->hw_tag_samples = 0; bfqd->nonrot_with_queueing = - blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag; + !blk_queue_rot(bfqd->queue) && bfqd->hw_tag; } static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) @@ -7112,39 +7112,29 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) static void bfq_depth_updated(struct request_queue *q) { struct bfq_data *bfqd = q->elevator->elevator_data; - unsigned int nr_requests = q->nr_requests; + unsigned int async_depth = q->async_depth; /* - * In-word depths if no bfq_queue is being weight-raised: - * leaving 25% of tags only for sync reads. + * By default: + * - sync reads are not limited + * If bfqq is not being weight-raised: + * - sync writes are limited to 75% (the async_depth default value) + * - async I/O is limited to 50% + * If bfqq is being weight-raised: + * - sync writes are limited to ~37% + * - async I/O is limited to ~18% * - * In next formulas, right-shift the value - * (1U<<bt->sb.shift), instead of computing directly - * (1U<<(bt->sb.shift - something)), to be robust against - * any possible value of bt->sb.shift, without having to - * limit 'something'. + * If request_queue->async_depth is updated by the user, all of these + * limits are rescaled accordingly. */ - /* no more than 50% of tags for async I/O */ - bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U); - /* - * no more than 75% of tags for sync writes (25% extra tags - * w.r.t. async I/O, to prevent async I/O from starving sync - * writes) - */ - bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U); + bfqd->async_depths[0][1] = async_depth; + bfqd->async_depths[0][0] = max(async_depth * 2 / 3, 1U); + bfqd->async_depths[1][1] = max(async_depth >> 1, 1U); + bfqd->async_depths[1][0] = max(async_depth >> 2, 1U); /* - * In-word depths in case some bfq_queue is being weight- - * raised: leaving ~63% of tags for sync reads. This is the - * highest percentage for which, in our tests, application - * start-up times didn't suffer from any regression due to tag - * shortage. + * Due to cgroup QoS, the number of requests allowed for a bfqq might be only 1 */ - /* no more than ~18% of tags for async I/O */ - bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); - /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); - blk_mq_set_min_shallow_depth(q, 1); } @@ -7293,7 +7283,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) INIT_HLIST_HEAD(&bfqd->burst_list); bfqd->hw_tag = -1; - bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue); + bfqd->nonrot_with_queueing = !blk_queue_rot(bfqd->queue); bfqd->bfq_max_budget = bfq_default_max_budget; @@ -7328,9 +7318,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) * Begin by assuming, optimistically, that the device peak * rate is equal to 2/3 of the highest reference rate.
*/ - bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * - ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->rate_dur_prod = ref_rate[!blk_queue_rot(bfqd->queue)] * + ref_wr_duration[!blk_queue_rot(bfqd->queue)]; + bfqd->peak_rate = ref_rate[!blk_queue_rot(bfqd->queue)] * 2 / 3; /* see comments on the definition of next field inside bfq_data */ bfqd->actuator_load_threshold = 4; @@ -7365,6 +7355,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q); wbt_disable_default(q->disk); blk_stat_enable_accounting(q); + q->async_depth = (q->nr_requests * 3) >> 2; return 0; diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index cff025b06be1..44dcdf7520c5 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -52,19 +52,7 @@ static bool bip_should_check(struct bio_integrity_payload *bip) static bool bi_offload_capable(struct blk_integrity *bi) { - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - return bi->metadata_size == sizeof(struct crc64_pi_tuple); - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - return bi->metadata_size == sizeof(struct t10_pi_tuple); - default: - pr_warn_once("%s: unknown integrity checksum type:%d\n", - __func__, bi->csum_type); - fallthrough; - case BLK_INTEGRITY_CSUM_NONE: - return false; - } + return bi->metadata_size == bi->pi_tuple_size; } /** diff --git a/block/bio.c b/block/bio.c index 40f690985bfb..1e31ca495d8d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -301,9 +301,12 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { + struct bio_vec *bv = bio->bi_io_vec; + bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); + bio->bi_io_vec = bv; bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); @@ -1196,8 +1199,8 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); - bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); diff --git a/block/blk-core.c b/block/blk-core.c index 8387fe50ea15..474700ffaa1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -114,12 +114,12 @@ static const char *const blk_op_name[] = { #undef REQ_OP_NAME /** - * blk_op_str - Return string XXX in the REQ_OP_XXX. - * @op: REQ_OP_XXX. + * blk_op_str - Return the string "name" for an operation REQ_OP_name. + * @op: a request operation. * - * Description: Centralize block layer function to convert REQ_OP_XXX into - * string format. Useful in the debugging and tracing bio or request. For - * invalid REQ_OP_XXX it returns string "UNKNOWN". + * Convert a request operation REQ_OP_name into the string "name". Useful for + * debugging and tracing BIOs and requests. For an invalid request operation + * code, the string "UNKNOWN" is returned. */ inline const char *blk_op_str(enum req_op op) { @@ -463,6 +463,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) fs_reclaim_release(GFP_KERNEL); q->nr_requests = BLKDEV_DEFAULT_RQ; + q->async_depth = BLKDEV_DEFAULT_RQ; return q; @@ -628,9 +629,6 @@ static void __submit_bio(struct bio *bio) /* If plug is not used, add new plug here to cache nsecs time. 
*/ struct blk_plug plug; - if (unlikely(!blk_crypto_bio_prep(&bio))) - return; - blk_start_plug(&plug); if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { @@ -794,6 +792,13 @@ void submit_bio_noacct(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; + if (bio_has_crypt_ctx(bio)) { + if (WARN_ON_ONCE(!bio_has_data(bio))) + goto end_io; + if (!blk_crypto_supported(bio)) + goto not_supported; + } + if (should_fail_bio(bio)) goto end_io; bio_check_ro(bio); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 86b27f96051a..a331b061dbf4 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -22,7 +22,7 @@ #include "blk-cgroup.h" #include "blk-crypto-internal.h" -static unsigned int num_prealloc_bounce_pg = 32; +static unsigned int num_prealloc_bounce_pg = BIO_MAX_VECS; module_param(num_prealloc_bounce_pg, uint, 0); MODULE_PARM_DESC(num_prealloc_bounce_pg, "Number of preallocated bounce pages for the blk-crypto crypto API fallback"); @@ -75,13 +75,13 @@ static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; static struct blk_crypto_fallback_keyslot { enum blk_crypto_mode_num crypto_mode; - struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; + struct crypto_sync_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; static struct blk_crypto_profile *blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; -static struct bio_set crypto_bio_split; +static struct bio_set enc_bio_set; /* * This is the key we set when evicting a keyslot. This *should* be the all 0's @@ -98,7 +98,7 @@ static void blk_crypto_fallback_evict_keyslot(unsigned int slot) WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID); /* Clear the key in the skcipher */ - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key, + err = crypto_sync_skcipher_setkey(slotp->tfms[crypto_mode], blank_key, blk_crypto_modes[crypto_mode].keysize); WARN_ON(err); slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; @@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, + err = crypto_sync_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, key->size); if (err) { blk_crypto_fallback_evict_keyslot(slot); @@ -144,94 +144,84 @@ static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = { static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) { struct bio *src_bio = enc_bio->bi_private; - int i; + struct page **pages = (struct page **)enc_bio->bi_io_vec; + struct bio_vec *bv; + unsigned int i; - for (i = 0; i < enc_bio->bi_vcnt; i++) - mempool_free(enc_bio->bi_io_vec[i].bv_page, - blk_crypto_bounce_page_pool); + /* + * Use the same trick as the alloc side to avoid the need for an extra + * pages array. 
+ */ + bio_for_each_bvec_all(bv, enc_bio, i) + pages[i] = bv->bv_page; - src_bio->bi_status = enc_bio->bi_status; + i = mempool_free_bulk(blk_crypto_bounce_page_pool, (void **)pages, + enc_bio->bi_vcnt); + if (i < enc_bio->bi_vcnt) + release_pages(pages + i, enc_bio->bi_vcnt - i); - bio_uninit(enc_bio); - kfree(enc_bio); + if (enc_bio->bi_status) + cmpxchg(&src_bio->bi_status, 0, enc_bio->bi_status); + + bio_put(enc_bio); bio_endio(src_bio); } -static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) +#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) + +static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src, + unsigned int nr_segs, struct page ***pages_ret) { - unsigned int nr_segs = bio_segments(bio_src); - struct bvec_iter iter; - struct bio_vec bv; + unsigned int memflags = memalloc_noio_save(); + unsigned int nr_allocated; + struct page **pages; struct bio *bio; - bio = bio_kmalloc(nr_segs, GFP_NOIO); - if (!bio) - return NULL; - bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf); + bio = bio_alloc_bioset(bio_src->bi_bdev, nr_segs, bio_src->bi_opf, + GFP_NOIO, &enc_bio_set); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); + bio->bi_private = bio_src; + bio->bi_end_io = blk_crypto_fallback_encrypt_endio; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - bio_clone_blkg_association(bio, bio_src); + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + static_assert(PAGE_PTRS_PER_BVEC > 1); + pages = (struct page **)bio->bi_io_vec; + pages += nr_segs * (PAGE_PTRS_PER_BVEC - 1); + + /* + * Try a bulk allocation first. This could leave random pages in the + * array unallocated, but we'll fix that up later in mempool_alloc_bulk. + * + * Note: alloc_pages_bulk needs the array to be zeroed, as it assumes + * any non-zero slot already contains a valid allocation. 
+ */ + memset(pages, 0, sizeof(struct page *) * nr_segs); + nr_allocated = alloc_pages_bulk(GFP_KERNEL, nr_segs, pages); + if (nr_allocated < nr_segs) + mempool_alloc_bulk(blk_crypto_bounce_page_pool, (void **)pages, + nr_segs, nr_allocated); + memalloc_noio_restore(memflags); + *pages_ret = pages; return bio; } -static bool -blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, - struct skcipher_request **ciph_req_ret, - struct crypto_wait *wait) +static struct crypto_sync_skcipher * +blk_crypto_fallback_tfm(struct blk_crypto_keyslot *slot) { - struct skcipher_request *ciph_req; - const struct blk_crypto_fallback_keyslot *slotp; - int keyslot_idx = blk_crypto_keyslot_index(slot); + const struct blk_crypto_fallback_keyslot *slotp = + &blk_crypto_keyslots[blk_crypto_keyslot_index(slot)]; - slotp = &blk_crypto_keyslots[keyslot_idx]; - ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], - GFP_NOIO); - if (!ciph_req) - return false; - - skcipher_request_set_callback(ciph_req, - CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, wait); - *ciph_req_ret = ciph_req; - - return true; -} - -static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) -{ - struct bio *bio = *bio_ptr; - unsigned int i = 0; - unsigned int num_sectors = 0; - struct bio_vec bv; - struct bvec_iter iter; - - bio_for_each_segment(bv, bio, iter) { - num_sectors += bv.bv_len >> SECTOR_SHIFT; - if (++i == BIO_MAX_VECS) - break; - } - - if (num_sectors < bio_sectors(bio)) { - bio = bio_submit_split_bioset(bio, num_sectors, - &crypto_bio_split); - if (!bio) - return false; - - *bio_ptr = bio; - } - - return true; + return slotp->tfms[slotp->crypto_mode]; } union blk_crypto_iv { @@ -248,59 +238,23 @@ static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], iv->dun[i] = cpu_to_le64(dun[i]); } -/* - * The crypto API fallback's encryption routine. - * Allocate a bounce bio for encryption, encrypt the input bio using crypto API, - * and replace *bio_ptr with the bounce bio. May split input bio if it's too - * large. Returns true on success. Returns false and sets bio->bi_status on - * error. - */ -static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) +static void __blk_crypto_fallback_encrypt_bio(struct bio *src_bio, + struct crypto_sync_skcipher *tfm) { - struct bio *src_bio, *enc_bio; - struct bio_crypt_ctx *bc; - struct blk_crypto_keyslot *slot; - int data_unit_size; - struct skcipher_request *ciph_req = NULL; - DECLARE_CRYPTO_WAIT(wait); + struct bio_crypt_ctx *bc = src_bio->bi_crypt_context; + int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + SYNC_SKCIPHER_REQUEST_ON_STACK(ciph_req, tfm); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; struct scatterlist src, dst; union blk_crypto_iv iv; - unsigned int i, j; - bool ret = false; - blk_status_t blk_st; + unsigned int nr_enc_pages, enc_idx; + struct page **enc_pages; + struct bio *enc_bio; + unsigned int i; - /* Split the bio if it's too big for single page bvec */ - if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr)) - return false; - - src_bio = *bio_ptr; - bc = src_bio->bi_crypt_context; - data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; - - /* Allocate bounce bio for encryption */ - enc_bio = blk_crypto_fallback_clone_bio(src_bio); - if (!enc_bio) { - src_bio->bi_status = BLK_STS_RESOURCE; - return false; - } - - /* - * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for - * this bio's algorithm and key. 
- */ - blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, - bc->bc_key, &slot); - if (blk_st != BLK_STS_OK) { - src_bio->bi_status = blk_st; - goto out_put_enc_bio; - } - - /* and then allocate an skcipher_request for it */ - if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { - src_bio->bi_status = BLK_STS_RESOURCE; - goto out_release_keyslot; - } + skcipher_request_set_callback(ciph_req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); sg_init_table(&src, 1); @@ -309,65 +263,159 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size, iv.bytes); - /* Encrypt each page in the bounce bio */ - for (i = 0; i < enc_bio->bi_vcnt; i++) { - struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i]; - struct page *plaintext_page = enc_bvec->bv_page; - struct page *ciphertext_page = - mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO); + /* + * Encrypt each page in the source bio. Because the source bio could + * have bio_vecs that span more than a single page, but the encrypted + * bios are limited to a single page per bio_vec, this can generate + * more than a single encrypted bio per source bio. + */ +new_bio: + nr_enc_pages = min(bio_segments(src_bio), BIO_MAX_VECS); + enc_bio = blk_crypto_alloc_enc_bio(src_bio, nr_enc_pages, &enc_pages); + enc_idx = 0; + for (;;) { + struct bio_vec src_bv = + bio_iter_iovec(src_bio, src_bio->bi_iter); + struct page *enc_page = enc_pages[enc_idx]; - enc_bvec->bv_page = ciphertext_page; - - if (!ciphertext_page) { - src_bio->bi_status = BLK_STS_RESOURCE; - goto out_free_bounce_pages; + if (!IS_ALIGNED(src_bv.bv_len | src_bv.bv_offset, + data_unit_size)) { + enc_bio->bi_status = BLK_STS_INVAL; + goto out_free_enc_bio; } - sg_set_page(&src, plaintext_page, data_unit_size, - enc_bvec->bv_offset); - sg_set_page(&dst, ciphertext_page, data_unit_size, - enc_bvec->bv_offset); + __bio_add_page(enc_bio, enc_page, src_bv.bv_len, + src_bv.bv_offset); - /* Encrypt each data unit in this page */ - for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) { + sg_set_page(&src, src_bv.bv_page, data_unit_size, + src_bv.bv_offset); + sg_set_page(&dst, enc_page, data_unit_size, src_bv.bv_offset); + + /* + * Increment the index now that the encrypted page is added to + * the bio. This is important for the error unwind path. + */ + enc_idx++; + + /* + * Encrypt each data unit in this page. + */ + for (i = 0; i < src_bv.bv_len; i += data_unit_size) { blk_crypto_dun_to_iv(curr_dun, &iv); - if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req), - &wait)) { - i++; - src_bio->bi_status = BLK_STS_IOERR; - goto out_free_bounce_pages; + if (crypto_skcipher_encrypt(ciph_req)) { + enc_bio->bi_status = BLK_STS_IOERR; + goto out_free_enc_bio; } bio_crypt_dun_increment(curr_dun, 1); src.offset += data_unit_size; dst.offset += data_unit_size; } + + bio_advance_iter_single(src_bio, &src_bio->bi_iter, + src_bv.bv_len); + if (!src_bio->bi_iter.bi_size) + break; + + if (enc_idx == nr_enc_pages) { + /* + * For each additional encrypted bio submitted, + * increment the source bio's remaining count. Each + * encrypted bio's completion handler calls bio_endio on + * the source bio, so this keeps the source bio from + * completing until the last encrypted bio does. 
+ */ + bio_inc_remaining(src_bio); + submit_bio(enc_bio); + goto new_bio; + } } - enc_bio->bi_private = src_bio; - enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio; - *bio_ptr = enc_bio; - ret = true; + submit_bio(enc_bio); + return; - enc_bio = NULL; - goto out_free_ciph_req; +out_free_enc_bio: + /* + * Add the remaining pages to the bio so that the normal completion path + * in blk_crypto_fallback_encrypt_endio frees them. The exact data + * layout does not matter for that, so don't bother iterating the source + * bio. + */ + for (; enc_idx < nr_enc_pages; enc_idx++) + __bio_add_page(enc_bio, enc_pages[enc_idx], PAGE_SIZE, 0); + bio_endio(enc_bio); +} -out_free_bounce_pages: - while (i > 0) - mempool_free(enc_bio->bi_io_vec[--i].bv_page, - blk_crypto_bounce_page_pool); -out_free_ciph_req: - skcipher_request_free(ciph_req); -out_release_keyslot: +/* + * The crypto API fallback's encryption routine. + * + * Allocate one or more bios for encryption, encrypt the input bio using the + * crypto API, and submit the encrypted bios. Sets bio->bi_status and + * completes the source bio on error + */ +static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio) +{ + struct bio_crypt_ctx *bc = src_bio->bi_crypt_context; + struct blk_crypto_keyslot *slot; + blk_status_t status; + + status = blk_crypto_get_keyslot(blk_crypto_fallback_profile, + bc->bc_key, &slot); + if (status != BLK_STS_OK) { + src_bio->bi_status = status; + bio_endio(src_bio); + return; + } + __blk_crypto_fallback_encrypt_bio(src_bio, + blk_crypto_fallback_tfm(slot)); blk_crypto_put_keyslot(slot); -out_put_enc_bio: - if (enc_bio) - bio_uninit(enc_bio); - kfree(enc_bio); - return ret; +} + +static blk_status_t __blk_crypto_fallback_decrypt_bio(struct bio *bio, + struct bio_crypt_ctx *bc, struct bvec_iter iter, + struct crypto_sync_skcipher *tfm) +{ + SYNC_SKCIPHER_REQUEST_ON_STACK(ciph_req, tfm); + u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + union blk_crypto_iv iv; + struct scatterlist sg; + struct bio_vec bv; + const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + unsigned int i; + + skcipher_request_set_callback(ciph_req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + + memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); + sg_init_table(&sg, 1); + skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size, + iv.bytes); + + /* Decrypt each segment in the bio */ + __bio_for_each_segment(bv, bio, iter, iter) { + struct page *page = bv.bv_page; + + if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) + return BLK_STS_INVAL; + + sg_set_page(&sg, page, data_unit_size, bv.bv_offset); + + /* Decrypt each data unit in the segment */ + for (i = 0; i < bv.bv_len; i += data_unit_size) { + blk_crypto_dun_to_iv(curr_dun, &iv); + if (crypto_skcipher_decrypt(ciph_req)) + return BLK_STS_IOERR; + bio_crypt_dun_increment(curr_dun, 1); + sg.offset += data_unit_size; + } + } + + return BLK_STS_OK; } /* * The crypto API fallback's main decryption routine. + * * Decrypts input bio in place, and calls bio_endio on the bio. 
*/ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) @@ -377,63 +425,19 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) struct bio *bio = f_ctx->bio; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; struct blk_crypto_keyslot *slot; - struct skcipher_request *ciph_req = NULL; - DECLARE_CRYPTO_WAIT(wait); - u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; - union blk_crypto_iv iv; - struct scatterlist sg; - struct bio_vec bv; - struct bvec_iter iter; - const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; - unsigned int i; - blk_status_t blk_st; + blk_status_t status; - /* - * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for - * this bio's algorithm and key. - */ - blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, + status = blk_crypto_get_keyslot(blk_crypto_fallback_profile, bc->bc_key, &slot); - if (blk_st != BLK_STS_OK) { - bio->bi_status = blk_st; - goto out_no_keyslot; + if (status == BLK_STS_OK) { + status = __blk_crypto_fallback_decrypt_bio(bio, bc, + f_ctx->crypt_iter, + blk_crypto_fallback_tfm(slot)); + blk_crypto_put_keyslot(slot); } - - /* and then allocate an skcipher_request for it */ - if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { - bio->bi_status = BLK_STS_RESOURCE; - goto out; - } - - memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); - sg_init_table(&sg, 1); - skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size, - iv.bytes); - - /* Decrypt each segment in the bio */ - __bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) { - struct page *page = bv.bv_page; - - sg_set_page(&sg, page, data_unit_size, bv.bv_offset); - - /* Decrypt each data unit in the segment */ - for (i = 0; i < bv.bv_len; i += data_unit_size) { - blk_crypto_dun_to_iv(curr_dun, &iv); - if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req), - &wait)) { - bio->bi_status = BLK_STS_IOERR; - goto out; - } - bio_crypt_dun_increment(curr_dun, 1); - sg.offset += data_unit_size; - } - } - -out: - skcipher_request_free(ciph_req); - blk_crypto_put_keyslot(slot); -out_no_keyslot: mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); + + bio->bi_status = status; bio_endio(bio); } @@ -466,44 +470,44 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio) /** * blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption + * @bio: bio to prepare * - * @bio_ptr: pointer to the bio to prepare + * If bio is doing a WRITE operation, allocate one or more bios to contain the + * encrypted payload and submit them. * - * If bio is doing a WRITE operation, this splits the bio into two parts if it's - * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a - * bounce bio for the first part, encrypts it, and updates bio_ptr to point to - * the bounce bio. - * - * For a READ operation, we mark the bio for decryption by using bi_private and + * For a READ operation, mark the bio for decryption by using bi_private and * bi_end_io. * - * In either case, this function will make the bio look like a regular bio (i.e. - * as if no encryption context was ever specified) for the purposes of the rest - * of the stack except for blk-integrity (blk-integrity and blk-crypto are not - * currently supported together). + * In either case, this function will make the submitted bio(s) look like + * regular bios (i.e. 
as if no encryption context was ever specified) for the + * purposes of the rest of the stack except for blk-integrity (blk-integrity and + * blk-crypto are not currently supported together). * - * Return: true on success. Sets bio->bi_status and returns false on error. + * Return: true if @bio should be submitted to the driver by the caller, else + * false. Sets bio->bi_status, calls bio_endio and returns false on error. */ -bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +bool blk_crypto_fallback_bio_prep(struct bio *bio) { - struct bio *bio = *bio_ptr; struct bio_crypt_ctx *bc = bio->bi_crypt_context; struct bio_fallback_crypt_ctx *f_ctx; if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) { /* User didn't call blk_crypto_start_using_key() first */ - bio->bi_status = BLK_STS_IOERR; + bio_io_error(bio); return false; } if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile, &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); return false; } - if (bio_data_dir(bio) == WRITE) - return blk_crypto_fallback_encrypt_bio(bio_ptr); + if (bio_data_dir(bio) == WRITE) { + blk_crypto_fallback_encrypt_bio(bio); + return false; + } /* * bio READ case: Set up a f_ctx in the bio's bi_private and set the @@ -537,7 +541,7 @@ static int blk_crypto_fallback_init(void) get_random_bytes(blank_key, sizeof(blank_key)); - err = bioset_init(&crypto_bio_split, 64, 0, 0); + err = bioset_init(&enc_bio_set, 64, 0, BIOSET_NEED_BVECS); if (err) goto out; @@ -607,7 +611,7 @@ fail_destroy_profile: fail_free_profile: kfree(blk_crypto_fallback_profile); fail_free_bioset: - bioset_exit(&crypto_bio_split); + bioset_exit(&enc_bio_set); out: return err; } @@ -641,7 +645,8 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) for (i = 0; i < blk_crypto_num_keyslots; i++) { slotp = &blk_crypto_keyslots[i]; - slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0); + slotp->tfms[mode_num] = crypto_alloc_sync_skcipher(cipher_str, + 0, 0); if (IS_ERR(slotp->tfms[mode_num])) { err = PTR_ERR(slotp->tfms[mode_num]); if (err == -ENOENT) { @@ -653,7 +658,7 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) goto out_free_tfms; } - crypto_skcipher_set_flags(slotp->tfms[mode_num], + crypto_sync_skcipher_set_flags(slotp->tfms[mode_num], CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); } @@ -667,7 +672,7 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) out_free_tfms: for (i = 0; i < blk_crypto_num_keyslots; i++) { slotp = &blk_crypto_keyslots[i]; - crypto_free_skcipher(slotp->tfms[mode_num]); + crypto_free_sync_skcipher(slotp->tfms[mode_num]); slotp->tfms[mode_num] = NULL; } out: diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index ccf6dff6ff6b..742694213529 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -86,6 +86,12 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp); +static inline bool blk_crypto_supported(struct bio *bio) +{ + return blk_crypto_config_supported_natively(bio->bi_bdev, + &bio->bi_crypt_context->bc_key->crypto_cfg); +} + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -139,6 +145,11 @@ static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, return -ENOTTY; } +static inline bool blk_crypto_supported(struct bio *bio) +{ + return false; +} + #endif /* 
CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); @@ -165,14 +176,6 @@ static inline void bio_crypt_do_front_merge(struct request *rq, #endif } -bool __blk_crypto_bio_prep(struct bio **bio_ptr); -static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) -{ - if (bio_has_crypt_ctx(*bio_ptr)) - return __blk_crypto_bio_prep(bio_ptr); - return true; -} - blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { @@ -215,12 +218,12 @@ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, return 0; } +bool blk_crypto_fallback_bio_prep(struct bio *bio); + #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); -bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); - int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ @@ -232,13 +235,6 @@ blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) return -ENOPKG; } -static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) -{ - pr_warn_once("crypto API fallback disabled; failing request.\n"); - (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; - return false; -} - static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 3e7bf1974cbd..856d3c5b1fa0 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -219,22 +219,6 @@ bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); } -/* Check that all I/O segments are data unit aligned. */ -static bool bio_crypt_check_alignment(struct bio *bio) -{ - const unsigned int data_unit_size = - bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; - struct bvec_iter iter; - struct bio_vec bv; - - bio_for_each_segment(bv, bio, iter) { - if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) - return false; - } - - return true; -} - blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, @@ -258,57 +242,41 @@ void __blk_crypto_free_request(struct request *rq) rq->crypt_ctx = NULL; } -/** - * __blk_crypto_bio_prep - Prepare bio for inline encryption +/* + * Process a bio with a crypto context. Returns true if the caller should + * submit the passed in bio, false if the bio is consumed. * - * @bio_ptr: pointer to original bio pointer - * - * If the bio crypt context provided for the bio is supported by the underlying - * device's inline encryption hardware, do nothing. - * - * Otherwise, try to perform en/decryption for this bio by falling back to the - * kernel crypto API. When the crypto API fallback is used for encryption, - * blk-crypto may choose to split the bio into 2 - the first one that will - * continue to be processed and the second one that will be resubmitted via - * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents - * of the aforementioned "first one", and *bio_ptr will be updated to this - * bounce bio. - * - * Caller must ensure bio has bio_crypt_ctx. - * - * Return: true on success; false on error (and bio->bi_status will be set - * appropriately, and bio_endio() will have been called so bio - * submission should abort). + * See the kerneldoc comment for blk_crypto_submit_bio for further details. 
*/ -bool __blk_crypto_bio_prep(struct bio **bio_ptr) +bool __blk_crypto_submit_bio(struct bio *bio) { - struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + struct block_device *bdev = bio->bi_bdev; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { - bio->bi_status = BLK_STS_IOERR; - goto fail; - } - - if (!bio_crypt_check_alignment(bio)) { - bio->bi_status = BLK_STS_INVAL; - goto fail; + bio_io_error(bio); + return false; } /* - * Success if device supports the encryption context, or if we succeeded - * in falling back to the crypto API. + * If the device does not natively support the encryption context, try to use + * the fallback if available. */ - if (blk_crypto_config_supported_natively(bio->bi_bdev, - &bc_key->crypto_cfg)) - return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) - return true; -fail: - bio_endio(*bio_ptr); - return false; + if (!blk_crypto_config_supported_natively(bdev, &bc_key->crypto_cfg)) { + if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) { + pr_warn_once("%pg: crypto API fallback disabled; failing request.\n", + bdev); + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return false; + } + return blk_crypto_fallback_bio_prep(bio); + } + + return true; } +EXPORT_SYMBOL_GPL(__blk_crypto_submit_bio); int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) diff --git a/block/blk-flush.c b/block/blk-flush.c index 43d6152897a4..403a46c86411 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq, } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a0416927d33d..ef543d163d46 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -812,7 +812,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) u64 now_ns; /* rotational? 
*/ - if (!blk_queue_nonrot(disk->queue)) + if (blk_queue_rot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 45bd18f68541..f7434278cd29 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -988,10 +988,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) u64 now = blk_time_get_ns(); int cpu; - if (blk_queue_nonrot(blkg->q)) - iolat->ssd = true; - else - iolat->ssd = false; + iolat->ssd = !blk_queue_rot(blkg->q); for_each_possible_cpu(cpu) { struct latency_stat *stat; diff --git a/block/blk-merge.c b/block/blk-merge.c index d3115d7469df..c18bc440d647 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -158,8 +158,9 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors) return bio; } -struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, - unsigned *nsegs) +static struct bio *__bio_split_discard(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs, + unsigned int max_sectors) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -169,8 +170,7 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, granularity = max(lim->discard_granularity >> 9, 1U); - max_discard_sectors = - min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); + max_discard_sectors = min(max_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) return bio; @@ -194,6 +194,19 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, return bio_submit_split(bio, split_sectors); } +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs) +{ + unsigned int max_sectors; + + if (bio_op(bio) == REQ_OP_SECURE_ERASE) + max_sectors = lim->max_secure_erase_sectors; + else + max_sectors = lim->max_discard_sectors; + + return __bio_split_discard(bio, lim, nsegs, max_sectors); +} + static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, bool is_atomic) { @@ -324,12 +337,19 @@ static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv, int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { + struct bio_crypt_ctx *bc = bio_crypt_ctx(bio); struct bio_vec bv, bvprv, *bvprvp = NULL; unsigned nsegs = 0, bytes = 0, gaps = 0; struct bvec_iter iter; + unsigned start_align_mask = lim->dma_alignment; + + if (bc) { + start_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1); + len_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1); + } bio_for_each_bvec(bv, bio, iter) { - if (bv.bv_offset & lim->dma_alignment || + if (bv.bv_offset & start_align_mask || bv.bv_len & len_align_mask) return -EINVAL; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4896525b1c05..faeaa1fc86a7 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -608,9 +608,23 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {}, }; -static void debugfs_create_files(struct dentry *parent, void *data, +static void debugfs_create_files(struct request_queue *q, struct dentry *parent, + void *data, const struct blk_mq_debugfs_attr *attr) { + lockdep_assert_held(&q->debugfs_mutex); + /* + * Creating new debugfs entries with queue freezed has the risk of + * deadlock. 
+ */ + WARN_ON_ONCE(q->mq_freeze_depth != 0); + /* + * debugfs_mutex should not be nested under other locks that can be + * grabbed while queue is frozen. + */ + lockdep_assert_not_held(&q->elevator_lock); + lockdep_assert_not_held(&q->rq_qos_mutex); + if (IS_ERR_OR_NULL(parent)) return; @@ -624,21 +638,14 @@ void blk_mq_debugfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); + debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) blk_mq_debugfs_register_hctx(q, hctx); } - if (q->rq_qos) { - struct rq_qos *rqos = q->rq_qos; - - while (rqos) { - blk_mq_debugfs_register_rqos(rqos); - rqos = rqos->next; - } - } + blk_mq_debugfs_register_rq_qos(q); } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, @@ -650,7 +657,8 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, snprintf(name, sizeof(name), "cpu%u", ctx->cpu); ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); - debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs); + debugfs_create_files(hctx->queue, ctx_dir, ctx, + blk_mq_debugfs_ctx_attrs); } void blk_mq_debugfs_register_hctx(struct request_queue *q, @@ -666,7 +674,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); - debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs); + debugfs_create_files(q, hctx->debugfs_dir, hctx, + blk_mq_debugfs_hctx_attrs); hctx_for_each_ctx(hctx, ctx, i) blk_mq_debugfs_register_ctx(hctx, ctx); @@ -686,8 +695,10 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; + mutex_lock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) @@ -717,7 +728,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); - debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs); + debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs); } void blk_mq_debugfs_unregister_sched(struct request_queue *q) @@ -741,17 +752,7 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "unknown"; } -void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) -{ - lockdep_assert_held(&rqos->disk->queue->debugfs_mutex); - - if (!rqos->disk->queue->debugfs_dir) - return; - debugfs_remove_recursive(rqos->debugfs_dir); - rqos->debugfs_dir = NULL; -} - -void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); @@ -766,7 +767,22 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) q->debugfs_dir); rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); - debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); + debugfs_create_files(q, rqos->debugfs_dir, rqos, + rqos->ops->debugfs_attrs); +} + +void blk_mq_debugfs_register_rq_qos(struct request_queue *q) +{ + lockdep_assert_held(&q->debugfs_mutex); + + if (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + + while (rqos) { + blk_mq_debugfs_register_rqos(rqos); + rqos = rqos->next; + } + 
} } void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, @@ -789,7 +805,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx->debugfs_dir); - debugfs_create_files(hctx->sched_debugfs_dir, hctx, + debugfs_create_files(q, hctx->sched_debugfs_dir, hctx, e->hctx_debugfs_attrs); } diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index c80e453e3014..49bb1aaa83dc 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -33,8 +33,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); -void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); -void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_register_rq_qos(struct request_queue *q); #else static inline void blk_mq_debugfs_register(struct request_queue *q) { @@ -74,13 +73,10 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc { } -static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q) { } -static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) -{ -} #endif #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index fb018fffffdc..3c87779cdc19 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -6,11 +6,6 @@ #include #include "blk.h" -struct phys_vec { - phys_addr_t paddr; - u32 len; -}; - static bool __blk_map_iter_next(struct blk_map_iter *iter) { if (iter->iter.bi_size) @@ -112,8 +107,8 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, struct phys_vec *vec) { enum dma_data_direction dir = rq_dma_dir(req); - unsigned int mapped = 0; unsigned int attrs = 0; + size_t mapped = 0; int error; iter->addr = state->addr; @@ -238,7 +233,6 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); * blk_rq_dma_map_iter_next - map the next DMA segment for a request * @req: request to map * @dma_dev: device to map to - * @state: DMA IOVA state * @iter: block layer DMA iterator * * Iterate to the next mapping after a previous call to @@ -253,7 +247,7 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); * returned in @iter.status. 
*/ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter) + struct blk_dma_iter *iter) { struct phys_vec vec; @@ -297,6 +291,8 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); + + WARN_ON_ONCE(overflows_type(vec.len, unsigned int)); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); nsegs++; @@ -417,6 +413,8 @@ int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) while (blk_map_iter_next(rq, &iter, &vec)) { sg = blk_next_sg(&sg, sglist); + + WARN_ON_ONCE(overflows_type(vec.len, unsigned int)); sg_set_page(sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); segments++; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 02c40a72e959..5678e15bd33c 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -137,4 +137,9 @@ static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, depth); } +static inline bool blk_mq_is_sync_read(blk_opf_t opf) +{ + return op_is_sync(opf) && !op_is_write(opf); +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 968699277c3d..5ae83d3e210f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -498,6 +498,42 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) return rq_list_pop(data->cached_rqs); } +static void blk_mq_limit_depth(struct blk_mq_alloc_data *data) +{ + struct elevator_mq_ops *ops; + + /* If no I/O scheduler has been configured, don't limit requests */ + if (!data->q->elevator) { + blk_mq_tag_busy(data->hctx); + return; + } + + /* + * All requests use scheduler tags when an I/O scheduler is + * enabled for the queue. + */ + data->rq_flags |= RQF_SCHED_TAGS; + + /* + * Flush/passthrough requests are special and go directly to the + * dispatch list, they are not subject to the async_depth limit. + */ + if ((data->cmd_flags & REQ_OP_MASK) == REQ_OP_FLUSH || + blk_op_is_passthrough(data->cmd_flags)) + return; + + WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); + data->rq_flags |= RQF_USE_SCHED; + + /* + * By default, sync requests have no limit, and async requests are + * limited to async_depth. + */ + ops = &data->q->elevator->type->ops; + if (ops->limit_depth) + ops->limit_depth(data->cmd_flags, data); +} + static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; @@ -516,31 +552,7 @@ retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); - if (q->elevator) { - /* - * All requests use scheduler tags when an I/O scheduler is - * enabled for the queue. - */ - data->rq_flags |= RQF_SCHED_TAGS; - - /* - * Flush/passthrough requests are special and go directly to the - * dispatch list. 
- */
- if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
- !blk_op_is_passthrough(data->cmd_flags)) {
- struct elevator_mq_ops *ops = &q->elevator->type->ops;
-
- WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
-
- data->rq_flags |= RQF_USE_SCHED;
- if (ops->limit_depth)
- ops->limit_depth(data->cmd_flags, data);
- }
- } else {
- blk_mq_tag_busy(data->hctx);
- }
-
+ blk_mq_limit_depth(data);
 if (data->flags & BLK_MQ_REQ_RESERVED)
 data->rq_flags |= RQF_RESV;
@@ -1156,7 +1168,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 if (rq->end_io) {
 rq_qos_done(rq->q, rq);
- if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+ if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE)
 blk_mq_free_request(rq);
 } else {
 blk_mq_free_request(rq);
@@ -1211,7 +1223,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
 * If end_io handler returns NONE, then it still has
 * ownership of the request.
 */
- if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+ if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE)
 continue;
 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -1458,7 +1470,8 @@ struct blk_rq_wait {
 blk_status_t ret;
};
-static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret,
+ const struct io_comp_batch *iob)
{
 struct blk_rq_wait *wait = rq->end_io_data;
@@ -1688,7 +1701,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi
 void blk_mq_put_rq_ref(struct request *rq)
{
 if (is_flush_rq(rq)) {
- if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+ if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE)
 blk_mq_free_request(rq);
 } else if (req_ref_put_and_test(rq)) {
 __blk_mq_free_request(rq);
@@ -4649,6 +4662,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 spin_lock_init(&q->requeue_lock);
 q->nr_requests = set->queue_depth;
+ q->async_depth = set->queue_depth;
 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 blk_mq_map_swqueue(q);
@@ -5015,6 +5029,11 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
 q->elevator->et = et;
 }
+ /*
+ * Preserve the relative value; both nr and async_depth are at most
+ * 16-bit values, so there is no need to worry about overflow.
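+ *
+ * For example, shrinking nr_requests from 256 to 128 with async_depth at
+ * 192 yields max(192 * 128 / 256, 1) = 96, preserving the configured 75%
+ * ratio across the resize.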
+ */ + q->async_depth = max(q->async_depth * nr / q->nr_requests, 1); q->nr_requests = nr; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(q); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 654478dfbc20..85cf74402a09 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -347,13 +347,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); - - if (rqos->ops->debugfs_attrs) { - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); - } - return 0; ebusy: blk_mq_unfreeze_queue(q, memflags); @@ -378,8 +371,4 @@ void rq_qos_del(struct rq_qos *rqos) if (!q->rq_qos) blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); - - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e0a70d26972b..003aa684e854 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -127,6 +127,46 @@ unlock: return ret; } +static ssize_t queue_async_depth_show(struct gendisk *disk, char *page) +{ + guard(mutex)(&disk->queue->elevator_lock); + + return queue_var_show(disk->queue->async_depth, page); +} + +static ssize_t +queue_async_depth_store(struct gendisk *disk, const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned int memflags; + unsigned long nr; + int ret; + + if (!queue_is_mq(q)) + return -EINVAL; + + ret = queue_var_store(&nr, page, count); + if (ret < 0) + return ret; + + if (nr == 0) + return -EINVAL; + + memflags = blk_mq_freeze_queue(q); + scoped_guard(mutex, &q->elevator_lock) { + if (q->elevator) { + q->async_depth = min(q->nr_requests, nr); + if (q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); + } else { + ret = -EINVAL; + } + } + blk_mq_unfreeze_queue(q, memflags); + + return ret; +} + static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; @@ -532,6 +572,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); +QUEUE_RW_ENTRY(queue_async_depth, "async_depth"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); @@ -636,11 +677,8 @@ out: static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { - struct request_queue *q = disk->queue; - struct rq_qos *rqos; ssize_t ret; s64 val; - unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) @@ -648,40 +686,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. 
- */ - memflags = blk_mq_freeze_queue(q); - - rqos = wbt_rq_qos(q); - if (!rqos) { - ret = wbt_init(disk); - if (ret) - goto out; - } - - ret = count; - if (val == -1) - val = wbt_default_latency_nsec(q); - else if (val >= 0) - val *= 1000ULL; - - if (wbt_get_min_lat(q) == val) - goto out; - - blk_mq_quiesce_queue(q); - - mutex_lock(&disk->rqos_state_mutex); - wbt_set_min_lat(q, val); - mutex_unlock(&disk->rqos_state_mutex); - - blk_mq_unquiesce_queue(q); -out: - blk_mq_unfreeze_queue(q, memflags); - - return ret; + ret = wbt_set_lat(disk, val); + return ret ? ret : count; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); @@ -754,6 +760,7 @@ static struct attribute *blk_mq_queue_attrs[] = { */ &elv_iosched_entry.attr, &queue_requests_entry.attr, + &queue_async_depth_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0974875f77bd..1415f2bf8611 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -93,6 +93,8 @@ struct rq_wb { struct rq_depth rq_depth; }; +static int wbt_init(struct gendisk *disk, struct rq_wb *rwb); + static inline struct rq_wb *RQWB(struct rq_qos *rqos) { return container_of(rqos, struct rq_wb, rqos); @@ -506,7 +508,7 @@ u64 wbt_get_min_lat(struct request_queue *q) return RQWB(rqos)->min_lat_nsec; } -void wbt_set_min_lat(struct request_queue *q, u64 val) +static void wbt_set_min_lat(struct request_queue *q, u64 val) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) @@ -696,6 +698,41 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } +static int wbt_data_dir(const struct request *rq) +{ + const enum req_op op = req_op(rq); + + if (op == REQ_OP_READ) + return READ; + else if (op_is_write(op)) + return WRITE; + + /* don't account */ + return -1; +} + +static struct rq_wb *wbt_alloc(void) +{ + struct rq_wb *rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + + if (!rwb) + return NULL; + + rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); + if (!rwb->cb) { + kfree(rwb); + return NULL; + } + + return rwb; +} + +static void wbt_free(struct rq_wb *rwb) +{ + blk_stat_free_callback(rwb->cb); + kfree(rwb); +} + /* * Enable wbt if defaults are configured that way */ @@ -737,33 +774,35 @@ EXPORT_SYMBOL_GPL(wbt_enable_default); void wbt_init_enable_default(struct gendisk *disk) { - if (__wbt_enable_default(disk)) - WARN_ON_ONCE(wbt_init(disk)); + struct request_queue *q = disk->queue; + struct rq_wb *rwb; + + if (!__wbt_enable_default(disk)) + return; + + rwb = wbt_alloc(); + if (WARN_ON_ONCE(!rwb)) + return; + + if (WARN_ON_ONCE(wbt_init(disk, rwb))) { + wbt_free(rwb); + return; + } + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rq_qos(q); + mutex_unlock(&q->debugfs_mutex); } -u64 wbt_default_latency_nsec(struct request_queue *q) +static u64 wbt_default_latency_nsec(struct request_queue *q) { /* * We default to 2msec for non-rotational storage, and 75msec * for rotational storage. 
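 *
 * wbt_set_lat() falls back to this value when userspace writes -1 to
 * wbt_lat_usec; any other non-negative value is taken as microseconds and
 * scaled by 1000, e.g. writing 75000 stores 75000000 nanoseconds.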
*/ - if (blk_queue_nonrot(q)) - return 2000000ULL; - else + if (blk_queue_rot(q)) return 75000000ULL; -} - -static int wbt_data_dir(const struct request *rq) -{ - const enum req_op op = req_op(rq); - - if (op == REQ_OP_READ) - return READ; - else if (op_is_write(op)) - return WRITE; - - /* don't account */ - return -1; + return 2000000ULL; } static void wbt_queue_depth_changed(struct rq_qos *rqos) @@ -777,8 +816,7 @@ static void wbt_exit(struct rq_qos *rqos) struct rq_wb *rwb = RQWB(rqos); blk_stat_remove_callback(rqos->disk->queue, rwb->cb); - blk_stat_free_callback(rwb->cb); - kfree(rwb); + wbt_free(rwb); } /* @@ -902,22 +940,11 @@ static const struct rq_qos_ops wbt_rqos_ops = { #endif }; -int wbt_init(struct gendisk *disk) +static int wbt_init(struct gendisk *disk, struct rq_wb *rwb) { struct request_queue *q = disk->queue; - struct rq_wb *rwb; - int i; int ret; - - rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); - if (!rwb) - return -ENOMEM; - - rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); - if (!rwb->cb) { - kfree(rwb); - return -ENOMEM; - } + int i; for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); @@ -937,15 +964,60 @@ int wbt_init(struct gendisk *disk) ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); mutex_unlock(&q->rq_qos_mutex); if (ret) - goto err_free; + return ret; blk_stat_add_callback(q, rwb->cb); - return 0; - -err_free: - blk_stat_free_callback(rwb->cb); - kfree(rwb); - return ret; - +} + +int wbt_set_lat(struct gendisk *disk, s64 val) +{ + struct request_queue *q = disk->queue; + struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_wb *rwb = NULL; + unsigned int memflags; + int ret = 0; + + if (!rqos) { + rwb = wbt_alloc(); + if (!rwb) + return -ENOMEM; + } + + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. + */ + memflags = blk_mq_freeze_queue(q); + if (!rqos) { + ret = wbt_init(disk, rwb); + if (ret) { + wbt_free(rwb); + goto out; + } + } + + if (val == -1) + val = wbt_default_latency_nsec(q); + else if (val >= 0) + val *= 1000ULL; + + if (wbt_get_min_lat(q) == val) + goto out; + + blk_mq_quiesce_queue(q); + + mutex_lock(&disk->rqos_state_mutex); + wbt_set_min_lat(q, val); + mutex_unlock(&disk->rqos_state_mutex); + + blk_mq_unquiesce_queue(q); +out: + blk_mq_unfreeze_queue(q, memflags); + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rq_qos(q); + mutex_unlock(&q->debugfs_mutex); + + return ret; } diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 925f22475738..6e39da17218b 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -4,16 +4,13 @@ #ifdef CONFIG_BLK_WBT -int wbt_init(struct gendisk *disk); void wbt_init_enable_default(struct gendisk *disk); void wbt_disable_default(struct gendisk *disk); void wbt_enable_default(struct gendisk *disk); u64 wbt_get_min_lat(struct request_queue *q); -void wbt_set_min_lat(struct request_queue *q, u64 val); -bool wbt_disabled(struct request_queue *); - -u64 wbt_default_latency_nsec(struct request_queue *); +bool wbt_disabled(struct request_queue *q); +int wbt_set_lat(struct gendisk *disk, s64 val); #else diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8000c94690ee..846b8844f04a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -112,12 +112,12 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) /** - * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. 
- * @zone_cond: BLK_ZONE_COND_XXX.
+ * blk_zone_cond_str - Return a zone condition name string
+ * @zone_cond: a zone condition BLK_ZONE_COND_name
+ *
- * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
- * into string format. Useful in the debugging and tracing zone conditions. For
- * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
+ * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful
+ * for debugging and tracing zone conditions. For an invalid zone
+ * condition, the string "UNKNOWN" is returned.
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
diff --git a/block/blk.h b/block/blk.h
index e4c433f62dfc..401d19ed08a6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -208,10 +208,14 @@ static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
 struct request_queue *q = rq->q;
 enum req_op op = req_op(rq);
- if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
+ if (unlikely(op == REQ_OP_DISCARD))
 return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT);
+ if (unlikely(op == REQ_OP_SECURE_ERASE))
+ return min(q->limits.max_secure_erase_sectors,
+ UINT_MAX >> SECTOR_SHIFT);
+
 if (unlikely(op == REQ_OP_WRITE_ZEROES))
 return q->limits.max_write_zeroes_sectors;
@@ -371,12 +375,18 @@ struct bio *bio_split_zone_append(struct bio *bio,
 static inline bool bio_may_need_split(struct bio *bio,
 const struct queue_limits *lim)
{
+ const struct bio_vec *bv;
+
 if (lim->chunk_sectors)
 return true;
- if (bio->bi_vcnt != 1)
+
+ if (!bio->bi_io_vec)
 return true;
- return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset >
- lim->max_fast_segment_size;
+
+ bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ if (bio->bi_iter.bi_size > bv->bv_len - bio->bi_iter.bi_bvec_done)
+ return true;
+ return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size;
}
/**
diff --git a/block/elevator.c b/block/elevator.c
index a2f8b2251dc6..ebe2a1fcf011 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -589,6 +589,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
 blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
 q->elevator = NULL;
 q->nr_requests = q->tag_set->queue_depth;
+ q->async_depth = q->tag_set->queue_depth;
 }
 blk_add_trace_msg(q, "elv switch: %s", ctx->name);
diff --git a/block/ioctl.c b/block/ioctl.c
index 344478348a54..fd48f82f9f03 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -692,7 +692,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 queue_max_sectors(bdev_get_queue(bdev)));
 return put_ushort(argp, max_sectors);
 case BLKROTATIONAL:
- return put_ushort(argp, !bdev_nonrot(bdev));
+ return put_ushort(argp, bdev_rot(bdev));
 case BLKRASET:
 case BLKFRASET:
 if(!capable(CAP_SYS_ADMIN))
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index c1b36ffd19ce..b84163d1f851 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -47,9 +47,8 @@ enum {
 * asynchronous requests, we reserve 25% of requests for synchronous
 * operations.
 */
- KYBER_ASYNC_PERCENT = 75,
+ KYBER_DEFAULT_ASYNC_PERCENT = 75,
};
-
/*
 * Maximum device-wide depth for each scheduling domain.
 *
@@ -157,9 +156,6 @@ struct kyber_queue_data {
 */
 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
- /* Number of allowed async requests. */
- unsigned int async_depth;
-
 struct kyber_cpu_latency __percpu *cpu_latency;
 /* Timer for stats aggregation and adjusting domain tokens.
*/ @@ -401,10 +397,7 @@ err: static void kyber_depth_updated(struct request_queue *q) { - struct kyber_queue_data *kqd = q->elevator->elevator_data; - - kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U; - blk_mq_set_min_shallow_depth(q, kqd->async_depth); + blk_mq_set_min_shallow_depth(q, q->async_depth); } static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) @@ -414,6 +407,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; + q->async_depth = q->nr_requests * KYBER_DEFAULT_ASYNC_PERCENT / 100; kyber_depth_updated(q); return 0; @@ -552,15 +546,8 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd, static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { - /* - * We use the scheduler tags as per-hardware queue queueing tokens. - * Async requests can be limited at this stage. - */ - if (!op_is_sync(opf)) { - struct kyber_queue_data *kqd = data->q->elevator->elevator_data; - - data->shallow_depth = kqd->async_depth; - } + if (!blk_mq_is_sync_read(opf)) + data->shallow_depth = data->q->async_depth; } static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, @@ -956,15 +943,6 @@ KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard) KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other) #undef KYBER_DEBUGFS_DOMAIN_ATTRS -static int kyber_async_depth_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct kyber_queue_data *kqd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", kqd->async_depth); - return 0; -} - static int kyber_cur_domain_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -990,7 +968,6 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { KYBER_QUEUE_DOMAIN_ATTRS(write), KYBER_QUEUE_DOMAIN_ATTRS(discard), KYBER_QUEUE_DOMAIN_ATTRS(other), - {"async_depth", 0400, kyber_async_depth_show}, {}, }; #undef KYBER_QUEUE_DOMAIN_ATTRS diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 3e3719093aec..95917a88976f 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -98,7 +98,6 @@ struct deadline_data { int fifo_batch; int writes_starved; int front_merges; - u32 async_depth; int prio_aging_expire; spinlock_t lock; @@ -486,32 +485,16 @@ unlock: return rq; } -/* - * Called by __blk_mq_alloc_request(). The shallow_depth value set by this - * function is used by __blk_mq_get_tag(). - */ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { - struct deadline_data *dd = data->q->elevator->elevator_data; - - /* Do not throttle synchronous reads. */ - if (op_is_sync(opf) && !op_is_write(opf)) - return; - - /* - * Throttle asynchronous requests and writes such that these requests - * do not block the allocation of synchronous requests. - */ - data->shallow_depth = dd->async_depth; + if (!blk_mq_is_sync_read(opf)) + data->shallow_depth = data->q->async_depth; } -/* Called by blk_mq_update_nr_requests(). */ +/* Called by blk_mq_init_sched() and blk_mq_update_nr_requests(). 
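+ * Propagating q->async_depth as the minimum shallow depth keeps the tag
+ * sbitmap consistent with the limit that dd_limit_depth() applies: only
+ * sync reads (op_is_sync() && !op_is_write()) escape throttling there,
+ * while sync writes and all async I/O are capped at q->async_depth so
+ * they cannot monopolize scheduler tags.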
*/ static void dd_depth_updated(struct request_queue *q) { - struct deadline_data *dd = q->elevator->elevator_data; - - dd->async_depth = q->nr_requests; - blk_mq_set_min_shallow_depth(q, 1); + blk_mq_set_min_shallow_depth(q, q->async_depth); } static void dd_exit_sched(struct elevator_queue *e) @@ -576,6 +559,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; + q->async_depth = q->nr_requests; dd_depth_updated(q); return 0; } @@ -763,7 +747,6 @@ SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); -SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES @@ -793,7 +776,6 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MA STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); -STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT @@ -807,7 +789,6 @@ static const struct elv_fs_entry deadline_attrs[] = { DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), - DD_ATTR(async_depth), DD_ATTR(fifo_batch), DD_ATTR(prio_aging_expire), __ATTR_NULL @@ -894,15 +875,6 @@ static int deadline_starved_show(void *data, struct seq_file *m) return 0; } -static int dd_async_depth_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->async_depth); - return 0; -} - static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; @@ -1002,7 +974,6 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, - {"async_depth", 0400, dd_async_depth_show}, {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, diff --git a/block/partitions/core.c b/block/partitions/core.c index 815ed33caa1b..079057ab535a 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -130,7 +131,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) state->pp_buf[0] = '\0'; state->disk = hd; - snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); + strscpy(state->name, hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); diff --git a/block/sed-opal.c b/block/sed-opal.c index 5a28f23f7f22..23a19c92d791 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -2940,7 +2940,8 @@ static int opal_activate_lsp(struct opal_dev *dev, }; int ret; - if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) + if (opal_lr_act->sum && + (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS)) return -EINVAL; ret = opal_get_key(dev, &opal_lr_act->key); diff --git 
a/drivers/block/brd.c b/drivers/block/brd.c index 9778259b30d4..a5104cf96609 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -247,8 +247,7 @@ MODULE_ALIAS("rd"); /* Legacy boot options - nonmodular */ static int __init ramdisk_size(char *str) { - rd_size = simple_strtol(str, NULL, 0); - return 1; + return kstrtoul(str, 0, &rd_size) == 0; } __setup("ramdisk_size=", ramdisk_size); #endif diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 32a3a5b13802..98789a5297f2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -969,7 +969,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) lim->features |= BLK_FEAT_WRITE_CACHE; - if (backing_bdev && !bdev_nonrot(backing_bdev)) + if (backing_bdev && bdev_rot(backing_bdev)) lim->features |= BLK_FEAT_ROTATIONAL; lim->max_hw_discard_sectors = max_discard_sectors; lim->max_write_zeroes_sectors = max_discard_sectors; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 4c0632ab4e1b..740a8ac42075 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -642,7 +642,7 @@ static void nullb_device_release(struct config_item *item) null_free_dev(dev); } -static struct configfs_item_operations nullb_device_ops = { +static const struct configfs_item_operations nullb_device_ops = { .release = nullb_device_release, }; @@ -749,7 +749,7 @@ static struct configfs_attribute *nullb_group_attrs[] = { NULL, }; -static struct configfs_group_operations nullb_group_ops = { +static const struct configfs_group_operations nullb_group_ops = { .make_group = nullb_group_make_group, .drop_item = nullb_group_drop_item, }; diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 6ea7c12e3a87..144aea1466a4 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -475,9 +475,17 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) } } +static void rnbd_dev_release(struct kobject *kobj) +{ + struct rnbd_clt_dev *dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + kfree(dev); +} + static const struct kobj_type rnbd_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = rnbd_dev_groups, + .release = rnbd_dev_release, }; static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index d1c354636315..757df2896aeb 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -60,7 +60,9 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); mutex_destroy(&dev->lock); - kfree(dev); + + if (dev->kobj.state_initialized) + kobject_put(&dev->kobj); } static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) @@ -1517,7 +1519,7 @@ static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev) return found; } -static void delete_dev(struct rnbd_clt_dev *dev) +static void rnbd_delete_dev(struct rnbd_clt_dev *dev) { struct rnbd_clt_session *sess = dev->sess; @@ -1638,7 +1640,7 @@ put_iu: kfree(rsp); rnbd_put_iu(sess, iu); del_dev: - delete_dev(dev); + rnbd_delete_dev(dev); put_dev: rnbd_clt_put_dev(dev); put_sess: @@ -1647,13 +1649,13 @@ put_sess: return ERR_PTR(ret); } -static void destroy_gen_disk(struct rnbd_clt_dev *dev) +static void rnbd_destroy_gen_disk(struct rnbd_clt_dev *dev) { del_gendisk(dev->gd); 
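 /* dev itself is freed later by rnbd_dev_release() via the final kobject_put() */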
put_disk(dev->gd); } -static void destroy_sysfs(struct rnbd_clt_dev *dev, +static void rnbd_destroy_sysfs(struct rnbd_clt_dev *dev, const struct attribute *sysfs_self) { rnbd_clt_remove_dev_symlink(dev); @@ -1662,7 +1664,6 @@ static void destroy_sysfs(struct rnbd_clt_dev *dev, /* To avoid deadlock firstly remove itself */ sysfs_remove_file_self(&dev->kobj, sysfs_self); kobject_del(&dev->kobj); - kobject_put(&dev->kobj); } } @@ -1691,9 +1692,9 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, dev->dev_state = DEV_STATE_UNMAPPED; mutex_unlock(&dev->lock); - delete_dev(dev); - destroy_sysfs(dev, sysfs_self); - destroy_gen_disk(dev); + rnbd_delete_dev(dev); + rnbd_destroy_sysfs(dev, sysfs_self); + rnbd_destroy_gen_disk(dev); if (was_mapped && sess->rtrs) send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT); diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index 77360c2a6069..64f1cfe9f8ef 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -18,7 +18,7 @@ #include #define RNBD_PROTO_VER_MAJOR 2 -#define RNBD_PROTO_VER_MINOR 0 +#define RNBD_PROTO_VER_MINOR 2 /* The default port number the RTRS server is listening on. */ #define RTRS_PORT 1234 @@ -197,6 +197,8 @@ struct rnbd_msg_io { * * @RNBD_F_SYNC: request is sync (sync write or read) * @RNBD_F_FUA: forced unit access + * @RNBD_F_PREFLUSH: request for cache flush + * @RNBD_F_NOUNMAP: do not free blocks when zeroing */ enum rnbd_io_flags { @@ -211,6 +213,8 @@ enum rnbd_io_flags { /* Flags */ RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0), RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1), + RNBD_F_PREFLUSH = 1<<(RNBD_OP_BITS + 2), + RNBD_F_NOUNMAP = 1<<(RNBD_OP_BITS + 3) }; static inline u32 rnbd_op(u32 flags) @@ -245,6 +249,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) break; case RNBD_OP_WRITE_ZEROES: bio_opf = REQ_OP_WRITE_ZEROES; + + if (rnbd_opf & RNBD_F_NOUNMAP) + bio_opf |= REQ_NOUNMAP; break; default: WARN(1, "Unknown RNBD type: %d (flags %d)\n", @@ -258,6 +265,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) if (rnbd_opf & RNBD_F_FUA) bio_opf |= REQ_FUA; + if (rnbd_opf & RNBD_F_PREFLUSH) + bio_opf |= REQ_PREFLUSH; + return bio_opf; } @@ -280,6 +290,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) break; case REQ_OP_WRITE_ZEROES: rnbd_opf = RNBD_OP_WRITE_ZEROES; + + if (rq->cmd_flags & REQ_NOUNMAP) + rnbd_opf |= RNBD_F_NOUNMAP; break; case REQ_OP_FLUSH: rnbd_opf = RNBD_OP_FLUSH; @@ -297,6 +310,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) if (op_is_flush(rq->cmd_flags)) rnbd_opf |= RNBD_F_FUA; + if (rq->cmd_flags & REQ_PREFLUSH) + rnbd_opf |= RNBD_F_PREFLUSH; + return rnbd_opf; } diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h index 89d0bcb17195..18ae2ed5537a 100644 --- a/drivers/block/rnbd/rnbd-srv-trace.h +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -44,24 +44,6 @@ DEFINE_EVENT(rnbd_srv_link_class, name, \ DEFINE_LINK_EVENT(create_sess); DEFINE_LINK_EVENT(destroy_sess); -TRACE_DEFINE_ENUM(RNBD_OP_READ); -TRACE_DEFINE_ENUM(RNBD_OP_WRITE); -TRACE_DEFINE_ENUM(RNBD_OP_FLUSH); -TRACE_DEFINE_ENUM(RNBD_OP_DISCARD); -TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE); -TRACE_DEFINE_ENUM(RNBD_F_SYNC); -TRACE_DEFINE_ENUM(RNBD_F_FUA); - -#define show_rnbd_rw_flags(x) \ - __print_flags(x, "|", \ - { RNBD_OP_READ, "READ" }, \ - { RNBD_OP_WRITE, "WRITE" }, \ - { RNBD_OP_FLUSH, "FLUSH" }, \ - { RNBD_OP_DISCARD, "DISCARD" }, \ - { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \ - { RNBD_F_SYNC, "SYNC" }, \ - { RNBD_F_FUA, 
"FUA" }) - TRACE_EVENT(process_rdma, TP_PROTO(struct rnbd_srv_session *srv, const struct rnbd_msg_io *msg, @@ -97,7 +79,7 @@ TRACE_EVENT(process_rdma, __entry->usrlen = usrlen; ), - TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu", + TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %u, ioprio: %d, datalen: %u, usrlen: %zu", __get_str(sessname), __print_symbolic(__entry->dir, { READ, "READ" }, @@ -106,7 +88,7 @@ TRACE_EVENT(process_rdma, __entry->device_id, __entry->sector, __entry->bi_size, - show_rnbd_rw_flags(__entry->flags), + __entry->flags, __entry->ioprio, __entry->datalen, __entry->usrlen diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 2df8941a6b14..7eeb321d6140 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -145,18 +145,30 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1, + bio = bio_alloc(file_bdev(sess_dev->bdev_file), !!datalen, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); - bio_add_virt_nofail(bio, data, datalen); - - bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); - if (bio_has_data(bio) && - bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { - rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", - bio->bi_iter.bi_size, msg->bi_size); - err = -EINVAL; - goto bio_put; + if (unlikely(!bio)) { + err = -ENOMEM; + goto put_sess_dev; } + + if (!datalen) { + /* + * For special requests like DISCARD and WRITE_ZEROES, the datalen is zero. + */ + bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); + } else { + bio_add_virt_nofail(bio, data, datalen); + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + if (bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { + rnbd_srv_err_rl(sess_dev, + "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", + bio->bi_iter.bi_size, msg->bi_size); + err = -EINVAL; + goto bio_put; + } + } + bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); @@ -170,6 +182,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio_put: bio_put(bio); +put_sess_dev: rnbd_put_sess_dev(sess_dev); err: kfree(priv); @@ -538,6 +551,8 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, { struct block_device *bdev = file_bdev(sess_dev->bdev_file); + memset(rsp, 0, sizeof(*rsp)); + rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); rsp->device_id = cpu_to_le32(sess_dev->device_id); rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev)); @@ -644,6 +659,7 @@ static void process_msg_sess_info(struct rnbd_srv_session *srv_sess, trace_process_msg_sess_info(srv_sess, sess_info_msg); + memset(rsp, 0, sizeof(*rsp)); rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->ver = srv_sess->ver; } diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs index 6713a6d92391..158f38bbbb8b 100644 --- a/drivers/block/rnull/configfs.rs +++ b/drivers/block/rnull/configfs.rs @@ -13,7 +13,6 @@ use kernel::{ str::{kstrtobool_bytes, CString}, sync::Mutex, }; -use pin_init::PinInit; pub(crate) fn subsystem() -> impl PinInit, Error> { let item_type = configfs_attrs! 
{ @@ -25,7 +24,7 @@ pub(crate) fn subsystem() -> impl PinInit, E ], }; - kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {})) + kernel::configfs::Subsystem::new(c"rnull", item_type, try_pin_init!(Config {})) } #[pin_data] diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs index a9d5e575a2c4..0ca8715febe8 100644 --- a/drivers/block/rnull/rnull.rs +++ b/drivers/block/rnull/rnull.rs @@ -14,12 +14,9 @@ use kernel::{ Operations, TagSet, }, }, - error::Result, - pr_info, prelude::*, sync::{aref::ARef, Arc}, }; -use pin_init::PinInit; module! { type: NullBlkModule, diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index cd1e84653002..c13cda58a7c6 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -44,6 +44,9 @@ #include #include #include +#include +#include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -54,6 +57,7 @@ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) +#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -73,7 +77,11 @@ | UBLK_F_AUTO_BUF_REG \ | UBLK_F_QUIESCE \ | UBLK_F_PER_IO_DAEMON \ - | UBLK_F_BUF_REG_OFF_DAEMON) + | UBLK_F_BUF_REG_OFF_DAEMON \ + | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ + | UBLK_F_SAFE_STOP_DEV \ + | UBLK_F_BATCH_IO \ + | UBLK_F_NO_AUTO_PART_SCAN) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -83,7 +91,20 @@ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ - UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ + UBLK_PARAM_TYPE_INTEGRITY) + +#define UBLK_BATCH_F_ALL \ + (UBLK_BATCH_F_HAS_ZONE_LBA | \ + UBLK_BATCH_F_HAS_BUF_ADDR | \ + UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) + +/* ublk batch fetch uring_cmd */ +struct ublk_batch_fetch_cmd { + struct list_head node; + struct io_uring_cmd *cmd; + unsigned short buf_group; +}; struct ublk_uring_cmd_pdu { /* @@ -105,7 +126,18 @@ struct ublk_uring_cmd_pdu { */ struct ublk_queue *ubq; - u16 tag; + union { + u16 tag; + struct ublk_batch_fetch_cmd *fcmd; /* batch io only */ + }; +}; + +struct ublk_batch_io_data { + struct ublk_device *ub; + struct io_uring_cmd *cmd; + struct ublk_batch_io header; + unsigned int issue_flags; + struct io_comp_batch *iob; }; /* @@ -155,6 +187,9 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) +/* used for UBLK_F_BATCH_IO only */ +#define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1) + union ublk_io_buf { __u64 addr; struct ublk_auto_buf_reg auto_reg; @@ -179,7 +214,7 @@ struct ublk_io { * if user copy or zero copy are enabled: * - UBLK_REFCOUNT_INIT from dispatch to the server * until UBLK_IO_COMMIT_AND_FETCH_REQ - * - 1 for each inflight ublk_ch_{read,write}_iter() call + * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task * - 1 for each io_uring registered buffer not registered on task * The I/O can only be completed once all references are dropped. 
* User copy and buffer registration operations are only permitted
@@ -190,6 +225,7 @@
 unsigned task_registered_buffers;
 void *buf_ctx_handle;
+ spinlock_t lock;
} ____cacheline_aligned_in_smp;
struct ublk_queue {
@@ -204,6 +240,52 @@ struct ublk_queue {
 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
 spinlock_t cancel_lock;
 struct ublk_device *dev;
+ u32 nr_io_ready;
+
+ /*
+ * For supporting UBLK_F_BATCH_IO only.
+ *
+ * Each inflight ublk request tag is saved in this fifo.
+ *
+ * There are multiple writers (ublk_queue_rq() or ublk_queue_rqs()),
+ * so a lock is required for storing request tags into the fifo.
+ *
+ * There is just one reader, the task work function that forwards
+ * requests to the ublk server, so no lock is needed on the reader
+ * side.
+ *
+ * Batch I/O State Management:
+ *
+ * The batch I/O system uses implicit state management based on the
+ * combination of three key variables below.
+ *
+ * - IDLE: list_empty(&fcmd_head) && !active_fcmd
+ * No fetch commands available, events queue in evts_fifo
+ *
+ * - READY: !list_empty(&fcmd_head) && !active_fcmd
+ * Fetch commands available but none processing events
+ *
+ * - ACTIVE: active_fcmd
+ * One fetch command actively processing events from evts_fifo
+ *
+ * Key Invariants:
+ * - At most one active_fcmd at any time (single reader)
+ * - active_fcmd is always from fcmd_head list when non-NULL
+ * - evts_fifo can be read locklessly by the single active reader
+ * - All state transitions require evts_lock protection
+ * - Multiple writers to evts_fifo require lock protection
+ */
+ struct {
+ DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
+ spinlock_t evts_lock;
+
+ /* List of fetch commands available to process events */
+ struct list_head fcmd_head;
+
+ /* Currently active fetch command (NULL = none active) */
+ struct ublk_batch_fetch_cmd *active_fcmd;
+ } ____cacheline_aligned_in_smp;
+
 struct ublk_io ios[] __counted_by(q_depth);
};
@@ -231,7 +313,7 @@ struct ublk_device {
 struct ublk_params params;
 struct completion completion;
- u32 nr_io_ready;
+ u32 nr_queue_ready;
 bool unprivileged_daemons;
 struct mutex cancel_mutex;
 bool canceling;
@@ -239,6 +321,8 @@ struct ublk_device {
 struct delayed_work exit_work;
 struct work_struct partition_scan_work;
+ bool block_open; /* protected by open_mutex */
+
 struct ublk_queue *queues[];
};
@@ -252,8 +336,51 @@ static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
+ u16 q_id, u16 tag, struct ublk_io *io);
static inline unsigned int ublk_req_build_flags(struct request *req);
+static void ublk_batch_dispatch(struct ublk_queue *ubq,
+ const struct ublk_batch_io_data *data,
+ struct ublk_batch_fetch_cmd *fcmd);
+
+static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_BATCH_IO;
+}
+
+static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_BATCH_IO;
+}
+
+static inline void ublk_io_lock(struct ublk_io *io)
+{
+ spin_lock(&io->lock);
+}
+
+static inline void ublk_io_unlock(struct ublk_io *io)
+{
+ spin_unlock(&io->lock);
+}
+
+/* Initialize the event queue */
+static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
+ int numa_node)
+{
+ spin_lock_init(&q->evts_lock);
+ return
kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node); +} + +/* Check if event queue is empty */ +static inline bool ublk_io_evts_empty(const struct ublk_queue *q) +{ + return kfifo_is_empty(&q->evts_fifo); +} + +static inline void ublk_io_evts_deinit(struct ublk_queue *q) +{ + WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo)); + kfifo_free(&q->evts_fifo); +} static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) @@ -261,6 +388,36 @@ ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) return &ubq->io_cmd_buf[tag]; } +static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_ZONED; @@ -271,6 +428,11 @@ static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_ZONED; } +static inline bool ublk_dev_support_integrity(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_INTEGRITY; +} + #ifdef CONFIG_BLK_DEV_ZONED struct ublk_zoned_report_desc { @@ -532,7 +694,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, - bool need_map); + bool need_map, struct io_comp_batch *iob); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -545,6 +707,64 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +static struct ublk_batch_fetch_cmd * +ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd) +{ + struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO); + + if (fcmd) { + fcmd->cmd = cmd; + fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index); + } + return fcmd; +} + +static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd) +{ + kfree(fcmd); +} + +static void __ublk_release_fcmd(struct ublk_queue *ubq) +{ + WRITE_ONCE(ubq->active_fcmd, NULL); +} + +/* + * Nothing can move on, so clear ->active_fcmd, and the caller should stop + * dispatching + */ +static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd, + int res) +{ + spin_lock(&ubq->evts_lock); + list_del_init(&fcmd->node); + WARN_ON_ONCE(fcmd != ubq->active_fcmd); + __ublk_release_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); + ublk_batch_free_fcmd(fcmd); +} + +static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd, + struct io_br_sel *sel, + unsigned int issue_flags) +{ + if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags)) + return -ENOBUFS; + return 0; +} + +static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd, + void __user *buf, const u16 *tag_buf, + 
unsigned int len) +{ + if (copy_to_user(buf, tag_buf, len)) + return -EFAULT; + return len; +} #define UBLK_MAX_UBLKS UBLK_MINORS @@ -586,6 +806,53 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) set_capacity(ub->ub_disk, p->dev_sectors); } +static int ublk_integrity_flags(u32 flags) +{ + int ret_flags = 0; + + if (flags & LBMD_PI_CAP_INTEGRITY) { + flags &= ~LBMD_PI_CAP_INTEGRITY; + ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + } + if (flags & LBMD_PI_CAP_REFTAG) { + flags &= ~LBMD_PI_CAP_REFTAG; + ret_flags |= BLK_INTEGRITY_REF_TAG; + } + return flags ? -EINVAL : ret_flags; +} + +static int ublk_integrity_pi_tuple_size(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return 0; + case LBMD_PI_CSUM_IP: + case LBMD_PI_CSUM_CRC16_T10DIF: + return 8; + case LBMD_PI_CSUM_CRC64_NVME: + return 16; + default: + return -EINVAL; + } +} + +static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return BLK_INTEGRITY_CSUM_NONE; + case LBMD_PI_CSUM_IP: + return BLK_INTEGRITY_CSUM_IP; + case LBMD_PI_CSUM_CRC16_T10DIF: + return BLK_INTEGRITY_CSUM_CRC; + case LBMD_PI_CSUM_CRC64_NVME: + return BLK_INTEGRITY_CSUM_CRC64; + default: + WARN_ON_ONCE(1); + return BLK_INTEGRITY_CSUM_NONE; + } +} + static int ublk_validate_params(const struct ublk_device *ub) { /* basic param is the only one which must be set */ @@ -648,6 +915,29 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { + const struct ublk_param_integrity *p = &ub->params.integrity; + int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); + int flags = ublk_integrity_flags(p->flags); + + if (!ublk_dev_support_integrity(ub)) + return -EINVAL; + if (flags < 0) + return flags; + if (pi_tuple_size < 0) + return pi_tuple_size; + if (!p->metadata_size) + return -EINVAL; + if (p->csum_type == LBMD_PI_CSUM_NONE && + p->flags & LBMD_PI_CAP_REFTAG) + return -EINVAL; + if (p->pi_offset + pi_tuple_size > p->metadata_size) + return -EINVAL; + if (p->interval_exp < SECTOR_SHIFT || + p->interval_exp > ub->params.basic.logical_bs_shift) + return -EINVAL; + } + return 0; } @@ -659,36 +949,6 @@ static void ublk_apply_params(struct ublk_device *ub) ublk_dev_param_zoned_apply(ub); } -static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; -} - -static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; -} - -static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_AUTO_BUF_REG; -} - -static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; -} - -static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_USER_COPY; -} - -static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_USER_COPY; -} - static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && @@ -726,6 +986,95 @@ static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub) ublk_dev_support_auto_buf_reg(ub); } +/* + * ublk IO Reference Counting Design + * ================================== + * + * For user-copy and zero-copy modes, ublk uses a split reference model 
with + * two counters that together track IO lifetime: + * + * - io->ref: refcount for off-task buffer registrations and user-copy ops + * - io->task_registered_buffers: count of buffers registered on the IO task + * + * Key Invariant: + * -------------- + * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set), + * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT + * when no active references exist. After IO completion, both counters become + * zero. For I/Os not currently dispatched to the ublk server, both ref and + * task_registered_buffers are 0. + * + * This invariant is checked by ublk_check_and_reset_active_ref() during daemon + * exit to determine if all references have been released. + * + * Why Split Counters: + * ------------------- + * Buffers registered on the IO daemon task can use the lightweight + * task_registered_buffers counter (simple increment/decrement) instead of + * atomic refcount operations. The ublk_io_release() callback checks if + * current == io->task to decide which counter to update. + * + * This optimization only applies before IO completion. At completion, + * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref. + * After that, all subsequent buffer unregistrations must use the atomic ref + * since they may be releasing the last reference. + * + * Reference Lifecycle: + * -------------------- + * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch + * + * 2. During IO processing: + * - On-task buffer reg: task_registered_buffers++ (no ref change) + * - Off-task buffer reg: ref++ via ublk_get_req_ref() + * - Buffer unregister callback (ublk_io_release): + * * If on-task: task_registered_buffers-- + * * If off-task: ref-- via ublk_put_req_ref() + * + * 3. ublk_sub_req_ref() at IO completion: + * - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers + * - Subtracts sub_refs from ref and zeroes task_registered_buffers + * - This effectively collapses task_registered_buffers into the atomic ref, + * accounting for the initial UBLK_REFCOUNT_INIT minus any on-task + * buffers that were already counted + * + * Example (zero-copy, register on-task, unregister off-task): + * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0 + * - Register buffer on-task: task_registered_buffers = 1 + * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1 + * - Completion via ublk_sub_req_ref(): + * sub_refs = UBLK_REFCOUNT_INIT - 1, + * ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0 + * + * Example (auto buffer registration): + * Auto buffer registration sets task_registered_buffers = 1 at dispatch. + * + * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1 + * - Buffer unregister: task_registered_buffers-- (becomes 0) + * - Completion via ublk_sub_req_ref(): + * sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0 + * + * Example (zero-copy, ublk server killed): + * When daemon is killed, io_uring cleanup unregisters buffers off-task. + * ublk_check_and_reset_active_ref() waits for the invariant to hold. 
+ *
+ * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
+ * - Register buffer on-task: task_registered_buffers = 1
+ * - Daemon killed, io_uring cleanup unregisters buffer (off-task):
+ * ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
+ * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
+ * - Sum equals UBLK_REFCOUNT_INIT, so both counters are then zeroed by
+ * ublk_check_and_reset_active_ref() and ublk_abort_queue() can proceed
+ * to abort pending requests
+ *
+ * Batch IO Special Case:
+ * ----------------------
+ * In batch IO mode, io->task is NULL. This means ublk_io_release() always
+ * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
+ * task_registered_buffers counter still tracks registered buffers for the
+ * invariant check, even though the callback doesn't decrement it.
+ *
+ * Note: updating task_registered_buffers is protected by io->lock.
+ */
 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
 struct ublk_io *io)
{
@@ -744,7 +1093,7 @@ static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
 return;
 /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
- __ublk_complete_rq(req, io, false);
+ __ublk_complete_rq(req, io, false, NULL);
}
static inline bool ublk_sub_req_ref(struct ublk_io *io)
@@ -905,6 +1254,9 @@ static int ublk_open(struct gendisk *disk, blk_mode_t mode)
 return -EPERM;
 }
+ if (ub->block_open)
+ return -ENXIO;
+
 return 0;
}
@@ -915,6 +1267,35 @@ static const struct block_device_operations ub_fops = {
 .report_zones = ublk_report_zones,
};
+static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
+ struct iov_iter *uiter, int dir, size_t *done)
+{
+ unsigned len;
+ void *bv_buf;
+ size_t copied;
+
+ if (*offset >= bv->bv_len) {
+ *offset -= bv->bv_len;
+ return true;
+ }
+
+ len = bv->bv_len - *offset;
+ bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
+ if (dir == ITER_DEST)
+ copied = copy_to_iter(bv_buf, len, uiter);
+ else
+ copied = copy_from_iter(bv_buf, len, uiter);
+
+ kunmap_local(bv_buf);
+
+ *done += copied;
+ if (copied < len)
+ return false;
+
+ *offset = 0;
+ return true;
+}
+
/*
 * Copy data between request pages and io_iter, and 'offset'
 * is the start point of linear offset of request.
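 * For example, with 4096-byte bvecs and offset 5000, ublk_copy_user_bvec()
 * skips the first bvec (offset becomes 904) and copying starts 904 bytes
 * into the second bvec, with 'done' accumulating the bytes copied.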
@@ -927,33 +1308,39 @@ static size_t ublk_copy_user_pages(const struct request *req, size_t done = 0; rq_for_each_segment(bv, req, iter) { - unsigned len; - void *bv_buf; - size_t copied; - - if (offset >= bv.bv_len) { - offset -= bv.bv_len; - continue; - } - - len = bv.bv_len - offset; - bv_buf = kmap_local_page(bv.bv_page) + bv.bv_offset + offset; - if (dir == ITER_DEST) - copied = copy_to_iter(bv_buf, len, uiter); - else - copied = copy_from_iter(bv_buf, len, uiter); - - kunmap_local(bv_buf); - - done += copied; - if (copied < len) + if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done)) break; - - offset = 0; } return done; } +#ifdef CONFIG_BLK_DEV_INTEGRITY +static size_t ublk_copy_user_integrity(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) +{ + size_t done = 0; + struct bio *bio = req->bio; + struct bvec_iter iter; + struct bio_vec iv; + + if (!blk_integrity_rq(req)) + return 0; + + bio_for_each_integrity_vec(iv, bio, iter) { + if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done)) + break; + } + + return done; +} +#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ +static size_t ublk_copy_user_integrity(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) +{ + return 0; +} +#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ + static inline bool ublk_need_map_req(const struct request *req) { return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; @@ -1035,6 +1422,9 @@ static inline unsigned int ublk_req_build_flags(struct request *req) if (req->cmd_flags & REQ_SWAP) flags |= UBLK_IO_F_SWAP; + if (blk_integrity_rq(req)) + flags |= UBLK_IO_F_INTEGRITY; + return flags; } @@ -1090,7 +1480,7 @@ static void ublk_end_request(struct request *req, blk_status_t error) /* todo: handle partial completion */ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, - bool need_map) + bool need_map, struct io_comp_batch *iob) { unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; @@ -1144,8 +1534,11 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, local_bh_enable(); if (requeue) blk_mq_requeue_request(req, true); - else if (likely(!blk_should_fake_timeout(req->q))) + else if (likely(!blk_should_fake_timeout(req->q))) { + if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch)) + return; __blk_mq_end_request(req, BLK_STS_OK); + } return; exit: @@ -1206,10 +1599,16 @@ enum auto_buf_reg_res { AUTO_BUF_REG_OK, }; -static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq, - struct request *req, struct ublk_io *io, - struct io_uring_cmd *cmd, - enum auto_buf_reg_res res) +/* + * Setup io state after auto buffer registration. + * + * Must be called after ublk_auto_buf_register() is done. + * Caller must hold io->lock in batch context. + */ +static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, + enum auto_buf_reg_res res) { if (res == AUTO_BUF_REG_OK) { io->task_registered_buffers = 1; @@ -1220,8 +1619,9 @@ static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq, __ublk_prep_compl_io_cmd(io, req); } +/* Register request bvec to io_uring for auto buffer registration. 
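+ *
+ * Returns AUTO_BUF_REG_OK when the bvec is registered, AUTO_BUF_REG_FALLBACK
+ * when registration could not be done but the server allows falling back to
+ * buffer copy, or AUTO_BUF_REG_FAIL otherwise.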
*/ static enum auto_buf_reg_res -__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, +ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req, struct ublk_io *io, struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -1241,15 +1641,21 @@ __ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, return AUTO_BUF_REG_OK; } -static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, - struct ublk_io *io, struct io_uring_cmd *cmd, - unsigned int issue_flags) +/* + * Dispatch IO to userspace with auto buffer registration. + * + * Only called in non-batch context from task work, io->lock not held. + */ +static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, + unsigned int issue_flags) { - enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd, issue_flags); if (res != AUTO_BUF_REG_FAIL) { - ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + ublk_auto_buf_io_setup(ubq, req, io, cmd, res); io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags); } } @@ -1324,13 +1730,247 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) return; if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { - ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags); + ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags); } else { ublk_init_req_ref(ubq, io); ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); } } +static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short tag) +{ + struct ublk_device *ub = data->ub; + struct ublk_io *io = &ubq->ios[tag]; + struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK; + struct io_uring_cmd *cmd = data->cmd; + + if (!ublk_start_io(ubq, req, io)) + return false; + + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { + res = ublk_auto_buf_register(ubq, req, io, cmd, + data->issue_flags); + + if (res == AUTO_BUF_REG_FAIL) + return false; + } + + ublk_io_lock(io); + ublk_auto_buf_io_setup(ubq, req, io, cmd, res); + ublk_io_unlock(io); + + return true; +} + +static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short *tag_buf, + unsigned int len) +{ + bool has_unused = false; + unsigned int i; + + for (i = 0; i < len; i++) { + unsigned short tag = tag_buf[i]; + + if (!__ublk_batch_prep_dispatch(ubq, data, tag)) { + tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG; + has_unused = true; + } + } + + return has_unused; +} + +/* + * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf. + * Returns the new length after filtering. 
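+ * E.g. a batch {3, UNUSED, 7, UNUSED, 9} is compacted in place to
+ * {3, 7, 9} and 3 is returned.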
+ */ +static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, + unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; i < len; i++) { + if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) { + if (i != j) + tag_buf[j] = tag_buf[i]; + j++; + } + } + + return j; +} + +#define MAX_NR_TAG 128 +static int __ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + const unsigned int tag_sz = sizeof(unsigned short); + unsigned short tag_buf[MAX_NR_TAG]; + struct io_br_sel sel; + size_t len = 0; + bool needs_filter; + int ret; + + WARN_ON_ONCE(data->cmd != fcmd->cmd); + + sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len, + data->issue_flags); + if (sel.val < 0) + return sel.val; + if (!sel.addr) + return -ENOBUFS; + + /* single reader needn't lock and sizeof(kfifo element) is 2 bytes */ + len = min(len, sizeof(tag_buf)) / tag_sz; + len = kfifo_out(&ubq->evts_fifo, tag_buf, len); + + needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len); + /* Filter out unused tags before posting to userspace */ + if (unlikely(needs_filter)) { + int new_len = ublk_filter_unused_tags(tag_buf, len); + + /* return actual length if all tags failed or were requeued */ + if (!new_len) { + /* release the selected buffer */ + sel.val = 0; + WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd, + &sel, data->issue_flags)); + return len; + } + len = new_len; + } + + sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz); + ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags); + if (unlikely(ret < 0)) { + int i, res; + + /* + * Undo prep state for all IOs since userspace never received them. + * This restores IOs to pre-prepared state so they can be cleanly + * re-prepared when tags are pulled from FIFO again. + */ + for (i = 0; i < len; i++) { + struct ublk_io *io = &ubq->ios[tag_buf[i]]; + int index = -1; + + ublk_io_lock(io); + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) + index = io->buf.auto_reg.index; + io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); + io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_io_unlock(io); + + if (index != -1) + io_buffer_unregister_bvec(data->cmd, index, + data->issue_flags); + } + + res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, + tag_buf, len, &ubq->evts_lock); + + pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " + "tags(%d %zu) ret %d\n", __func__, res, len, + ret); + } + return ret; +} + +static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd( + struct ublk_queue *ubq) +{ + struct ublk_batch_fetch_cmd *fcmd; + + lockdep_assert_held(&ubq->evts_lock); + + /* + * Order updating ubq->evts_fifo and checking ubq->active_fcmd. + * + * The pair is the smp_mb() in ublk_batch_dispatch(). + * + * If ubq->active_fcmd is observed as non-NULL, the newly added tags + * can be visible in ublk_batch_dispatch() with the barrier pairing.
+ */ + smp_mb(); + if (READ_ONCE(ubq->active_fcmd)) { + fcmd = NULL; + } else { + fcmd = list_first_entry_or_null(&ubq->fcmd_head, + struct ublk_batch_fetch_cmd, node); + WRITE_ONCE(ubq->active_fcmd, fcmd); + } + return fcmd; +} + +static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) +{ + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_batch_io_data data = { + .ub = pdu->ubq->dev, + .cmd = fcmd->cmd, + .issue_flags = issue_flags, + }; + + WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd); + + ublk_batch_dispatch(pdu->ubq, &data, fcmd); +} + +static void +ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + struct ublk_batch_fetch_cmd *new_fcmd; + unsigned tried = 0; + int ret = 0; + +again: + while (!ublk_io_evts_empty(ubq)) { + ret = __ublk_batch_dispatch(ubq, data, fcmd); + if (ret <= 0) + break; + } + + if (ret < 0) { + ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret); + return; + } + + __ublk_release_fcmd(ubq); + /* + * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and + * checking ubq->evts_fifo. + * + * The pair is the smp_mb() in __ublk_acquire_fcmd(). + */ + smp_mb(); + if (likely(ublk_io_evts_empty(ubq))) + return; + + spin_lock(&ubq->evts_lock); + new_fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (!new_fcmd) + return; + + /* Avoid lockup by allowing to handle at most 32 batches */ + if (new_fcmd == fcmd && tried++ < 32) + goto again; + + io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); @@ -1340,6 +1980,21 @@ static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) ublk_dispatch_req(ubq, pdu->req); } +static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last) +{ + unsigned short tag = rq->tag; + struct ublk_batch_fetch_cmd *fcmd = NULL; + + spin_lock(&ubq->evts_lock); + kfifo_put(&ubq->evts_fifo, tag); + if (last) + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; @@ -1429,16 +2084,22 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, return BLK_STS_OK; } -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +/* + * Common helper for queue_rq that handles request preparation and + * cancellation checks. Returns status and sets should_queue to indicate + * whether the caller should proceed with queuing the request. 
+ */ +static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq, + struct request *rq, + bool *should_queue) { - struct ublk_queue *ubq = hctx->driver_data; - struct request *rq = bd->rq; blk_status_t res; res = ublk_prep_req(ubq, rq, false); - if (res != BLK_STS_OK) + if (res != BLK_STS_OK) { + *should_queue = false; return res; + } /* * ->canceling has to be handled after ->force_abort and ->fail_io @@ -1446,14 +2107,47 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * of recovery, and cause hang when deleting disk */ if (unlikely(ubq->canceling)) { + *should_queue = false; __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } + *should_queue = true; + return BLK_STS_OK; +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + bool should_queue; + blk_status_t res; + + res = __ublk_queue_rq_common(ubq, rq, &should_queue); + if (!should_queue) + return res; + ublk_queue_cmd(ubq, rq); return BLK_STS_OK; } +static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + bool should_queue; + blk_status_t res; + + res = __ublk_queue_rq_common(ubq, rq, &should_queue); + if (!should_queue) + return res; + + ublk_batch_queue_cmd(ubq, rq, bd->last); + return BLK_STS_OK; +} + static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, const struct ublk_io *io2) { @@ -1462,6 +2156,19 @@ static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, (io->task == io2->task); } +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct ublk_batch_fetch_cmd *fcmd; + + spin_lock(&ubq->evts_lock); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; @@ -1490,6 +2197,57 @@ static void ublk_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } +static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +{ + unsigned short tags[MAX_NR_TAG]; + struct ublk_batch_fetch_cmd *fcmd; + struct request *rq; + unsigned cnt = 0; + + spin_lock(&ubq->evts_lock); + rq_list_for_each(l, rq) { + tags[cnt++] = (unsigned short)rq->tag; + if (cnt >= MAX_NR_TAG) { + kfifo_in(&ubq->evts_fifo, tags, cnt); + cnt = 0; + } + } + if (cnt) + kfifo_in(&ubq->evts_fifo, tags, cnt); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + rq_list_init(l); + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + +static void ublk_batch_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_queue *ubq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + + if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { + rq_list_add_tail(&requeue_list, req); + continue; + } + + if (ubq && this_q != ubq && !rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + ubq = this_q; + rq_list_add_tail(&submit_list, req); + } + + if (!rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + *rqlist = requeue_list; +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void 
*driver_data, unsigned int hctx_idx) { @@ -1507,10 +2265,20 @@ static const struct blk_mq_ops ublk_mq_ops = { .timeout = ublk_timeout, }; +static const struct blk_mq_ops ublk_batch_mq_ops = { + .commit_rqs = ublk_commit_rqs, + .queue_rq = ublk_batch_queue_rq, + .queue_rqs = ublk_batch_queue_rqs, + .init_hctx = ublk_init_hctx, + .timeout = ublk_timeout, +}; + static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; + ubq->nr_io_ready = 0; + for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1559,7 +2327,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; - ub->nr_io_ready = 0; + ub->nr_queue_ready = 0; ub->unprivileged_daemons = false; ub->ublksrv_tgid = -1; } @@ -1813,13 +2581,32 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, struct request *req) { - WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); + WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) && + io->flags & UBLK_IO_FLAG_ACTIVE); if (ublk_nosrv_should_reissue_outstanding(ub)) blk_mq_requeue_request(req, false); else { io->res = -EIO; - __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); + } +} + +/* + * Request tag may just be filled to event kfifo, not get chance to + * dispatch, abort these requests too + */ +static void ublk_abort_batch_queue(struct ublk_device *ub, + struct ublk_queue *ubq) +{ + unsigned short tag; + + while (kfifo_out(&ubq->evts_fifo, &tag, 1)) { + struct request *req = blk_mq_tag_to_rq( + ub->tag_set.tags[ubq->q_id], tag); + + if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req))) + __ublk_fail_req(ub, &ubq->ios[tag], req); } } @@ -1841,6 +2628,9 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) __ublk_fail_req(ub, io, io->req); } + + if (ublk_support_batch_io(ubq)) + ublk_abort_batch_queue(ub, ubq); } static void ublk_start_cancel(struct ublk_device *ub) @@ -1904,6 +2694,66 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } +/* + * Cancel a batch fetch command if it hasn't been claimed by another path. + * + * An fcmd can only be cancelled if: + * 1. It's not the active_fcmd (which is currently being processed) + * 2. It's still on the list (!list_empty check) - once removed from the list, + * the fcmd is considered claimed and will be freed by whoever removed it + * + * Use list_del_init() so subsequent list_empty() checks work correctly. 
+ */ +static void ublk_batch_cancel_cmd(struct ublk_queue *ubq, + struct ublk_batch_fetch_cmd *fcmd, + unsigned int issue_flags) +{ + bool done; + + spin_lock(&ubq->evts_lock); + done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node); + if (done) + list_del_init(&fcmd->node); + spin_unlock(&ubq->evts_lock); + + if (done) { + io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags); + ublk_batch_free_fcmd(fcmd); + } +} + +static void ublk_batch_cancel_queue(struct ublk_queue *ubq) +{ + struct ublk_batch_fetch_cmd *fcmd; + LIST_HEAD(fcmd_list); + + spin_lock(&ubq->evts_lock); + ubq->force_abort = true; + list_splice_init(&ubq->fcmd_head, &fcmd_list); + fcmd = READ_ONCE(ubq->active_fcmd); + if (fcmd) + list_move(&fcmd->node, &ubq->fcmd_head); + spin_unlock(&ubq->evts_lock); + + while (!list_empty(&fcmd_list)) { + fcmd = list_first_entry(&fcmd_list, + struct ublk_batch_fetch_cmd, node); + ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED); + } +} + +static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_queue *ubq = pdu->ubq; + + ublk_start_cancel(ubq->dev); + + ublk_batch_cancel_cmd(ubq, fcmd, issue_flags); +} + /* * The ublk char device won't be closed when calling cancel fn, so both * ublk device and queue are guaranteed to be live @@ -1944,17 +2794,25 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } +static inline bool ublk_queue_ready(const struct ublk_queue *ubq) +{ + return ubq->nr_io_ready == ubq->q_depth; +} + static inline bool ublk_dev_ready(const struct ublk_device *ub) { - u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth; - - return ub->nr_io_ready == total; + return ub->nr_queue_ready == ub->dev_info.nr_hw_queues; } static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; + if (ublk_support_batch_io(ubq)) { + ublk_batch_cancel_queue(ubq); + return; + } + for (i = 0; i < ubq->q_depth; i++) ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } @@ -2052,37 +2910,52 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_cancel_dev(ub); } -/* reset ublk io_uring queue & io flags */ -static void ublk_reset_io_flags(struct ublk_device *ub) +/* reset per-queue io flags */ +static void ublk_queue_reset_io_flags(struct ublk_queue *ubq) { - int i, j; + int j; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - /* UBLK_IO_FLAG_CANCELED can be cleared now */ - spin_lock(&ubq->cancel_lock); - for (j = 0; j < ubq->q_depth; j++) - ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; - spin_unlock(&ubq->cancel_lock); - ubq->fail_io = false; - } - mutex_lock(&ub->cancel_mutex); - ublk_set_canceling(ub, false); - mutex_unlock(&ub->cancel_mutex); + /* UBLK_IO_FLAG_CANCELED can be cleared now */ + spin_lock(&ubq->cancel_lock); + for (j = 0; j < ubq->q_depth; j++) + ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; + ubq->canceling = false; + spin_unlock(&ubq->cancel_lock); + ubq->fail_io = false; } /* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub) +static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id) __must_hold(&ub->mutex) { + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) ub->unprivileged_daemons = true; - ub->nr_io_ready++; + ubq->nr_io_ready++; + + 
/* Check if this specific queue is now fully ready */ + if (ublk_queue_ready(ubq)) { + ub->nr_queue_ready++; + + /* + * Reset queue flags as soon as this queue is ready. + * This clears the canceling flag, allowing batch FETCH commands + * to succeed during recovery without waiting for all queues. + */ + ublk_queue_reset_io_flags(ubq); + } + + /* Check if all queues are ready */ if (ublk_dev_ready(ub)) { - /* now we are ready for handling ublk io request */ - ublk_reset_io_flags(ub); + /* + * All queues ready - clear device-level canceling flag + * and complete the recovery/initialization. + */ + mutex_lock(&ub->cancel_mutex); + ub->canceling = false; + mutex_unlock(&ub->cancel_mutex); complete_all(&ub->completion); } } @@ -2115,7 +2988,7 @@ static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd return 0; } -static int ublk_handle_auto_buf_reg(struct ublk_io *io, +static void ublk_clear_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd, u16 *buf_idx) { @@ -2135,7 +3008,13 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io, if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) *buf_idx = io->buf.auto_reg.index; } +} +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) +{ + ublk_clear_auto_buf_reg(io, cmd, buf_idx); return ublk_set_auto_buf_reg(io, cmd); } @@ -2208,7 +3087,7 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, if (!ublk_dev_support_zero_copy(ub)) return -EINVAL; - req = __ublk_check_and_get_req(ub, q_id, tag, io, 0); + req = __ublk_check_and_get_req(ub, q_id, tag, io); if (!req) return -EINVAL; @@ -2280,7 +3159,7 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) } static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, - struct ublk_io *io) + struct ublk_io *io, u16 q_id) { /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ if (ublk_dev_ready(ub)) @@ -2294,14 +3173,16 @@ static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, ublk_fill_io_cmd(io, cmd); - WRITE_ONCE(io->task, get_task_struct(current)); - ublk_mark_io_ready(ub); + if (ublk_dev_support_batch_io(ub)) + WRITE_ONCE(io->task, NULL); + else + WRITE_ONCE(io->task, get_task_struct(current)); return 0; } static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, - struct ublk_io *io, __u64 buf_addr) + struct ublk_io *io, __u64 buf_addr, u16 q_id) { int ret; @@ -2311,9 +3192,11 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, * FETCH, so it is fine even for IO_URING_F_NONBLOCK. 
*/ mutex_lock(&ub->mutex); - ret = __ublk_fetch(cmd, ub, io); + ret = __ublk_fetch(cmd, ub, io, q_id); if (!ret) ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); + if (!ret) + ublk_mark_io_ready(ub, q_id); mutex_unlock(&ub->mutex); return ret; } @@ -2417,7 +3300,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, ret = ublk_check_fetch_buf(ub, addr); if (ret) goto out; - ret = ublk_fetch(cmd, ub, io, addr); + ret = ublk_fetch(cmd, ub, io, addr, q_id); if (ret) goto out; @@ -2462,15 +3345,14 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, io->res = result; req = ublk_fill_io_cmd(io, cmd); ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); - compl = ublk_need_complete_req(ub, io); - - /* can't touch 'ublk_io' any more */ if (buf_idx != UBLK_INVALID_BUF_IDX) io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); + compl = ublk_need_complete_req(ub, io); + if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = addr; if (compl) - __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); if (ret) goto out; @@ -2502,7 +3384,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - u16 q_id, u16 tag, struct ublk_io *io, size_t offset) + u16 q_id, u16 tag, struct ublk_io *io) { struct request *req; @@ -2523,9 +3405,6 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, if (!ublk_rq_has_data(req)) goto fail_put; - if (offset > blk_rq_bytes(req)) - goto fail_put; - return req; fail_put: ublk_put_req_ref(io, req); @@ -2558,6 +3437,479 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) + return *(const __u64 *)(buf + sizeof(*elem)); + return 0; +} + +static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) + return *(const __u64 *)(buf + sizeof(*elem) + + 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)); + return -1; +} + +static struct ublk_auto_buf_reg +ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + struct ublk_auto_buf_reg reg = { + .index = elem->buf_index, + .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ? 
+ UBLK_AUTO_BUF_REG_FALLBACK : 0, + }; + + return reg; +} + +/* + * 48 can hold any type of buffer element(8, 16 and 24 bytes) because + * it is the least common multiple(LCM) of 8, 16 and 24 + */ +#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10) +struct ublk_batch_io_iter { + void __user *uaddr; + unsigned done, total; + unsigned char elem_bytes; + /* copy to this buffer from user space */ + unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ]; +}; + +static inline int +__ublk_walk_cmd_buf(struct ublk_queue *ubq, + struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + unsigned bytes, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < bytes; i += iter->elem_bytes) { + const struct ublk_elem_header *elem = + (const struct ublk_elem_header *)&iter->buf[i]; + + if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) { + ret = -EINVAL; + break; + } + + ret = cb(ubq, data, elem); + if (unlikely(ret)) + break; + } + + iter->done += i; + return ret; +} + +static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + int ret = 0; + + while (iter->done < iter->total) { + unsigned int len = min(sizeof(iter->buf), iter->total - iter->done); + + if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) { + pr_warn("ublk%d: read batch cmd buffer failed\n", + data->ub->dev_info.dev_id); + return -EFAULT; + } + + ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb); + if (ret) + return ret; + } + return 0; +} + +static int ublk_batch_unprep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + + /* + * If queue was ready before this decrement, it won't be anymore, + * so we need to decrement the queue ready count and restore the + * canceling flag to prevent new requests from being queued. 
+ */ + if (ublk_queue_ready(ubq)) { + data->ub->nr_queue_ready--; + spin_lock(&ubq->cancel_lock); + ubq->canceling = true; + spin_unlock(&ubq->cancel_lock); + } + ubq->nr_io_ready--; + + ublk_io_lock(io); + io->flags = 0; + ublk_io_unlock(io); + return 0; +} + +static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data) +{ + int ret; + + /* Re-process only what we've already processed, starting from beginning */ + iter->total = iter->done; + iter->done = 0; + + ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io); + WARN_ON_ONCE(ret); +} + +static int ublk_batch_prep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + union ublk_io_buf buf = { 0 }; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + else if (ublk_dev_need_map_io(data->ub)) { + buf.addr = ublk_batch_buf_addr(uc, elem); + + ret = ublk_check_fetch_buf(data->ub, buf.addr); + if (ret) + return ret; + } + + ublk_io_lock(io); + ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id); + if (!ret) + io->buf = buf; + ublk_io_unlock(io); + + if (!ret) + ublk_mark_io_ready(data->ub, ubq->q_id); + + return ret; +} + +static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + mutex_lock(&data->ub->mutex); + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io); + + if (ret && iter.done) + ublk_batch_revert_prep_cmd(&iter, data); + mutex_unlock(&data->ub->mutex); + return ret; +} + +static int ublk_batch_commit_io_check(const struct ublk_queue *ubq, + struct ublk_io *io, + union ublk_io_buf *buf) +{ + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EBUSY; + + /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */ + if (ublk_need_map_io(ubq) && !buf->addr) + return -EINVAL; + return 0; +} + +static int ublk_batch_commit_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + u16 buf_idx = UBLK_INVALID_BUF_IDX; + union ublk_io_buf buf = { 0 }; + struct request *req = NULL; + bool auto_reg = false; + bool compl = false; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) { + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + auto_reg = true; + } else if (ublk_dev_need_map_io(data->ub)) + buf.addr = ublk_batch_buf_addr(uc, elem); + + ublk_io_lock(io); + ret = ublk_batch_commit_io_check(ubq, io, &buf); + if (!ret) { + io->res = elem->result; + io->buf = buf; + req = ublk_fill_io_cmd(io, data->cmd); + + if (auto_reg) + ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx); + compl = ublk_need_complete_req(data->ub, io); + } + ublk_io_unlock(io); + + if (unlikely(ret)) { + pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n", + __func__, data->ub->dev_info.dev_id, ubq->q_id, + elem->tag, ret); + return ret; + } + + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ublk_batch_zone_lba(uc, elem); + 
if (compl) + __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob); + return 0; +} + +static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + DEFINE_IO_COMP_BATCH(iob); + int ret; + + data->iob = &iob; + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io); + + if (iob.complete) + iob.complete(&iob); + + return iter.done == 0 ? ret : iter.done; +} + +static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) +{ + unsigned elem_bytes = sizeof(struct ublk_elem_header); + + if (uc->flags & ~UBLK_BATCH_F_ALL) + return -EINVAL; + + /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */ + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)) + return -EINVAL; + + elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) + + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0); + if (uc->elem_bytes != elem_bytes) + return -EINVAL; + return 0; +} + +static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (uc->nr_elem > data->ub->dev_info.queue_depth) + return -E2BIG; + + if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) && + !ublk_dev_is_zoned(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) && + !ublk_dev_need_map_io(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + !ublk_dev_support_auto_buf_reg(data->ub)) + return -EINVAL; + + return ublk_check_batch_cmd_flags(uc); +} + +static int ublk_batch_attach(struct ublk_queue *ubq, + struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + struct ublk_batch_fetch_cmd *new_fcmd = NULL; + bool free = false; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd); + + spin_lock(&ubq->evts_lock); + if (unlikely(ubq->force_abort || ubq->canceling)) { + free = true; + } else { + list_add_tail(&fcmd->node, &ubq->fcmd_head); + new_fcmd = __ublk_acquire_fcmd(ubq); + } + spin_unlock(&ubq->evts_lock); + + if (unlikely(free)) { + ublk_batch_free_fcmd(fcmd); + return -ENODEV; + } + + pdu->ubq = ubq; + pdu->fcmd = fcmd; + io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags); + + if (!new_fcmd) + goto out; + + /* + * If the two fetch commands are originated from same io_ring_ctx, + * run batch dispatch directly. Otherwise, schedule task work for + * doing it. 
+ */ + if (io_uring_cmd_ctx_handle(new_fcmd->cmd) == + io_uring_cmd_ctx_handle(fcmd->cmd)) { + data->cmd = new_fcmd->cmd; + ublk_batch_dispatch(ubq, data, new_fcmd); + } else { + io_uring_cmd_complete_in_task(new_fcmd->cmd, + ublk_batch_tw_cb); + } +out: + return -EIOCBQUEUED; +} + +static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd); + + if (!fcmd) + return -ENOMEM; + + return ublk_batch_attach(ubq, data, fcmd); +} + +static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) + return -EINVAL; + + if (uc->elem_bytes != sizeof(__u16)) + return -EINVAL; + + if (uc->flags != 0) + return -EINVAL; + + return 0; +} + +static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + unsigned tag = READ_ONCE(ub_cmd->tag); + unsigned q_id = READ_ONCE(ub_cmd->q_id); + unsigned index = READ_ONCE(ub_cmd->addr); + struct ublk_queue *ubq; + struct ublk_io *io; + + if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, index, issue_flags); + + if (q_id >= ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (tag >= ub->dev_info.queue_depth) + return -EINVAL; + + if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF) + return -EOPNOTSUPP; + + ubq = ublk_get_queue(ub, q_id); + io = &ubq->ios[tag]; + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, + issue_flags); +} + +static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + struct ublk_batch_io_data data = { + .ub = ub, + .cmd = cmd, + .header = (struct ublk_batch_io) { + .q_id = READ_ONCE(uc->q_id), + .flags = READ_ONCE(uc->flags), + .nr_elem = READ_ONCE(uc->nr_elem), + .elem_bytes = READ_ONCE(uc->elem_bytes), + }, + .issue_flags = issue_flags, + }; + u32 cmd_op = cmd->cmd_op; + int ret = -EINVAL; + + if (unlikely(issue_flags & IO_URING_F_CANCEL)) { + ublk_batch_cancel_fn(cmd, issue_flags); + return 0; + } + + switch (cmd_op) { + case UBLK_U_IO_PREP_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_prep_cmd(&data); + break; + case UBLK_U_IO_COMMIT_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_commit_cmd(&data); + break; + case UBLK_U_IO_FETCH_IO_CMDS: + ret = ublk_validate_batch_fetch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_fetch_cmd(&data); + break; + default: + ret = ublk_handle_non_batch_cmd(cmd, issue_flags); + break; + } +out: + return ret; +} + static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { @@ -2575,83 +3927,96 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, return false; } -static struct request *ublk_check_and_get_req(struct kiocb *iocb, - struct iov_iter *iter, size_t *off, int dir, - struct ublk_io **io) +static ssize_t +ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) { struct ublk_device *ub = iocb->ki_filp->private_data; struct ublk_queue *ubq; struct request *req; 
+ struct ublk_io *io; + unsigned data_len; + bool is_integrity; + bool on_daemon; size_t buf_off; u16 tag, q_id; + ssize_t ret; if (!user_backed_iter(iter)) - return ERR_PTR(-EACCES); + return -EACCES; if (ub->dev_info.state == UBLK_S_DEV_DEAD) - return ERR_PTR(-EACCES); + return -EACCES; tag = ublk_pos_to_tag(iocb->ki_pos); q_id = ublk_pos_to_hwq(iocb->ki_pos); buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG); + + if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity)) + return -EINVAL; if (q_id >= ub->dev_info.nr_hw_queues) - return ERR_PTR(-EINVAL); + return -EINVAL; ubq = ublk_get_queue(ub, q_id); if (!ublk_dev_support_user_copy(ub)) - return ERR_PTR(-EACCES); + return -EACCES; if (tag >= ub->dev_info.queue_depth) - return ERR_PTR(-EINVAL); + return -EINVAL; - *io = &ubq->ios[tag]; - req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off); - if (!req) - return ERR_PTR(-EINVAL); + io = &ubq->ios[tag]; + on_daemon = current == READ_ONCE(io->task); + if (on_daemon) { + /* On daemon, io can't be completed concurrently, so skip ref */ + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EINVAL; - if (!ublk_check_ubuf_dir(req, dir)) - goto fail; + req = io->req; + if (!ublk_rq_has_data(req)) + return -EINVAL; + } else { + req = __ublk_check_and_get_req(ub, q_id, tag, io); + if (!req) + return -EINVAL; + } - *off = buf_off; - return req; -fail: - ublk_put_req_ref(*io, req); - return ERR_PTR(-EACCES); + if (is_integrity) { + struct blk_integrity *bi = &req->q->limits.integrity; + + data_len = bio_integrity_bytes(bi, blk_rq_sectors(req)); + } else { + data_len = blk_rq_bytes(req); + } + if (buf_off > data_len) { + ret = -EINVAL; + goto out; + } + + if (!ublk_check_ubuf_dir(req, dir)) { + ret = -EACCES; + goto out; + } + + if (is_integrity) + ret = ublk_copy_user_integrity(req, buf_off, iter, dir); + else + ret = ublk_copy_user_pages(req, buf_off, iter, dir); + +out: + if (!on_daemon) + ublk_put_req_ref(io, req); + return ret; } static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct request *req; - struct ublk_io *io; - size_t buf_off; - size_t ret; - - req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io); - if (IS_ERR(req)) - return PTR_ERR(req); - - ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); - ublk_put_req_ref(io, req); - - return ret; + return ublk_user_copy(iocb, to, ITER_DEST); } static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct request *req; - struct ublk_io *io; - size_t buf_off; - size_t ret; - - req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io); - if (IS_ERR(req)) - return PTR_ERR(req); - - ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); - ublk_put_req_ref(io, req); - - return ret; + return ublk_user_copy(iocb, from, ITER_SOURCE); } static const struct file_operations ublk_ch_fops = { @@ -2664,13 +4029,19 @@ static const struct file_operations ublk_ch_fops = { .mmap = ublk_ch_mmap, }; -static void ublk_deinit_queue(struct ublk_device *ub, int q_id) -{ - struct ublk_queue *ubq = ub->queues[q_id]; - int size, i; +static const struct file_operations ublk_ch_batch_io_fops = { + .owner = THIS_MODULE, + .open = ublk_ch_open, + .release = ublk_ch_release, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, + .uring_cmd = ublk_ch_batch_io_uring_cmd, + .mmap = ublk_ch_mmap, +}; - if (!ubq) - return; +static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq) +{ 
+ int size, i; size = ublk_queue_cmd_buf_size(ub); @@ -2685,7 +4056,20 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); + if (ublk_dev_support_batch_io(ub)) + ublk_io_evts_deinit(ubq); + kvfree(ubq); +} + +static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ub->queues[q_id]; + + if (!ubq) + return; + + __ublk_deinit_queue(ub, ubq); ub->queues[q_id] = NULL; } @@ -2709,7 +4093,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) struct ublk_queue *ubq; struct page *page; int numa_node; - int size; + int size, i, ret; /* Determine NUMA node based on queue's CPU affinity */ numa_node = ublk_get_queue_numa_node(ub, q_id); @@ -2734,9 +4118,22 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) } ubq->io_cmd_buf = page_address(page); + for (i = 0; i < ubq->q_depth; i++) + spin_lock_init(&ubq->ios[i].lock); + + if (ublk_dev_support_batch_io(ub)) { + ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node); + if (ret) + goto fail; + INIT_LIST_HEAD(&ubq->fcmd_head); + } ub->queues[q_id] = ubq; ubq->dev = ub; + return 0; +fail: + __ublk_deinit_queue(ub, ubq); + return ret; } static void ublk_deinit_queues(struct ublk_device *ub) @@ -2824,7 +4221,10 @@ static int ublk_add_chdev(struct ublk_device *ub) if (ret) goto fail; - cdev_init(&ub->cdev, &ublk_ch_fops); + if (ublk_dev_support_batch_io(ub)) + cdev_init(&ub->cdev, &ublk_ch_batch_io_fops); + else + cdev_init(&ub->cdev, &ublk_ch_fops); ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; @@ -2848,7 +4248,10 @@ static void ublk_align_max_io_size(struct ublk_device *ub) static int ublk_add_tag_set(struct ublk_device *ub) { - ub->tag_set.ops = &ublk_mq_ops; + if (ublk_dev_support_batch_io(ub)) + ub->tag_set.ops = &ublk_batch_mq_ops; + else + ub->tag_set.ops = &ublk_mq_ops; ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; @@ -2959,6 +4362,23 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, lim.max_segments = ub->params.seg.max_segments; } + if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { + const struct ublk_param_integrity *p = &ub->params.integrity; + int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); + + lim.max_integrity_segments = + p->max_integrity_segments ?: USHRT_MAX; + lim.integrity = (struct blk_integrity) { + .flags = ublk_integrity_flags(p->flags), + .csum_type = ublk_integrity_csum_type(p->csum_type), + .metadata_size = p->metadata_size, + .pi_offset = p->pi_offset, + .interval_exp = p->interval_exp, + .tag_size = p->tag_size, + .pi_tuple_size = pi_tuple_size, + }; + } + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -2966,6 +4386,11 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, return -EINVAL; mutex_lock(&ub->mutex); + /* device may become not ready in case of F_BATCH */ + if (!ublk_dev_ready(ub)) { + ret = -EINVAL; + goto out_unlock; + } if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { ret = -EEXIST; @@ -3013,9 +4438,14 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, set_bit(UB_STATE_USED, &ub->state); - /* Schedule async partition scan for trusted daemons */ - if (!ub->unprivileged_daemons) - schedule_work(&ub->partition_scan_work); + /* Skip partition scan if disabled by user */ + if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) { + 
clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state); + } else { + /* Schedule async partition scan for trusted daemons */ + if (!ub->unprivileged_daemons) + schedule_work(&ub->partition_scan_work); + } out_put_cdev: if (ret) { @@ -3149,6 +4579,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) return -EINVAL; } + /* User copy is required to access integrity buffer */ + if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY)) + return -EINVAL; + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); @@ -3204,13 +4638,22 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK | UBLK_F_PER_IO_DAEMON | - UBLK_F_BUF_REG_OFF_DAEMON; + UBLK_F_BUF_REG_OFF_DAEMON | + UBLK_F_SAFE_STOP_DEV; + + /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON; /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* UBLK_F_BATCH_IO doesn't support GET_DATA */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for * returning write_append_lba, which is only allowed in case of @@ -3311,19 +4754,45 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) return 0; } -static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) +static inline void ublk_ctrl_cmd_dump(u32 cmd_op, + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); - pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", - __func__, cmd->cmd_op, header->dev_id, header->queue_id, + __func__, cmd_op, header->dev_id, header->queue_id, header->data[0], header->addr, header->len); } -static int ublk_ctrl_stop_dev(struct ublk_device *ub) +static void ublk_ctrl_stop_dev(struct ublk_device *ub) { ublk_stop_dev(ub); - return 0; +} + +static int ublk_ctrl_try_stop_dev(struct ublk_device *ub) +{ + struct gendisk *disk; + int ret = 0; + + disk = ublk_get_disk(ub); + if (!disk) + return -ENODEV; + + mutex_lock(&disk->open_mutex); + if (disk_openers(disk) > 0) { + ret = -EBUSY; + goto unlock; + } + ub->block_open = true; + /* release open_mutex as del_gendisk() will reacquire it */ + mutex_unlock(&disk->open_mutex); + + ublk_ctrl_stop_dev(ub); + goto out; + +unlock: + mutex_unlock(&disk->open_mutex); +out: + ublk_put_disk(disk); + return ret; } static int ublk_ctrl_get_dev_info(struct ublk_device *ub, @@ -3446,8 +4915,7 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, return ret; } -static int ublk_ctrl_start_recovery(struct ublk_device *ub, - const struct ublksrv_ctrl_cmd *header) +static int ublk_ctrl_start_recovery(struct ublk_device *ub) { int ret = -EINVAL; @@ -3476,7 +4944,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, ret = -EBUSY; goto out_unlock; } - pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); + pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number); init_completion(&ub->completion); ret = 0; out_unlock: @@ -3578,6 +5046,13 @@ static int ublk_wait_for_idle_io(struct ublk_device *ub, unsigned int elapsed = 0; int ret; + /* + * For UBLK_F_BATCH_IO ublk server can get 
notified with existing + * or new fetch command, so needn't wait any more + */ + if (ublk_dev_support_batch_io(ub)) + return 0; + while (elapsed < timeout_ms && !signal_pending(current)) { unsigned int queues_cancelable = 0; int i; @@ -3685,9 +5160,8 @@ exit: } static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, - struct io_uring_cmd *cmd) + u32 cmd_op, struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe); bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; void __user *argp = (void __user *)(unsigned long)header->addr; char *dev_path = NULL; @@ -3703,7 +5177,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, * know if the specified device is created as unprivileged * mode. */ - if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) + if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2) return 0; } @@ -3724,7 +5198,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, return PTR_ERR(dev_path); ret = -EINVAL; - switch (_IOC_NR(cmd->cmd_op)) { + switch (_IOC_NR(cmd_op)) { case UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: case UBLK_CMD_GET_QUEUE_AFFINITY: @@ -3741,6 +5215,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_END_USER_RECOVERY: case UBLK_CMD_UPDATE_SIZE: case UBLK_CMD_QUIESCE_DEV: + case UBLK_CMD_TRY_STOP_DEV: mask = MAY_READ | MAY_WRITE; break; default: @@ -3753,7 +5228,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, header->addr += header->dev_path_len; } pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", - __func__, ub->ub_number, cmd->cmd_op, + __func__, ub->ub_number, cmd_op, ub->dev_info.owner_uid, ub->dev_info.owner_gid, dev_path, ret); exit: @@ -3777,7 +5252,9 @@ static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op) static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); + /* May point to userspace-mapped memory */ + const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); + struct ublksrv_ctrl_cmd header; struct ublk_device *ub = NULL; u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; @@ -3786,44 +5263,51 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ublk_ctrl_cmd_dump(cmd); - if (!(issue_flags & IO_URING_F_SQE128)) - goto out; + return -EINVAL; + + header.dev_id = READ_ONCE(ub_src->dev_id); + header.queue_id = READ_ONCE(ub_src->queue_id); + header.len = READ_ONCE(ub_src->len); + header.addr = READ_ONCE(ub_src->addr); + header.data[0] = READ_ONCE(ub_src->data[0]); + header.dev_path_len = READ_ONCE(ub_src->dev_path_len); + ublk_ctrl_cmd_dump(cmd_op, &header); ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; if (cmd_op == UBLK_U_CMD_GET_FEATURES) { - ret = ublk_ctrl_get_features(header); + ret = ublk_ctrl_get_features(&header); goto out; } if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { ret = -ENODEV; - ub = ublk_get_device_from_id(header->dev_id); + ub = ublk_get_device_from_id(header.dev_id); if (!ub) goto out; - ret = ublk_ctrl_uring_cmd_permission(ub, cmd); + ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header); if (ret) goto put_dev; } switch (_IOC_NR(cmd_op)) { case UBLK_CMD_START_DEV: - ret = ublk_ctrl_start_dev(ub, header); + ret = ublk_ctrl_start_dev(ub, &header); break; case UBLK_CMD_STOP_DEV: - ret = ublk_ctrl_stop_dev(ub); + ublk_ctrl_stop_dev(ub); + ret = 0; break; case 
UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: - ret = ublk_ctrl_get_dev_info(ub, header); + ret = ublk_ctrl_get_dev_info(ub, &header); break; case UBLK_CMD_ADD_DEV: - ret = ublk_ctrl_add_dev(header); + ret = ublk_ctrl_add_dev(&header); break; case UBLK_CMD_DEL_DEV: ret = ublk_ctrl_del_dev(&ub, true); @@ -3832,26 +5316,29 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: - ret = ublk_ctrl_get_queue_affinity(ub, header); + ret = ublk_ctrl_get_queue_affinity(ub, &header); break; case UBLK_CMD_GET_PARAMS: - ret = ublk_ctrl_get_params(ub, header); + ret = ublk_ctrl_get_params(ub, &header); break; case UBLK_CMD_SET_PARAMS: - ret = ublk_ctrl_set_params(ub, header); + ret = ublk_ctrl_set_params(ub, &header); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(ub, header); + ret = ublk_ctrl_start_recovery(ub); break; case UBLK_CMD_END_USER_RECOVERY: - ret = ublk_ctrl_end_recovery(ub, header); + ret = ublk_ctrl_end_recovery(ub, &header); break; case UBLK_CMD_UPDATE_SIZE: - ublk_ctrl_set_size(ub, header); + ublk_ctrl_set_size(ub, &header); ret = 0; break; case UBLK_CMD_QUIESCE_DEV: - ret = ublk_ctrl_quiesce_dev(ub, header); + ret = ublk_ctrl_quiesce_dev(ub, &header); + break; + case UBLK_CMD_TRY_STOP_DEV: + ret = ublk_ctrl_try_stop_dev(ub); break; default: ret = -EOPNOTSUPP; @@ -3863,7 +5350,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ublk_put_device(ub); out: pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", - __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); + __func__, ret, cmd_op, header.dev_id, header.queue_id); return ret; } @@ -3886,6 +5373,12 @@ static int __init ublk_init(void) BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + /* + * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE + * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG + */ + BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >= + UBLKSRV_IO_INTEGRITY_FLAG); BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); init_waitqueue_head(&ublk_idr_wq); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index a6ca92049c10..e9a7563b4b2f 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) } static enum rq_end_io_ret end_clone_request(struct request *clone, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct dm_rq_target_io *tio = clone->end_io_data; diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 84b7e2af6dba..1d4a050dab3a 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2085,7 +2085,7 @@ static void bitmap_destroy(struct mddev *mddev) return; bitmap_wait_behind_writes(mddev); - if (!mddev->serialize_policy) + if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) mddev_destroy_serial_pool(mddev, NULL); mutex_lock(&mddev->bitmap_info.mutex); @@ -2453,6 +2453,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); + mutex_lock(&bitmap->mddev->bitmap_info.mutex); spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2560,7 +2561,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); } 
spin_unlock_irq(&bitmap->counts.lock); - + mutex_unlock(&bitmap->mddev->bitmap_info.mutex); if (!init) { __bitmap_unplug(bitmap); bitmap->mddev->pers->quiesce(bitmap->mddev, 0); @@ -2809,7 +2810,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - if (!mddev->serialize_policy) + if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) mddev_destroy_serial_pool(mddev, NULL); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 11f1e91d387d..896279988dfd 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -549,8 +549,13 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - /* daemaon thread must exist */ thread = rcu_dereference_protected(mddev->thread, true); + if (!thread) { + pr_warn("md-cluster: Received metadata update but MD thread is not ready\n"); + dlm_unlock_sync(cinfo->no_new_dev_lockres); + return; + } + wait_event(thread->wqueue, (got_lock = mddev_trylock(mddev)) || test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index 9c1ade19b774..cd713a7dc270 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -712,8 +712,10 @@ static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) percpu_ref_kill(&pctl->active); if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), - llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) { + percpu_ref_resurrect(&pctl->active); return -ETIMEDOUT; + } return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 6d73f6e196a9..59cd303548de 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -279,7 +279,8 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) rdev_for_each(temp, mddev) { if (!rdev) { - if (!mddev->serialize_policy || + if (!test_bit(MD_SERIALIZE_POLICY, + &mddev->flags) || !rdev_need_serial(temp)) rdev_uninit_serial(temp); else @@ -2617,9 +2618,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); - /* May as well allow recovery to be retried once */ - mddev->recovery_disabled++; - return 0; fail: @@ -5864,11 +5862,11 @@ __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) { - return sprintf(page, "%d\n", mddev->fail_last_dev); + return sprintf(page, "%d\n", test_bit(MD_FAILLAST_DEV, &mddev->flags)); } /* - * Setting fail_last_dev to true to allow last device to be forcibly removed + * Setting MD_FAILLAST_DEV to allow last device to be forcibly removed * from RAID1/RAID10. 
*/ static ssize_t @@ -5881,8 +5879,10 @@ fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) if (ret) return ret; - if (value != mddev->fail_last_dev) - mddev->fail_last_dev = value; + if (value) + set_bit(MD_FAILLAST_DEV, &mddev->flags); + else + clear_bit(MD_FAILLAST_DEV, &mddev->flags); return len; } @@ -5895,11 +5895,12 @@ static ssize_t serialize_policy_show(struct mddev *mddev, char *page) if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else - return sprintf(page, "%d\n", mddev->serialize_policy); + return sprintf(page, "%d\n", + test_bit(MD_SERIALIZE_POLICY, &mddev->flags)); } /* - * Setting serialize_policy to true to enforce write IO is not reordered + * Setting MD_SERIALIZE_POLICY enforces that write IO is not reordered * for raid1. */ static ssize_t @@ -5912,7 +5913,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; - if (value == mddev->serialize_policy) + if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) return len; err = mddev_suspend_and_lock(mddev); @@ -5924,11 +5925,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; } - if (value) + if (value) { mddev_create_serial_pool(mddev, NULL); - else + set_bit(MD_SERIALIZE_POLICY, &mddev->flags); + } else { mddev_destroy_serial_pool(mddev, NULL); - mddev->serialize_policy = value; + clear_bit(MD_SERIALIZE_POLICY, &mddev->flags); + } unlock: mddev_unlock_and_resume(mddev); return err ?: len; @@ -6502,7 +6505,7 @@ int md_run(struct mddev *mddev) * the only valid external interface is through the md * device. */ - mddev->has_superblocks = false; + clear_bit(MD_HAS_SUPERBLOCK, &mddev->flags); rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; @@ -6515,7 +6518,7 @@ int md_run(struct mddev *mddev) } if (rdev->sb_page) - mddev->has_superblocks = true; + set_bit(MD_HAS_SUPERBLOCK, &mddev->flags); /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, @@ -6848,13 +6851,15 @@ static void __md_stop_writes(struct mddev *mddev) { timer_delete_sync(&mddev->safemode_timer); - if (mddev->pers && mddev->pers->quiesce) { - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + if (md_is_rdwr(mddev) || !mddev_is_dm(mddev)) { + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } - if (md_bitmap_enabled(mddev, true)) - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); + } if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6865,7 +6870,7 @@ static void __md_stop_writes(struct mddev *mddev) md_update_sb(mddev, 1); } /* disable policy to guarantee rdevs free resources for serialization */ - mddev->serialize_policy = 0; + clear_bit(MD_SERIALIZE_POLICY, &mddev->flags); mddev_destroy_serial_pool(mddev, NULL); } @@ -9068,20 +9073,22 @@ static bool is_mddev_idle(struct mddev *mddev, int init) return idle; } -void md_done_sync(struct mddev *mddev, int blocks, int ok) +void md_done_sync(struct mddev *mddev, int blocks) { /* another "blocks" (512byte) blocks have been synced */ atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); - if (!ok) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_ERROR, &mddev->recovery); - md_wakeup_thread(mddev->thread); - // stop recovery, signal do_sync ....
- } } EXPORT_SYMBOL(md_done_sync); +void md_sync_error(struct mddev *mddev) +{ + // stop recovery, signal do_sync .... + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} +EXPORT_SYMBOL(md_sync_error); + /* md_write_start(mddev, bi) * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update @@ -9125,7 +9132,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); - if (!mddev->has_superblocks) + if (!test_bit(MD_HAS_SUPERBLOCK, &mddev->flags)) return; wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); @@ -9430,6 +9437,53 @@ static bool sync_io_within_limit(struct mddev *mddev) (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); } +/* + * Update sync offset and mddev status when sync completes + */ +static void md_finish_sync(struct mddev *mddev, enum sync_action action) +{ + struct md_rdev *rdev; + + switch (action) { + case ACTION_RESYNC: + case ACTION_REPAIR: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + mddev->resync_offset = mddev->curr_resync; + break; + case ACTION_RECOVER: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (mddev->delta_disks >= 0 && + rdev_needs_recovery(rdev, mddev->curr_resync)) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + break; + case ACTION_RESHAPE: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->delta_disks > 0 && + mddev->pers->finish_reshape && + mddev->pers->size && + !mddev_is_dm(mddev)) { + mddev_lock_nointr(mddev); + md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); + mddev_unlock(mddev); + if (!mddev_is_clustered(mddev)) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); + } + if (mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + break; + /* */ + case ACTION_CHECK: + default: + break; + } +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -9445,7 +9499,6 @@ void md_do_sync(struct md_thread *thread) int last_mark,m; sector_t last_check; int skipped = 0; - struct md_rdev *rdev; enum sync_action action; const char *desc; struct blk_plug plug; @@ -9731,65 +9784,21 @@ update: wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync >= MD_RESYNC_ACTIVE) { + /* All sync IO completes after recovery_active becomes 0 */ mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); - if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > MD_RESYNC_ACTIVE) { - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->resync_offset) { - pr_debug("md: checkpointing %s of %s.\n", - desc, mdname(mddev)); - if (test_bit(MD_RECOVERY_ERROR, - &mddev->recovery)) - mddev->resync_offset = - mddev->curr_resync_completed; - else - mddev->resync_offset = - mddev->curr_resync; - } - } else - mddev->resync_offset = MaxSector; - } else { - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - mddev->curr_resync = MaxSector; - if 
(!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (mddev->delta_disks >= 0 && - rdev_needs_recovery(rdev, mddev->curr_resync)) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); - } - } - } + if (mddev->curr_resync > MD_RESYNC_ACTIVE) + md_finish_sync(mddev, action); skip: /* set CHANGE_PENDING here since maybe another update is needed, * so other nodes are informed. It should be harmless for normal * raid */ set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); - - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - mddev->delta_disks > 0 && - mddev->pers->finish_reshape && - mddev->pers->size && - !mddev_is_dm(mddev)) { - mddev_lock_nointr(mddev); - md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); - mddev_unlock(mddev); - if (!mddev_is_clustered(mddev)) - set_capacity_and_notify(mddev->gendisk, - mddev->array_sectors); - } - spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -10304,7 +10313,7 @@ void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; sector_t old_dev_sectors = mddev->dev_sectors; - bool is_reshaped = false; + bool is_reshaped = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); /* resync has finished, collect result */ md_unregister_thread(mddev, &mddev->sync_thread); @@ -10320,12 +10329,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) { - mddev->pers->finish_reshape(mddev); - if (mddev_is_clustered(mddev)) - is_reshaped = true; - } /* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped. @@ -10352,8 +10355,9 @@ void md_reap_sync_thread(struct mddev *mddev) * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ - if (mddev_is_clustered(mddev) && is_reshaped - && !test_bit(MD_CLOSING, &mddev->flags)) + if (mddev_is_clustered(mddev) && is_reshaped && + mddev->pers->finish_reshape && + !test_bit(MD_CLOSING, &mddev->flags)) mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -10413,8 +10417,14 @@ bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, else s += rdev->data_offset; - if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) { + /* + * Mark the disk as Faulty when setting badblocks fails, + * otherwise, bad sectors may be read. + */ + md_error(mddev, rdev); return false; + } /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) diff --git a/drivers/md/md.h b/drivers/md/md.h index 6985f2829bbd..ac84289664cd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -22,6 +22,10 @@ #include #define MaxSector (~(sector_t)0) +/* + * Number of guaranteed raid bios in case of extreme VM load: + */ +#define NR_RAID_BIOS 256 enum md_submodule_type { MD_PERSONALITY = 0, @@ -340,6 +344,9 @@ struct md_cluster_operations; * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. * @MD_DELETED: This device is being deleted + * @MD_HAS_SUPERBLOCK: There is persistence sb in member disks. 
+ * @MD_FAILLAST_DEV: Allow last rdev to be removed. + * @MD_SERIALIZE_POLICY: Enforce write IO is not reordered, just used by raid1. * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ @@ -356,6 +363,9 @@ enum mddev_flags { MD_BROKEN, MD_DO_DELETE, MD_DELETED, + MD_HAS_SUPERBLOCK, + MD_FAILLAST_DEV, + MD_SERIALIZE_POLICY, }; enum mddev_sb_flags { @@ -495,12 +505,6 @@ struct mddev { int ok_start_degraded; unsigned long recovery; - /* If a RAID personality determines that recovery (of a particular - * device) will fail due to a read error on the source device, it - * takes a copy of this number and does not attempt recovery again - * until this number changes. - */ - int recovery_disabled; int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so @@ -622,10 +626,6 @@ struct mddev { /* The sequence number for sync thread */ atomic_t sync_seq; - - bool has_superblocks:1; - bool fail_last_dev:1; - bool serialize_policy:1; }; enum recovery_flags { @@ -646,8 +646,6 @@ enum recovery_flags { MD_RECOVERY_FROZEN, /* waiting for pers->start() to finish */ MD_RECOVERY_WAIT, - /* interrupted because io-error */ - MD_RECOVERY_ERROR, /* flags determines sync action, see details in enum sync_action */ @@ -737,8 +735,8 @@ static inline int mddev_trylock(struct mddev *mddev) int ret; ret = mutex_trylock(&mddev->reconfig_mutex); - if (!ret && test_bit(MD_DELETED, &mddev->flags)) { - ret = -ENODEV; + if (ret && test_bit(MD_DELETED, &mddev->flags)) { + ret = 0; mutex_unlock(&mddev->reconfig_mutex); } return ret; @@ -912,7 +910,8 @@ extern const char *md_sync_action_name(enum sync_action action); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); -extern void md_done_sync(struct mddev *mddev, int blocks, int ok); +extern void md_done_sync(struct mddev *mddev, int blocks); +extern void md_sync_error(struct mddev *mddev); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 985c377356eb..d83b2b1c0049 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -27,7 +27,9 @@ module_param(default_layout, int, 0644); (1L << MD_JOURNAL_CLEAN) | \ (1L << MD_FAILFAST_SUPPORTED) |\ (1L << MD_HAS_PPL) | \ - (1L << MD_HAS_MULTIPLE_PPLS)) + (1L << MD_HAS_MULTIPLE_PPLS) | \ + (1L << MD_FAILLAST_DEV) | \ + (1L << MD_SERIALIZE_POLICY)) /* * inform the user of the raid configuration diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 521625756128..c33099925f23 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -3,11 +3,6 @@ #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -/* - * Number of guaranteed raid bios in case of extreme VM load: - */ -#define NR_RAID_BIOS 256 - /* when we get a read error on a read-only array, we redirect to another * device without failing the first device, or trying to over-write to * correct the read error. 
To keep track of bad blocks on a per-bio diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 57d50465eed1..867db18bc3ba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -542,7 +542,7 @@ static void raid1_end_write_request(struct bio *bio) call_bio_endio(r1_bio); } } - } else if (rdev->mddev->serialize_policy) + } else if (test_bit(MD_SERIALIZE_POLICY, &rdev->mddev->flags)) remove_serial(rdev, lo, hi); if (r1_bio->bios[mirror] == NULL) rdev_dec_pending(rdev, conf->mddev); @@ -1644,7 +1644,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); - if (mddev->serialize_policy) + if (test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) wait_for_serialization(rdev, r1_bio); } @@ -1746,7 +1746,7 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) * - &mddev->degraded is bumped. * * @rdev is marked as &Faulty excluding case when array is failed and - * &mddev->fail_last_dev is off. + * MD_FAILLAST_DEV is not set. */ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) { @@ -1759,8 +1759,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) (conf->raid_disks - mddev->degraded) == 1) { set_bit(MD_BROKEN, &mddev->flags); - if (!mddev->fail_last_dev) { - conf->recovery_disabled = mddev->recovery_disabled; + if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) { spin_unlock_irqrestore(&conf->device_lock, flags); return; } @@ -1904,7 +1903,6 @@ static bool raid1_remove_conf(struct r1conf *conf, int disk) /* Only remove non-faulty devices if recovery is not possible. */ if (!test_bit(Faulty, &rdev->flags) && - rdev->mddev->recovery_disabled != conf->recovery_disabled && rdev->mddev->degraded < conf->raid_disks) return false; @@ -1924,9 +1922,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -2062,7 +2057,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) } while (sectors_to_go > 0); } -static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) +static void put_sync_write_buf(struct r1bio *r1_bio) { if (atomic_dec_and_test(&r1_bio->remaining)) { struct mddev *mddev = r1_bio->mddev; @@ -2073,20 +2068,19 @@ static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) reschedule_retry(r1_bio); else { put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); + md_done_sync(mddev, s); } } } static void end_sync_write(struct bio *bio) { - int uptodate = !bio->bi_status; struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; - if (!uptodate) { + if (bio->bi_status) { abort_sync_write(mddev, r1_bio); set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -2099,7 +2093,7 @@ static void end_sync_write(struct bio *bio) set_bit(R1BIO_MadeGood, &r1_bio->state); } - put_sync_write_buf(r1_bio, uptodate); + put_sync_write_buf(r1_bio); } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, @@ -2116,8 +2110,7 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, rdev->mddev->recovery); } /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); + 
rdev_set_badblocks(rdev, sector, sectors, 0); return 0; } @@ -2348,9 +2341,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) || !fix_sync_read_error(r1_bio)) { - conf->recovery_disabled = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_done_sync(mddev, r1_bio->sectors, 0); + md_done_sync(mddev, r1_bio->sectors); + md_sync_error(mddev); put_buf(r1_bio); return; } @@ -2385,7 +2377,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) submit_bio_noacct(wbio); } - put_sync_write_buf(r1_bio, 1); + put_sync_write_buf(r1_bio); } /* @@ -2442,8 +2434,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) if (!success) { /* Cannot read from anywhere - mark it bad */ struct md_rdev *rdev = conf->mirrors[read_disk].rdev; - if (!rdev_set_badblocks(rdev, sect, s, 0)) - md_error(mddev, rdev); + rdev_set_badblocks(rdev, sect, s, 0); break; } /* write it back and re-read */ @@ -2487,7 +2478,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) } } -static bool narrow_write_error(struct r1bio *r1_bio, int i) +static void narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2504,17 +2495,16 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i) * We currently own a reference on the rdev. */ - int block_sectors; + int block_sectors, lbs = bdev_logical_block_size(rdev->bdev) >> 9; sector_t sector; int sectors; int sect_to_write = r1_bio->sectors; - bool ok = true; if (rdev->badblocks.shift < 0) - return false; + block_sectors = lbs; + else + block_sectors = roundup(1 << rdev->badblocks.shift, lbs); - block_sectors = roundup(1 << rdev->badblocks.shift, - bdev_logical_block_size(rdev->bdev) >> 9); sector = r1_bio->sector; sectors = ((sector + block_sectors) & ~(sector_t)(block_sectors - 1)) @@ -2542,18 +2532,21 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; - if (submit_bio_wait(wbio) < 0) - /* failure! */ - ok = rdev_set_badblocks(rdev, sector, - sectors, 0) - && ok; + if (submit_bio_wait(wbio) && + !rdev_set_badblocks(rdev, sector, sectors, 0)) { + /* + * Badblocks set failed, disk marked Faulty. + * No further operations needed. + */ + bio_put(wbio); + break; + } bio_put(wbio); sect_to_write -= sectors; sector += sectors; sectors = block_sectors; } - return ok; } static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) @@ -2566,17 +2559,14 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio if (bio->bi_end_io == NULL) continue; if (!bio->bi_status && - test_bit(R1BIO_MadeGood, &r1_bio->state)) { + test_bit(R1BIO_MadeGood, &r1_bio->state)) rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); - } if (bio->bi_status && - test_bit(R1BIO_WriteError, &r1_bio->state)) { - if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) - md_error(conf->mddev, rdev); - } + test_bit(R1BIO_WriteError, &r1_bio->state)) + rdev_set_badblocks(rdev, r1_bio->sector, s, 0); } put_buf(r1_bio); - md_done_sync(conf->mddev, s, 1); + md_done_sync(conf->mddev, s); } static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) @@ -2597,10 +2587,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * errors. 
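sync_request_write() above shows the new split between accounting and error signalling: md_done_sync() no longer takes an "ok" argument, and failures are reported through the separate md_sync_error() helper added earlier in this series. A rough sketch of how a personality's sync completion path reports both, with the wrapper function invented purely for illustration::

    /* Sketch only: account the synced sectors, then flag the error. */
    static void example_end_sync(struct mddev *mddev, int sectors, bool ok)
    {
            md_done_sync(mddev, sectors);   /* always account the sectors */
            if (!ok)
                    md_sync_error(mddev);   /* sets MD_RECOVERY_INTR and
                                             * wakes the sync thread */
    }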
*/ fail = true; - if (!narrow_write_error(r1_bio, m)) - md_error(conf->mddev, - conf->mirrors[m].rdev); - /* an I/O failed, we can't clear the bitmap */ + narrow_write_error(r1_bio, m); rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } @@ -2955,16 +2942,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, *skipped = 1; put_buf(r1_bio); - if (!ok) { - /* Cannot record the badblocks, so need to + if (!ok) + /* Cannot record the badblocks, md_error has set INTR, * abort the resync. - * If there are multiple read targets, could just - * fail the really bad ones ??? */ - conf->recovery_disabled = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); return 0; - } else + else return min_bad; } @@ -3151,7 +3134,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); - conf->recovery_disabled = mddev->recovery_disabled - 1; err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -3254,6 +3236,7 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); if (ret) { + md_unregister_thread(mddev, &conf->thread); if (!mddev->private) raid1_free(mddev, conf); return ret; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 2ebe35aaa534..c98d43a7ae99 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -93,11 +93,6 @@ struct r1conf { */ int fullsync; - /* When the same as mddev->recovery_disabled we don't allow - * recovery to be attempted as we expect a read error. - */ - int recovery_disabled; - mempool_t *r1bio_pool; mempool_t r1buf_pool; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84be4cc7e873..9debb20cf129 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1990,7 +1990,7 @@ static int enough(struct r10conf *conf, int ignore) * - &mddev->degraded is bumped. * * @rdev is marked as &Faulty excluding case when array is failed and - * &mddev->fail_last_dev is off. + * MD_FAILLAST_DEV is not set. */ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) { @@ -2002,7 +2002,7 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { set_bit(MD_BROKEN, &mddev->flags); - if (!mddev->fail_last_dev) { + if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) { spin_unlock_irqrestore(&conf->device_lock, flags); return; } @@ -2130,8 +2130,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) mirror = first; for ( ; mirror <= last ; mirror++) { p = &conf->mirrors[mirror]; - if (p->recovery_disabled == mddev->recovery_disabled) - continue; if (p->rdev) { if (test_bit(WantReplacement, &p->rdev->flags) && p->replacement == NULL && repl_slot < 0) @@ -2143,7 +2141,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (err) return err; p->head_position = 0; - p->recovery_disabled = mddev->recovery_disabled - 1; rdev->raid_disk = mirror; err = 0; if (rdev->saved_raid_disk != mirror) @@ -2196,7 +2193,6 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * is not possible. 
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != p->recovery_disabled && (!p->replacement || p->replacement == rdev) && number < conf->geo.raid_disks && enough(conf, -1)) { @@ -2276,7 +2272,7 @@ static void end_sync_request(struct r10bio *r10_bio) reschedule_retry(r10_bio); else put_buf(r10_bio); - md_done_sync(mddev, s, 1); + md_done_sync(mddev, s); break; } else { struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; @@ -2452,7 +2448,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) done: if (atomic_dec_and_test(&r10_bio->remaining)) { - md_done_sync(mddev, r10_bio->sectors, 1); + md_done_sync(mddev, r10_bio->sectors); put_buf(r10_bio); } } @@ -2535,8 +2531,6 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) pr_notice("md/raid10:%s: recovery aborted due to read error\n", mdname(mddev)); - conf->mirrors[dw].recovery_disabled - = mddev->recovery_disabled; set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -2604,8 +2598,7 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, &rdev->mddev->recovery); } /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); + rdev_set_badblocks(rdev, sector, sectors, 0); return 0; } @@ -2686,7 +2679,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 r10_bio->devs[slot].addr + sect, s, 0)) { - md_error(mddev, rdev); r10_bio->devs[slot].bio = IO_BLOCKED; } @@ -2773,7 +2765,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static bool narrow_write_error(struct r10bio *r10_bio, int i) +static void narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; struct mddev *mddev = r10_bio->mddev; @@ -2790,17 +2782,16 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i) * We currently own a reference to the rdev. */ - int block_sectors; + int block_sectors, lbs = bdev_logical_block_size(rdev->bdev) >> 9; sector_t sector; int sectors; int sect_to_write = r10_bio->sectors; - bool ok = true; if (rdev->badblocks.shift < 0) - return false; + block_sectors = lbs; + else + block_sectors = roundup(1 << rdev->badblocks.shift, lbs); - block_sectors = roundup(1 << rdev->badblocks.shift, - bdev_logical_block_size(rdev->bdev) >> 9); sector = r10_bio->sector; sectors = ((r10_bio->sector + block_sectors) & ~(sector_t)(block_sectors - 1)) @@ -2820,18 +2811,21 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i) choose_data_offset(r10_bio, rdev); wbio->bi_opf = REQ_OP_WRITE; - if (submit_bio_wait(wbio) < 0) - /* Failure! */ - ok = rdev_set_badblocks(rdev, wsector, - sectors, 0) - && ok; + if (submit_bio_wait(wbio) && + !rdev_set_badblocks(rdev, wsector, sectors, 0)) { + /* + * Badblocks set failed, disk marked Faulty. + * No further operations needed. 
+ */ + bio_put(wbio); + break; + } bio_put(wbio); sect_to_write -= sectors; sector += sectors; sectors = block_sectors; } - return ok; } static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) @@ -2891,35 +2885,29 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) if (r10_bio->devs[m].bio == NULL || r10_bio->devs[m].bio->bi_end_io == NULL) continue; - if (!r10_bio->devs[m].bio->bi_status) { + if (!r10_bio->devs[m].bio->bi_status) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, r10_bio->sectors, 0); - } else { - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors, 0)) - md_error(conf->mddev, rdev); - } + else + rdev_set_badblocks(rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); rdev = conf->mirrors[dev].replacement; if (r10_bio->devs[m].repl_bio == NULL || r10_bio->devs[m].repl_bio->bi_end_io == NULL) continue; - if (!r10_bio->devs[m].repl_bio->bi_status) { + if (!r10_bio->devs[m].repl_bio->bi_status) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, r10_bio->sectors, 0); - } else { - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors, 0)) - md_error(conf->mddev, rdev); - } + else + rdev_set_badblocks(rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); } put_buf(r10_bio); } else { @@ -2936,8 +2924,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && bio->bi_status) { fail = true; - if (!narrow_write_error(r10_bio, m)) - md_error(conf->mddev, rdev); + narrow_write_error(r10_bio, m); rdev_dec_pending(rdev, conf->mddev); } bio = r10_bio->devs[m].repl_bio; @@ -3168,11 +3155,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int i; int max_sync; sector_t sync_blocks; - sector_t sectors_skipped = 0; - int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; - int error_disk = -1; /* * Allow skipping a full rebuild for incremental assembly @@ -3193,7 +3177,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; - skipped: if (sector_nr >= max_sector) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; @@ -3245,33 +3228,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; - return sectors_skipped; + return 0; } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); - if (chunks_skipped >= conf->geo.raid_disks) { - pr_err("md/raid10:%s: %s fails\n", mdname(mddev), - test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"); - if (error_disk >= 0 && - !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* - * recovery fails, set mirrors.recovery_disabled, - * device shouldn't be added to there. - */ - conf->mirrors[error_disk].recovery_disabled = - mddev->recovery_disabled; - return 0; - } - /* - * if there has been nothing to do on any drive, - * then there is nothing to do at all. 
- */ - *skipped = 1; - return (max_sector - sector_nr) + sectors_skipped; - } - if (max_sector > mddev->resync_max) max_sector = mddev->resync_max; /* Don't do IO beyond here */ @@ -3354,7 +3316,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* yep, skip the sync_blocks here, but don't assume * that there will never be anything to do here */ - chunks_skipped = -1; continue; } if (mrdev) @@ -3402,7 +3363,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, !test_bit(In_sync, &rdev->flags)) continue; /* This is where we read from */ - any_working = 1; sector = r10_bio->devs[j].addr; if (is_badblock(rdev, sector, max_sync, @@ -3417,6 +3377,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, continue; } } + any_working = 1; bio = r10_bio->devs[0].bio; bio->bi_next = biolist; biolist = bio; @@ -3485,29 +3446,19 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (k = 0; k < conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; - if (mrdev && !test_bit(In_sync, - &mrdev->flags) - && !rdev_set_badblocks( - mrdev, - r10_bio->devs[k].addr, - max_sync, 0)) - any_working = 0; - if (mreplace && - !rdev_set_badblocks( - mreplace, - r10_bio->devs[k].addr, - max_sync, 0)) - any_working = 0; - } - if (!any_working) { - if (!test_and_set_bit(MD_RECOVERY_INTR, - &mddev->recovery)) - pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", - mdname(mddev)); - mirror->recovery_disabled - = mddev->recovery_disabled; - } else { - error_disk = i; + if (mrdev && + !test_bit(In_sync, &mrdev->flags)) + rdev_set_badblocks( + mrdev, + r10_bio->devs[k].addr, + max_sync, 0); + if (mreplace) + rdev_set_badblocks( + mreplace, + r10_bio->devs[k].addr, + max_sync, 0); + pr_warn("md/raid10:%s: cannot recovery sector %llu + %d.\n", + mdname(mddev), r10_bio->devs[k].addr, max_sync); } put_buf(r10_bio); if (rb2) @@ -3548,7 +3499,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rb2->master_bio = NULL; put_buf(rb2); } - goto giveup; + *skipped = 1; + return max_sync; } } else { /* resync. Schedule a read for every block at this virt offset */ @@ -3572,7 +3524,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, &mddev->recovery)) { /* We can skip this block */ *skipped = 1; - return sync_blocks + sectors_skipped; + return sync_blocks; } if (sync_blocks < max_sync) max_sync = sync_blocks; @@ -3664,8 +3616,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mddev); } put_buf(r10_bio); - biolist = NULL; - goto giveup; + *skipped = 1; + return max_sync; } } @@ -3685,7 +3637,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (WARN_ON(!bio_add_page(bio, page, len, 0))) { bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); - goto giveup; + *skipped = 1; + return max_sync; } } nr_sectors += len>>9; @@ -3753,25 +3706,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - if (sectors_skipped) - /* pretend they weren't skipped, it makes - * no important difference in this case - */ - md_done_sync(mddev, sectors_skipped, 1); - - return sectors_skipped + nr_sectors; - giveup: - /* There is nowhere to write, so all non-sync - * drives must be failed or in resync, all drives - * have a bad block, so try the next chunk... 
- */ - if (sector_nr + max_sync < max_sector) - max_sector = sector_nr + max_sync; - - sectors_skipped += (max_sector - sector_nr); - chunks_skipped ++; - sector_nr = max_sector; - goto skipped; + return nr_sectors; } static sector_t @@ -4134,8 +4069,6 @@ static int raid10_run(struct mddev *mddev) disk->replacement->saved_raid_disk < 0) { conf->fullsync = 1; } - - disk->recovery_disabled = mddev->recovery_disabled - 1; } if (mddev->resync_offset != MaxSector) @@ -4913,7 +4846,8 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) if (handle_reshape_read_error(mddev, r10_bio) < 0) { /* Reshape has been aborted */ - md_done_sync(mddev, r10_bio->sectors, 0); + md_done_sync(mddev, r10_bio->sectors); + md_sync_error(mddev); return; } @@ -5071,7 +5005,7 @@ static void end_reshape_request(struct r10bio *r10_bio) { if (!atomic_dec_and_test(&r10_bio->remaining)) return; - md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); + md_done_sync(r10_bio->mddev, r10_bio->sectors); bio_put(r10_bio->master_bio); put_buf(r10_bio); } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index da00a55f7a55..ec79d87fb92f 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -18,11 +18,6 @@ struct raid10_info { struct md_rdev *rdev, *replacement; sector_t head_position; - int recovery_disabled; /* matches - * mddev->recovery_disabled - * when we shouldn't try - * recovering this device. - */ }; struct r10conf { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8dc98f545969..8854e024f311 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -56,7 +56,11 @@ #include "md-bitmap.h" #include "raid5-log.h" -#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_FAILFAST_SUPPORTED) | \ + (1L << MD_FAILLAST_DEV) | \ + (1L << MD_SERIALIZE_POLICY)) + #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE @@ -773,14 +777,14 @@ struct stripe_request_ctx { /* last sector in the request */ sector_t last_sector; + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ + bool do_flush; + /* * bitmap to track stripe sectors that have been added to stripes * add one to account for unaligned requests */ - DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1); - - /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ - bool do_flush; + unsigned long sectors_to_do[]; }; /* @@ -2817,11 +2821,9 @@ static void raid5_end_read_request(struct bio * bi) else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - if (!(set_bad - && test_bit(In_sync, &rdev->flags) - && rdev_set_badblocks( - rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) - md_error(conf->mddev, rdev); + if (!(set_bad && test_bit(In_sync, &rdev->flags))) + rdev_set_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0); } } rdev_dec_pending(rdev, conf->mddev); @@ -2920,7 +2922,6 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) if (has_failed(conf)) { set_bit(MD_BROKEN, &conf->mddev->flags); - conf->recovery_disabled = mddev->recovery_disabled; pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", mdname(mddev), mddev->degraded, conf->raid_disks); @@ -3599,11 +3600,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, else rdev = NULL; if (rdev) { - if (!rdev_set_badblocks( - rdev, - sh->sector, - RAID5_STRIPE_SECTORS(conf), 0)) - md_error(conf->mddev, rdev); + 
rdev_set_badblocks(rdev, + sh->sector, + RAID5_STRIPE_SECTORS(conf), + 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -3723,11 +3723,11 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } - if (abort) - conf->recovery_disabled = - conf->mddev->recovery_disabled; } - md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); + + if (abort) + md_sync_error(conf->mddev); } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -3751,9 +3751,14 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; + struct mddev *mddev = sh->raid_conf->mddev; + bool force_rcw = false; int i; - bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); + if (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW || + (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && + !mddev->bitmap_ops->blocks_synced(mddev, sh->sector))) + force_rcw = true; if (test_bit(R5_LOCKED, &dev->flags) || test_bit(R5_UPTODATE, &dev->flags)) @@ -5157,7 +5162,7 @@ static void handle_stripe(struct stripe_head *sh) if ((s.syncing || s.replacing) && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); @@ -5224,7 +5229,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_reshape); - md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); } if (s.expanding && s.locked == 0 && @@ -5253,9 +5258,8 @@ finish: if (test_and_clear_bit(R5_WriteError, &dev->flags)) { /* We own a safe reference to the rdev */ rdev = conf->disks[i].rdev; - if (!rdev_set_badblocks(rdev, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0)) - md_error(conf->mddev, rdev); + rdev_set_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { @@ -6080,13 +6084,13 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { DEFINE_WAIT_FUNC(wait, woken_wake_function); - bool on_wq; struct r5conf *conf = mddev->private; - sector_t logical_sector; - struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); + struct stripe_request_ctx *ctx; + sector_t logical_sector; enum stripe_result res; int s, stripe_cnt; + bool on_wq; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -6098,11 +6102,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) return true; } /* ret == -EAGAIN, fallback */ - /* - * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, - * we need to flush journal device - */ - ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } md_write_start(mddev, bi); @@ -6125,16 +6124,25 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); - ctx.first_sector = logical_sector; - 
ctx.last_sector = bio_end_sector(bi); bi->bi_next = NULL; - stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO); + memset(ctx, 0, conf->ctx_size); + ctx->first_sector = logical_sector; + ctx->last_sector = bio_end_sector(bi); + /* + * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, + * we need to flush journal device + */ + if (unlikely(bi->bi_opf & REQ_PREFLUSH)) + ctx->do_flush = true; + + stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx->last_sector - logical_sector, RAID5_STRIPE_SECTORS(conf)); - bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); + bitmap_set(ctx->sectors_to_do, 0, stripe_cnt); pr_debug("raid456: %s, logical %llu to %llu\n", __func__, - bi->bi_iter.bi_sector, ctx.last_sector); + bi->bi_iter.bi_sector, ctx->last_sector); /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && @@ -6142,6 +6150,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); + mempool_free(ctx, conf->ctx_pool); return true; } md_account_bio(mddev, &bi); @@ -6160,10 +6169,10 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) add_wait_queue(&conf->wait_for_reshape, &wait); on_wq = true; } - s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); + s = (logical_sector - ctx->first_sector) >> RAID5_STRIPE_SHIFT(conf); while (1) { - res = make_stripe_request(mddev, conf, &ctx, logical_sector, + res = make_stripe_request(mddev, conf, ctx, logical_sector, bi); if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE) break; @@ -6180,9 +6189,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * raid5_activate_delayed() from making progress * and thus deadlocking. 
*/ - if (ctx.batch_last) { - raid5_release_stripe(ctx.batch_last); - ctx.batch_last = NULL; + if (ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; } wait_woken(&wait, TASK_UNINTERRUPTIBLE, @@ -6190,21 +6199,23 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; } - s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s); + s = find_next_bit_wrap(ctx->sectors_to_do, stripe_cnt, s); if (s == stripe_cnt) break; - logical_sector = ctx.first_sector + + logical_sector = ctx->first_sector + (s << RAID5_STRIPE_SHIFT(conf)); } if (unlikely(on_wq)) remove_wait_queue(&conf->wait_for_reshape, &wait); - if (ctx.batch_last) - raid5_release_stripe(ctx.batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); if (rw == WRITE) md_write_end(mddev); + + mempool_free(ctx, conf->ctx_pool); if (res == STRIPE_WAIT_RESHAPE) { md_free_cloned_bio(bi); return false; @@ -7374,6 +7385,9 @@ static void free_conf(struct r5conf *conf) bioset_exit(&conf->bio_split); kfree(conf->stripe_hashtbl); kfree(conf->pending_data); + + mempool_destroy(conf->ctx_pool); + kfree(conf); } @@ -7536,8 +7550,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) } conf->bypass_threshold = BYPASS_THRESHOLD; - conf->recovery_disabled = mddev->recovery_disabled - 1; - conf->raid_disks = mddev->raid_disks; if (mddev->reshape_position == MaxSector) conf->previous_raid_disks = mddev->raid_disks; @@ -7729,6 +7741,25 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded return 0; } +static int raid5_create_ctx_pool(struct r5conf *conf) +{ + struct stripe_request_ctx *ctx; + int size; + + if (mddev_is_dm(conf->mddev)) + size = BITS_TO_LONGS(RAID5_MAX_REQ_STRIPES); + else + size = BITS_TO_LONGS( + queue_max_hw_sectors(conf->mddev->gendisk->queue) >> + RAID5_STRIPE_SHIFT(conf)); + + conf->ctx_size = struct_size(ctx, sectors_to_do, size); + conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, + conf->ctx_size); + + return conf->ctx_pool ? 0 : -ENOMEM; +} + static int raid5_set_limits(struct mddev *mddev) { struct r5conf *conf = mddev->private; @@ -7785,6 +7816,8 @@ static int raid5_set_limits(struct mddev *mddev) * Limit the max sectors based on this. */ lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf); + if ((lim.max_hw_sectors << 9) < lim.io_opt) + lim.max_hw_sectors = lim.io_opt >> 9; /* No restrictions on the number of segments in the request */ lim.max_segments = USHRT_MAX; @@ -8057,7 +8090,12 @@ static int raid5_run(struct mddev *mddev) goto abort; } - if (log_init(conf, journal_dev, raid5_has_ppl(conf))) + ret = raid5_create_ctx_pool(conf); + if (ret) + goto abort; + + ret = log_init(conf, journal_dev, raid5_has_ppl(conf)); + if (ret) goto abort; return 0; @@ -8211,7 +8249,6 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * isn't possible. 
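raid5_make_request() and raid5_create_ctx_pool() above move the per-request context off the stack and into a mempool so that sectors_to_do can become a flexible-array bitmap sized from the queue's max_hw_sectors. A condensed sketch of that pattern (the ctx_pool and ctx_size fields follow the patch, but the structure and helpers here are illustrative)::

    struct req_ctx {
            sector_t        first_sector;
            sector_t        last_sector;
            bool            do_flush;
            unsigned long   sectors_to_do[];  /* one bit per stripe */
    };

    static int create_ctx_pool(struct r5conf *conf, unsigned int max_stripes)
    {
            struct req_ctx *ctx;

            conf->ctx_size = struct_size(ctx, sectors_to_do,
                                         BITS_TO_LONGS(max_stripes));
            conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS,
                                                         conf->ctx_size);
            return conf->ctx_pool ? 0 : -ENOMEM;
    }

    /* Per bio: allocate, zero, and mark every stripe as pending. */
    static struct req_ctx *get_ctx(struct r5conf *conf, unsigned int stripe_cnt)
    {
            struct req_ctx *ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO);

            memset(ctx, 0, conf->ctx_size);
            bitmap_set(ctx->sectors_to_do, 0, stripe_cnt);
            return ctx;
    }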
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && (!p->replacement || p->replacement == rdev) && number < conf->raid_disks) { @@ -8272,8 +8309,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) return 0; } - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; if (rdev->saved_raid_disk < 0 && has_failed(conf)) /* no point adding a device */ diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index eafc6e9ed6ee..110b1c2d0a86 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -640,7 +640,6 @@ struct r5conf { * (fresh device added). * Cleared when a sync completes. */ - int recovery_disabled; /* per cpu variables */ struct raid5_percpu __percpu *percpu; int scribble_disks; @@ -690,6 +689,9 @@ struct r5conf { struct list_head pending_list; int pending_data_cnt; struct r5pending_data *next_pending_data; + + mempool_t *ctx_pool; + int ctx_size; }; #if PAGE_SIZE == DEFAULT_STRIPE_SIZE diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7bf228df6001..19b67cf5d550 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) } static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct nvme_ctrl *ctrl = rq->end_io_data; unsigned long rtt = jiffies - (rq->deadline - rq->timeout); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index a9c097dacad6..fb62633ccbb0 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, - blk_status_t err) + blk_status_t err, + const struct io_comp_batch *iob) { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); @@ -425,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * IOPOLL could potentially complete this request directly, but - * if multiple rings are polling on the same queue, then it's possible - * for one ring to find completions for another ring. Punting the - * completion via task_work will always direct it to the right - * location, rather than potentially complete requests for ringA - * under iopoll invocations from ringB. + * For IOPOLL, check if this completion is happening in the context + * of the same io_ring that owns the request (local context). If so, + * we can complete inline without task_work overhead. Otherwise, we + * must punt to task_work to ensure completion happens in the correct + * ring's context. 
*/ - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + if (blk_rq_is_poll(req) && iob && + iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) { + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0); + } else { + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d86f2565a92c..80df992d1ae8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -290,14 +290,14 @@ struct nvme_iod { u8 flags; u8 nr_descriptors; - unsigned int total_len; + size_t total_len; struct dma_iova_state dma_state; void *descriptors[NVME_MAX_NR_DESCRIPTORS]; struct nvme_dma_vec *dma_vecs; unsigned int nr_dma_vecs; dma_addr_t meta_dma; - unsigned int meta_total_len; + size_t meta_total_len; struct dma_iova_state meta_dma_state; struct nvme_sgl_desc *meta_descriptor; }; @@ -845,11 +845,9 @@ static bool nvme_pci_prp_save_mapping(struct request *req, static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - if (iter->len) return true; - if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter)) + if (!blk_rq_dma_map_iter_next(req, dma_dev, iter)) return false; return nvme_pci_prp_save_mapping(req, dma_dev, iter); } @@ -1024,8 +1022,7 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req, } nvme_pci_sgl_set_data(&sg_list[mapped++], iter); iod->total_len += iter->len; - } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state, - iter)); + } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, iter)); nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); if (unlikely(iter->status)) @@ -1634,7 +1631,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error) +static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; @@ -2877,7 +2875,8 @@ out_unlock: } static enum rq_end_io_ret nvme_del_queue_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; @@ -2887,14 +2886,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req, } static enum rq_end_io_ret nvme_del_cq_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; if (error) set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); - return nvme_del_queue_end(req, error); + return nvme_del_queue_end(req, error, iob); } static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 3da31bb1183e..5e366502fb75 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -298,7 +298,7 @@ static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req) if (status) goto out; - if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) { + if (!req->ns->bdev || !bdev_rot(req->ns->bdev)) { status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -1084,7 +1084,7 @@ static void nvmet_execute_id_cs_indep(struct nvmet_req *req) id->nmic = NVME_NS_NMIC_SHARED; if (req->ns->readonly) id->nsattr |= 
NVME_NS_ATTR_RO; - if (req->ns->bdev && !bdev_nonrot(req->ns->bdev)) + if (req->ns->bdev && bdev_rot(req->ns->bdev)) id->nsfeat |= NVME_NS_ROTATIONAL; /* * We need flush command to flush the file's metadata, diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 67c423a8b052..5d541c2a46a5 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) } static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq, - blk_status_t blk_status) + blk_status_t blk_status, + const struct io_comp_batch *iob) { struct nvmet_req *req = rq->end_io_data; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index b6e8730e049e..147127fb4db9 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2118,7 +2118,8 @@ maybe_retry: } static enum rq_end_io_ret eh_lock_door_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { blk_mq_free_request(req); return RQ_END_IO_NONE; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 57fba34832ad..1a521f9d821a 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ } Sg_device; /* tasklet or soft irq callback */ -static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status); +static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); @@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work) * level when a command is completed (or has failed). 
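The SCSI and target-mode callbacks above are mechanical conversions to the widened rq_end_io_fn prototype, which now also receives the completion batch used for polled I/O. A minimal sketch of a callback under the new typedef (the body is illustrative; the nvme conversion earlier in this series is the only caller shown consulting @iob, and only for polled requests)::

    static enum rq_end_io_ret example_end_io(struct request *rq,
                                             blk_status_t status,
                                             const struct io_comp_batch *iob)
    {
            struct completion *waiting = rq->end_io_data;

            /* Callbacks that do not care about batched/polled completions
             * can ignore @iob entirely. */
            complete(waiting);
            return RQ_END_IO_NONE;
    }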
*/ static enum rq_end_io_ret -sg_rq_end_io(struct request *rq, blk_status_t status) +sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); struct sg_request *srp = rq->end_io_data; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 168f25e4aaa3..8aeaa3b68c25 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) } static enum rq_end_io_ret st_scsi_execute_end(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct st_request *SRpnt = req->end_io_data; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index db4e09042469..823b2665f95b 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev) } static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd); -static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t); +static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t, + const struct io_comp_batch *); /* pscsi_attach_hba(): * @@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) } static enum rq_end_io_ret pscsi_req_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct se_cmd *cmd = req->end_io_data; struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); diff --git a/fs/buffer.c b/fs/buffer.c index fd53b806ab7e..b67791690ed3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -2821,7 +2822,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } - submit_bio(bio); + blk_crypto_submit_bio(bio); } void submit_bh(blk_opf_t opf, struct buffer_head *bh) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 5f5599020e94..6da683ea69dc 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -47,50 +47,71 @@ bool fscrypt_decrypt_bio(struct bio *bio) } EXPORT_SYMBOL(fscrypt_decrypt_bio); +struct fscrypt_zero_done { + atomic_t pending; + blk_status_t status; + struct completion done; +}; + +static void fscrypt_zeroout_range_done(struct fscrypt_zero_done *done) +{ + if (atomic_dec_and_test(&done->pending)) + complete(&done->done); +} + +static void fscrypt_zeroout_range_end_io(struct bio *bio) +{ + struct fscrypt_zero_done *done = bio->bi_private; + + if (bio->bi_status) + cmpxchg(&done->status, 0, bio->bi_status); + fscrypt_zeroout_range_done(done); + bio_put(bio); +} + static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, - pgoff_t lblk, sector_t pblk, + pgoff_t lblk, sector_t sector, unsigned int len) { const unsigned int blockbits = inode->i_blkbits; const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits); - struct bio *bio; - int ret, err = 0; - int num_pages = 0; - - /* This always succeeds since __GFP_DIRECT_RECLAIM is set. 
*/ - bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, - GFP_NOFS); + struct fscrypt_zero_done done = { + .pending = ATOMIC_INIT(1), + .done = COMPLETION_INITIALIZER_ONSTACK(done.done), + }; while (len) { - unsigned int blocks_this_page = min(len, blocks_per_page); - unsigned int bytes_this_page = blocks_this_page << blockbits; + struct bio *bio; + unsigned int n; - if (num_pages == 0) { - fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); - bio->bi_iter.bi_sector = - pblk << (blockbits - SECTOR_SHIFT); - } - ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); - if (WARN_ON_ONCE(ret != bytes_this_page)) { - err = -EIO; - goto out; - } - num_pages++; - len -= blocks_this_page; - lblk += blocks_this_page; - pblk += blocks_this_page; - if (num_pages == BIO_MAX_VECS || !len || - !fscrypt_mergeable_bio(bio, inode, lblk)) { - err = submit_bio_wait(bio); - if (err) - goto out; - bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); - num_pages = 0; + bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, + GFP_NOFS); + bio->bi_iter.bi_sector = sector; + bio->bi_private = &done; + bio->bi_end_io = fscrypt_zeroout_range_end_io; + fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); + + for (n = 0; n < BIO_MAX_VECS; n++) { + unsigned int blocks_this_page = + min(len, blocks_per_page); + unsigned int bytes_this_page = blocks_this_page << blockbits; + + __bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); + len -= blocks_this_page; + lblk += blocks_this_page; + sector += (bytes_this_page >> SECTOR_SHIFT); + if (!len || !fscrypt_mergeable_bio(bio, inode, lblk)) + break; } + + atomic_inc(&done.pending); + blk_crypto_submit_bio(bio); } -out: - bio_put(bio); - return err; + + fscrypt_zeroout_range_done(&done); + + wait_for_completion(&done.done); + return blk_status_to_errno(done.status); } /** @@ -132,7 +153,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return 0; if (fscrypt_inode_uses_inline_crypto(inode)) - return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk, + return fscrypt_zeroout_range_inline_crypt(inode, lblk, sector, len); BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 39abfeec5f36..a8c95eee91b7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -7,6 +7,7 @@ * Written by Theodore Ts'o, 2010. 
*/ +#include #include #include #include @@ -401,7 +402,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { if (io->io_wbc->sync_mode == WB_SYNC_ALL) io->io_bio->bi_opf |= REQ_SYNC; - submit_bio(io->io_bio); + blk_crypto_submit_bio(io->io_bio); } io->io_bio = NULL; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index e7f2350c725b..49a6d36a8dba 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -345,7 +346,7 @@ int ext4_mpage_readpages(struct inode *inode, if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } if (bio == NULL) { @@ -371,14 +372,14 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_folio)) { - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } else last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) @@ -389,7 +390,7 @@ next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) - submit_bio(bio); + blk_crypto_submit_bio(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c30e69392a62..c3dd8a5c8589 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -513,7 +513,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, trace_f2fs_submit_read_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, @@ -522,7 +522,7 @@ static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, WARN_ON_ONCE(is_read_io(bio_op(bio))); trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cd4b1d3c90ab..31a0c1bb41f6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include #include #include #include @@ -5047,7 +5048,7 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, enum temp_type temp = f2fs_get_segment_temp(sbi, type); bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index a06c73eaa890..3f0ef4488361 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -3,6 +3,7 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2025 Christoph Hellwig. 
*/ +#include #include #include #include @@ -75,7 +76,7 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, dio->dops->submit_io(iter, bio, pos); } else { WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); - submit_bio(bio); + blk_crypto_submit_bio(bio); } } diff --git a/include/linux/bio.h b/include/linux/bio.h index 6156f2d66d4a..7a2f3fc5be57 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -256,12 +256,6 @@ static inline struct folio *bio_first_folio_all(struct bio *bio) return page_folio(bio_first_page_all(bio)); } -static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) -{ - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - return &bio->bi_io_vec[bio->bi_vcnt - 1]; -} - /** * struct folio_iter - State for iterating all folios in a bio. * @folio: The current folio we're iterating. NULL after the last folio. diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 58b0c5254a67..f7c3cb4a342f 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -132,6 +132,11 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) return bio->bi_crypt_context; } +static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) +{ + return bio->bi_crypt_context; +} + void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); @@ -169,8 +174,35 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) return false; } +static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) +{ + return NULL; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ +bool __blk_crypto_submit_bio(struct bio *bio); + +/** + * blk_crypto_submit_bio - Submit a bio that may have a crypto context + * @bio: bio to submit + * + * If @bio has no crypto context, or the crypto context attached to @bio is + * supported by the underlying device's inline encryption hardware, just submit + * @bio. + * + * Otherwise, try to perform en/decryption for this bio by falling back to the + * kernel crypto API. For encryption this means submitting newly allocated + * bios for the encrypted payload while holding back the source bio until they + * complete; for reads, the decryption happens in place via a hooked-in + * completion handler.
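+ * + * Illustrative caller (an example, not part of this interface): a filesystem + * that has attached a crypto context routes submission through this helper: + * + *	fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); + *	blk_crypto_submit_bio(bio); + * + * Callers that never attach a crypto context can keep calling submit_bio().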
+ */ +static inline void blk_crypto_submit_bio(struct bio *bio) +{ + if (!bio_has_crypt_ctx(bio) || __blk_crypto_submit_bio(bio)) + submit_bio(bio); +} + int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index a6b84206eb94..c15b1ac62765 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -91,7 +91,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, return bio_integrity_intervals(bi, sectors) * bi->metadata_size; } -static inline bool blk_integrity_rq(struct request *rq) +static inline bool blk_integrity_rq(const struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; } @@ -168,9 +168,9 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, { return 0; } -static inline int blk_integrity_rq(struct request *rq) +static inline bool blk_integrity_rq(const struct request *rq) { - return 0; + return false; } static inline struct bio_vec rq_integrity_vec(struct request *rq) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index cb88fc791fbd..214c181ff2c9 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -28,7 +28,7 @@ struct blk_dma_iter { bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter); bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter); + struct blk_dma_iter *iter); /** * blk_rq_dma_map_coalesce - were all segments coalesced? diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cae9e857aea4..18a2388ba581 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -13,6 +13,7 @@ struct blk_mq_tags; struct blk_flush_queue; +struct io_comp_batch; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 @@ -22,7 +23,8 @@ enum rq_end_io_ret { RQ_END_IO_FREE, }; -typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); +typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t, + const struct io_comp_batch *); /* * request flags */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5dc061d318a4..19a888a2f104 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -232,6 +232,8 @@ struct bio { atomic_t __bi_remaining; + /* The actual vec list, preserved by bio_reset() */ + struct bio_vec *bi_io_vec; struct bvec_iter bi_iter; union { @@ -275,8 +277,6 @@ struct bio { atomic_t __bi_cnt; /* pin count */ - struct bio_vec *bi_io_vec; /* the actual vec list */ - struct bio_set *bi_pool; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72e34acd439c..99ef8cd7673c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -340,14 +340,13 @@ typedef unsigned int __bitwise blk_features_t; /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) +/* atomic writes enabled */ +#define BLK_FEAT_ATOMIC_WRITES ((__force blk_features_t)(1u << 14)) + /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) -/* atomic writes enabled */ -#define BLK_FEAT_ATOMIC_WRITES \ - ((__force blk_features_t)(1u << 16)) - /* * Flags automatically inherited when stacking limits. 
*/ @@ -551,7 +550,8 @@ struct request_queue { /* * queue settings */ - unsigned long nr_requests; /* Max # of requests */ + unsigned int nr_requests; /* Max # of requests */ + unsigned int async_depth; /* Max # of async requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; @@ -681,7 +681,7 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) +#define blk_queue_rot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_passthrough_stat(q) \ ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) @@ -1026,7 +1026,7 @@ extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -/* Helper to convert REQ_OP_XXX to its string format XXX */ +/* Convert a request operation REQ_OP_name into the string "name" */ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); @@ -1044,7 +1044,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_queue; /* this is never NULL */ } -/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ +/* Convert a zone condition BLK_ZONE_COND_name into the string "name" */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); static inline unsigned int bio_zone_no(struct bio *bio) @@ -1462,9 +1462,14 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev) return bdev_limits(bdev)->max_wzeroes_unmap_sectors; } +static inline bool bdev_rot(struct block_device *bdev) +{ + return blk_queue_rot(bdev_get_queue(bdev)); +} + static inline bool bdev_nonrot(struct block_device *bdev) { - return blk_queue_nonrot(bdev_get_queue(bdev)); + return !bdev_rot(bdev); } static inline bool bdev_synchronous(struct block_device *bdev) @@ -1822,6 +1827,7 @@ struct io_comp_batch { struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); + void *poll_ctx; }; static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, diff --git a/include/linux/types.h b/include/linux/types.h index d4437e9c452c..d673747eda8a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -171,6 +171,11 @@ typedef u64 phys_addr_t; typedef u32 phys_addr_t; #endif +struct phys_vec { + phys_addr_t paddr; + size_t len; +}; + typedef phys_addr_t resource_size_t; /* diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index ec77dabba45b..a88876756805 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -55,7 +55,8 @@ _IOWR('u', 0x15, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_QUIESCE_DEV \ _IOWR('u', 0x16, struct ublksrv_ctrl_cmd) - +#define UBLK_U_CMD_TRY_STOP_DEV \ + _IOWR('u', 0x17, struct ublksrv_ctrl_cmd) /* * 64bits are enough now, and it should be easy to extend in case of * running out of feature flags @@ -103,6 +104,30 @@ #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) +/* + * return 0 if the command is run successfully, otherwise failure code + * is returned + */ +#define UBLK_U_IO_PREP_IO_CMDS \ + _IOWR('u', 0x25, struct ublk_batch_io) +/* + * If failure code is returned, nothing in the command 
buffer is handled. + Otherwise, the return value is the number of bytes in the command buffer + * that were actually handled; the number of handled IOs can be derived by + * dividing it by `elem_bytes`. IOs in the remaining bytes are not committed, + * so userspace has to check the return value to deal with partial commits + * correctly. + */ +#define UBLK_U_IO_COMMIT_IO_CMDS \ + _IOWR('u', 0x26, struct ublk_batch_io) + +/* + * Fetch io commands into the provided buffer in multishot style; + * `IORING_URING_CMD_MULTISHOT` is required for this command. + */ +#define UBLK_U_IO_FETCH_IO_CMDS \ + _IOWR('u', 0x27, struct ublk_batch_io) + /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 #define UBLK_IO_RES_NEED_GET_DATA 1 @@ -134,6 +159,10 @@ #define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS) #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) +/* Copy to/from request integrity buffer instead of data buffer */ +#define UBLK_INTEGRITY_FLAG_OFF 62 +#define UBLKSRV_IO_INTEGRITY_FLAG (1ULL << UBLK_INTEGRITY_FLAG_OFF) + /* * ublk server can register data buffers for incoming I/O requests with a sparse * io_uring buffer table. The request buffer can then be used as the data buffer @@ -311,6 +340,36 @@ */ #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) +/* + * Support the following commands for delivering & committing io commands + * in batch. + * + * - UBLK_U_IO_PREP_IO_CMDS + * - UBLK_U_IO_COMMIT_IO_CMDS + * - UBLK_U_IO_FETCH_IO_CMDS + * - UBLK_U_IO_REGISTER_IO_BUF + * - UBLK_U_IO_UNREGISTER_IO_BUF + * + * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and + * UBLK_U_IO_NEED_GET_DATA uring_cmds are not supported when this feature + * is enabled. + */ +#define UBLK_F_BATCH_IO (1ULL << 15) + +/* + * ublk device supports requests with integrity/metadata buffer. + * Requires UBLK_F_USER_COPY. + */ +#define UBLK_F_INTEGRITY (1ULL << 16) + +/* + * The device supports the UBLK_CMD_TRY_STOP_DEV command, which + * allows stopping the device only if there are no openers. + */ +#define UBLK_F_SAFE_STOP_DEV (1ULL << 17) + +/* Disable automatic partition scanning when device is started */ +#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -408,6 +467,8 @@ struct ublksrv_ctrl_dev_info { * passed in. */ #define UBLK_IO_F_NEED_REG_BUF (1U << 17) +/* Request has an integrity data buffer */ +#define UBLK_IO_F_INTEGRITY (1UL << 18) /* * io cmd is described by this structure, and stored in share memory, indexed @@ -525,6 +586,51 @@ struct ublksrv_io_cmd { }; }; +struct ublk_elem_header { + __u16 tag; /* IO tag */ + + /* + * Buffer index for incoming io command, only valid iff + * UBLK_F_AUTO_BUF_REG is set + */ + __u16 buf_index; + __s32 result; /* I/O completion result (commit only) */ +}; + +/* + * uring_cmd buffer structure for batch commands + * + * The buffer includes multiple elements, the number of which is specified by + * `nr_elem`.
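For example (illustrative): committing nr_elem = 8 elements with + * elem_bytes = 16 uses 128 bytes of buffer; a COMMIT return value of 48 + * means only the first 3 elements were handled, and the remaining IOs must + * be committed again. + *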
Each element buffer is organized in the following order: + * + * struct ublk_elem_buffer { + * // Mandatory fields (8 bytes) + * struct ublk_elem_header header; + * + * // Optional fields (8 bytes each, included based on flags) + * + * // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data + * // between ublk request and ublk server buffer + * __u64 buf_addr; + * + * // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA) + * __u64 zone_lba; + * } + * + * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS` + */ +struct ublk_batch_io { + __u16 q_id; +#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0) +#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1) +#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2) + __u16 flags; + __u16 nr_elem; + __u8 elem_bytes; + __u8 reserved; + __u64 reserved2; +}; + struct ublk_param_basic { #define UBLK_ATTR_READ_ONLY (1 << 0) #define UBLK_ATTR_ROTATIONAL (1 << 1) @@ -600,6 +706,17 @@ struct ublk_param_segment { __u8 pad[2]; }; +struct ublk_param_integrity { + __u32 flags; /* LBMD_PI_CAP_* from linux/fs.h */ + __u16 max_integrity_segments; /* 0 means no limit */ + __u8 interval_exp; + __u8 metadata_size; /* UBLK_PARAM_TYPE_INTEGRITY requires nonzero */ + __u8 pi_offset; + __u8 csum_type; /* LBMD_PI_CSUM_* from linux/fs.h */ + __u8 tag_size; + __u8 pad[5]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -614,6 +731,7 @@ struct ublk_params { #define UBLK_PARAM_TYPE_ZONED (1 << 3) #define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) #define UBLK_PARAM_TYPE_SEGMENT (1 << 5) +#define UBLK_PARAM_TYPE_INTEGRITY (1 << 6) /* requires UBLK_F_INTEGRITY */ __u32 types; /* types of parameter included */ struct ublk_param_basic basic; @@ -622,6 +740,7 @@ struct ublk_params { struct ublk_param_zoned zoned; struct ublk_param_dma_align dma; struct ublk_param_segment seg; + struct ublk_param_integrity integrity; }; #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 73b38888a304..95ce553fff8d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1055,17 +1055,6 @@ static int io_import_kbuf(int ddir, struct iov_iter *iter, iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); iov_iter_advance(iter, offset); - - if (count < imu->len) { - const struct bio_vec *bvec = iter->bvec; - - len += iter->iov_offset; - while (len > bvec->bv_len) { - len -= bvec->bv_len; - bvec++; - } - iter->nr_segs = 1 + bvec - iter->bvec; - } return 0; } diff --git a/io_uring/rw.c b/io_uring/rw.c index 3190c41bfdc9..d10386f56d49 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1328,6 +1328,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) struct io_kiocb *req, *tmp; int nr_events = 0; + /* + * Store the polling io_ring_ctx so drivers can detect if they're + * completing a request in the same ring context that's polling. + */ + iob.poll_ctx = ctx; + /* * Only spin for completions if we don't have multiple devices hanging * off our complete list. 
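A minimal sketch (illustrative, not part of this patch) of how a driver-side end_io callback could consume the new poll_ctx field together with the updated rq_end_io_fn signature; the driver name and the use of rq->end_io_data to stash the submitting ring context are hypothetical:

	static enum rq_end_io_ret my_drv_end_io(struct request *rq, blk_status_t err,
						const struct io_comp_batch *iob)
	{
		/* completed inside the same io_uring context that is polling? */
		if (iob && iob->poll_ctx == rq->end_io_data)
			return RQ_END_IO_FREE;	/* hypothetical polled fast path */
		return RQ_END_IO_NONE;
	}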
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d031c8d80be4..c4db5c2e7103 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -793,7 +793,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return PTR_ERR(bt); } blk_trace_setup_finalize(q, name, 1, bt, &buts2); - strcpy(buts.name, buts2.name); + strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE); mutex_unlock(&q->debugfs_mutex); if (copy_to_user(arg, &buts, sizeof(buts))) { diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 1ce815c8cdab..c8b0ecb17082 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -107,8 +107,7 @@ impl GenDiskBuilder { drop(unsafe { T::QueueData::from_foreign(data) }); }); - // SAFETY: `bindings::queue_limits` contain only fields that are valid when zeroed. - let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() }; + let mut lim: bindings::queue_limits = pin_init::zeroed(); lim.logical_block_size = self.logical_block_size; lim.physical_block_size = self.physical_block_size; diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs index c3cf56d52bee..dae9df408a86 100644 --- a/rust/kernel/block/mq/tag_set.rs +++ b/rust/kernel/block/mq/tag_set.rs @@ -38,9 +38,7 @@ impl TagSet { num_tags: u32, num_maps: u32, ) -> impl PinInit { - // SAFETY: `blk_mq_tag_set` only contains integers and pointers, which - // all are allowed to be 0. - let tag_set: bindings::blk_mq_tag_set = unsafe { core::mem::zeroed() }; + let tag_set: bindings::blk_mq_tag_set = pin_init::zeroed(); let tag_set: Result<_> = core::mem::size_of::() .try_into() .map(|cmd_size| { diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore index 8b2871ea7751..e17bd28f27e0 100644 --- a/tools/testing/selftests/ublk/.gitignore +++ b/tools/testing/selftests/ublk/.gitignore @@ -1,3 +1,5 @@ -kublk -/tools +# SPDX-License-Identifier: GPL-2.0 *-verify.state +/tools +kublk +metadata_size diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 06ba6fde098d..8ac2d4a682a1 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -7,22 +7,21 @@ endif LDLIBS += -lpthread -lm -luring -TEST_PROGS := test_generic_01.sh -TEST_PROGS += test_generic_02.sh +TEST_PROGS := test_generic_02.sh TEST_PROGS += test_generic_03.sh -TEST_PROGS += test_generic_04.sh -TEST_PROGS += test_generic_05.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh TEST_PROGS += test_generic_08.sh TEST_PROGS += test_generic_09.sh TEST_PROGS += test_generic_10.sh -TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh -TEST_PROGS += test_generic_14.sh -TEST_PROGS += test_generic_15.sh +TEST_PROGS += test_generic_16.sh + +TEST_PROGS += test_batch_01.sh +TEST_PROGS += test_batch_02.sh +TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh @@ -34,6 +33,14 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh + +TEST_PROGS += test_integrity_01.sh +TEST_PROGS += test_integrity_02.sh + +TEST_PROGS += test_recover_01.sh +TEST_PROGS += test_recover_02.sh +TEST_PROGS += test_recover_03.sh +TEST_PROGS += test_recover_04.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh @@ -41,6 +48,9 @@ TEST_PROGS += test_stripe_04.sh TEST_PROGS += 
test_stripe_05.sh TEST_PROGS += test_stripe_06.sh +TEST_PROGS += test_part_01.sh +TEST_PROGS += test_part_02.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh @@ -48,13 +58,55 @@ TEST_PROGS += test_stress_04.sh TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_PROGS += test_stress_08.sh +TEST_PROGS += test_stress_09.sh -TEST_GEN_PROGS_EXTENDED = kublk +TEST_FILES := settings + +TEST_GEN_PROGS_EXTENDED = kublk metadata_size +STANDALONE_UTILS := metadata_size.c LOCAL_HDRS += $(wildcard *.h) include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c) +$(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh + +# Test groups for running subsets of tests +# JOBS=1 (default): sequential with kselftest TAP output +# JOBS>1: parallel execution with xargs -P +# Usage: make run_null JOBS=4 +JOBS ?= 1 +export JOBS + +# Auto-detect test groups from TEST_PROGS (test__.sh -> group) +TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ + sed 's/test_\([^_]*\)_.*/\1/' | sort -u) + +# Template for group test targets +# $(1) = group name (e.g., null, generic, stress) +define RUN_GROUP +run_$(1): all + @if [ $$(JOBS) -gt 1 ]; then \ + echo $$(filter test_$(1)_%.sh,$$(TEST_PROGS)) | tr ' ' '\n' | \ + xargs -P $$(JOBS) -n1 sh -c './"$$$$0"' || true; \ + else \ + $$(call RUN_TESTS, $$(filter test_$(1)_%.sh,$$(TEST_PROGS))); \ + fi +.PHONY: run_$(1) +endef + +# Generate targets for each discovered test group +$(foreach group,$(TEST_GROUPS),$(eval $(call RUN_GROUP,$(group)))) + +# Run all tests (parallel when JOBS>1) +run_all: all + @if [ $(JOBS) -gt 1 ]; then \ + echo $(TEST_PROGS) | tr ' ' '\n' | \ + xargs -P $(JOBS) -n1 sh -c './"$$0"' || true; \ + else \ + $(call RUN_TESTS, $(TEST_PROGS)); \ + fi +.PHONY: run_all diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c new file mode 100644 index 000000000000..a54025b00917 --- /dev/null +++ b/tools/testing/selftests/ublk/batch.c @@ -0,0 +1,607 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: UBLK_F_BATCH_IO buffer management + */ + +#include "kublk.h" + +static inline void *ublk_get_commit_buf(struct ublk_thread *t, + unsigned short buf_idx) +{ + unsigned idx; + + if (buf_idx < t->commit_buf_start || + buf_idx >= t->commit_buf_start + t->nr_commit_buf) + return NULL; + idx = buf_idx - t->commit_buf_start; + return t->commit_buf + idx * t->commit_buf_size; +} + +/* + * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS + * + * Buffer index is returned. 
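+ * UBLKS_T_COMMIT_BUF_INV_IDX is returned when no free buffer is available.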
+ */ +static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t) +{ + int idx = allocator_get(&t->commit_buf_alloc); + + if (idx >= 0) + return idx + t->commit_buf_start; + return UBLKS_T_COMMIT_BUF_INV_IDX; +} + +/* + * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or + * UBLK_U_IO_COMMIT_IO_CMDS + */ +static inline void ublk_free_commit_buf(struct ublk_thread *t, + unsigned short i) +{ + unsigned short idx = i - t->commit_buf_start; + + ublk_assert(idx < t->nr_commit_buf); + ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0); + + allocator_put(&t->commit_buf_alloc, idx); +} + +static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev) +{ + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY | + UBLK_F_AUTO_BUF_REG)) + return 8; + + /* one extra 8 bytes for carrying the buffer address */ + return 16; +} + +static unsigned ublk_commit_buf_size(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + unsigned elem_size = ublk_commit_elem_buf_size(dev); + unsigned int total = elem_size * dev->dev_info.queue_depth; + unsigned int page_sz = getpagesize(); + + return round_up(total, page_sz); +} + +static void free_batch_commit_buf(struct ublk_thread *t) +{ + if (t->commit_buf) { + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + + munlock(t->commit_buf, total); + free(t->commit_buf); + } + allocator_deinit(&t->commit_buf_alloc); + free(t->commit); +} + +static int alloc_batch_commit_buf(struct ublk_thread *t) +{ + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + unsigned int page_sz = getpagesize(); + void *buf = NULL; + int i, ret, j = 0; + + t->commit = calloc(t->nr_queues, sizeof(*t->commit)); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) + t->commit[j++].q_id = i; + } + + allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); + + t->commit_buf = NULL; + ret = posix_memalign(&buf, page_sz, total); + if (ret || !buf) + goto fail; + + t->commit_buf = buf; + + /* lock commit buffer pages for fast access */ + if (mlock(t->commit_buf, total)) + ublk_err("%s: can't lock commit buffer %s\n", __func__, + strerror(errno)); + + return 0; + +fail: + free_batch_commit_buf(t); + return ret; +} + +static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t) +{ + int i; + int ret = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) + ret += !!t->q_map[i]; + + return ret; +} + +void ublk_batch_prepare(struct ublk_thread *t) +{ + /* + * We only handle a single device in this thread context. + * + * All queues have the same feature flags, so use queue 0's to + * calculate the uring_cmd flags. + * + * This is not elegant, but it works so far.
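+ * + * Example (illustrative): a thread mapped to two queues gets + * nr_commit_buf = 4 (two per queue); commit buffer indices start at the + * current t->nr_bufs, right after the thread's sparse IO buffers.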
+ */ + struct ublk_queue *q = &t->dev->q[0]; + + /* cache nr_queues because we don't support dynamic load-balance yet */ + t->nr_queues = ublk_thread_nr_queues(t); + + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); + t->commit_buf_size = ublk_commit_buf_size(t); + t->commit_buf_start = t->nr_bufs; + t->nr_commit_buf = 2 * t->nr_queues; + t->nr_bufs += t->nr_commit_buf; + + t->cmd_flags = 0; + if (ublk_queue_use_auto_zc(q)) { + if (ublk_queue_auto_zc_fallback(q)) + t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK; + } else if (!ublk_queue_no_buf(q)) + t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR; + + t->state |= UBLKS_T_BATCH_IO; + + ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n", + __func__, t->idx, + t->nr_commit_buf, t->commit_buf_size, + t->nr_bufs); +} + +static void free_batch_fetch_buf(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_fetch_bufs; i++) { + io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); + munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); + free(t->fetch[i].fetch_buf); + } + free(t->fetch); +} + +static int alloc_batch_fetch_buf(struct ublk_thread *t) +{ + /* page aligned fetch buffer, and it is mlocked for speedup delivery */ + unsigned pg_sz = getpagesize(); + unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz); + int ret; + int i = 0; + + /* double fetch buffer for each queue */ + t->nr_fetch_bufs = t->nr_queues * 2; + t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch)); + + /* allocate one buffer for each queue */ + for (i = 0; i < t->nr_fetch_bufs; i++) { + t->fetch[i].fetch_buf_size = buf_size; + + if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, + t->fetch[i].fetch_buf_size)) + return -ENOMEM; + + /* lock fetch buffer page for fast fetching */ + if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size)) + ublk_err("%s: can't lock fetch buffer %s\n", __func__, + strerror(errno)); + t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1, + i, IOU_PBUF_RING_INC, &ret); + if (!t->fetch[i].br) { + ublk_err("Buffer ring register failed %d\n", ret); + return ret; + } + } + + return 0; +} + +int ublk_batch_alloc_buf(struct ublk_thread *t) +{ + int ret; + + ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES); + + ret = alloc_batch_commit_buf(t); + if (ret) + return ret; + return alloc_batch_fetch_buf(t); +} + +void ublk_batch_free_buf(struct ublk_thread *t) +{ + free_batch_commit_buf(t); + free_batch_fetch_buf(t); +} + +static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, + struct io_uring_sqe *sqe, unsigned op, + unsigned short elem_bytes, + unsigned short nr_elem, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + __u64 user_data; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + ublk_set_sqe_cmd_op(sqe, op); + + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + + cmd->q_id = q_id; + cmd->flags = 0; + cmd->reserved = 0; + cmd->elem_bytes = elem_bytes; + cmd->nr_elem = nr_elem; + + user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0); + io_uring_sqe_set_data64(sqe, user_data); + + t->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx " + "nr_elem %u elem_bytes %u buf_size %u buf_idx %d " + "cmd_inflight %u\n", + __func__, t->idx, q_id, op, user_data, + cmd->nr_elem, cmd->elem_bytes, + nr_elem * elem_bytes, buf_idx, t->cmd_inflight); +} + +static void ublk_setup_commit_sqe(struct ublk_thread *t, + struct io_uring_sqe *sqe, + unsigned 
short buf_idx) +{ + struct ublk_batch_io *cmd; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + /* Use plain user buffer instead of fixed buffer */ + cmd->flags |= t->cmd_flags; +} + +static void ublk_batch_queue_fetch(struct ublk_thread *t, + struct ublk_queue *q, + unsigned short buf_idx) +{ + unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2; + struct io_uring_sqe *sqe; + + io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf, + t->fetch[buf_idx].fetch_buf_size, + 0, 0, 0); + io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem, + buf_idx); + + sqe->rw_flags= IORING_URING_CMD_MULTISHOT; + sqe->buf_group = buf_idx; + sqe->flags |= IOSQE_BUFFER_SELECT; + + t->fetch[buf_idx].fetch_buf_off = 0; +} + +void ublk_batch_start_fetch(struct ublk_thread *t) +{ + int i; + int j = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) { + struct ublk_queue *q = &t->dev->q[i]; + + /* submit two fetch commands for each queue */ + ublk_batch_queue_fetch(t, q, j++); + ublk_batch_queue_fetch(t, q, j++); + } + } +} + +static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + unsigned start = t->fetch[buf_idx].fetch_buf_off; + unsigned end = start + cqe->res; + void *buf = t->fetch[buf_idx].fetch_buf; + int i; + + if (cqe->res < 0) + return buf_idx; + + if ((end - start) / 2 > q->q_depth) { + ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res); + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + ublk_err("%u ", tag); + } + ublk_err("\n"); + } + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + if (tag >= q->q_depth) + ublk_err("%s: bad tag %u\n", __func__, tag); + + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } + t->fetch[buf_idx].fetch_buf_off = end; + return buf_idx; +} + +static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + unsigned short nr_elem = q->q_depth; + unsigned short buf_idx = ublk_alloc_commit_buf(t); + struct io_uring_sqe *sqe; + void *buf; + int i; + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_assert(nr_elem == q->q_depth); + buf = ublk_get_commit_buf(t, buf_idx); + for (i = 0; i < nr_elem; i++) { + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)( + buf + i * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[i]; + + elem->tag = i; + elem->result = 0; + + if (ublk_queue_use_auto_zc(q)) + elem->buf_index = ublk_batch_io_buf_idx(t, q, i); + else if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64)io->buf_addr; + } + + sqe->addr = (__u64)buf; + sqe->len = t->commit_buf_elem_size * nr_elem; + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); + return 0; +} + +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + int ret = 0; + + pthread_spin_lock(&q->lock); + if (q->flags & UBLKS_Q_PREPARED) + goto unlock; + ret = __ublk_batch_queue_prep_io_cmds(t, q); + if (!ret) + q->flags |= UBLKS_Q_PREPARED; +unlock: + pthread_spin_unlock(&q->lock); + + return ret; +} + +static void ublk_batch_compl_commit_cmd(struct 
ublk_thread *t, + const struct io_uring_cqe *cqe, + unsigned op) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) + ublk_assert(cqe->res == 0); + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + int nr_elem = user_data_to_tgt_data(cqe->user_data); + + ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem); + } else + ublk_assert(0); + + ublk_free_commit_buf(t, buf_idx); +} + +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_queue *q; + unsigned buf_idx; + unsigned q_id; + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || + op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + t->cmd_inflight--; + ublk_batch_compl_commit_cmd(t, cqe, op); + return; + } + + /* FETCH command is per queue */ + q_id = user_data_to_q_id(cqe->user_data); + q = &t->dev->q[q_id]; + buf_idx = ublk_compl_batch_fetch(t, q, cqe); + + if (cqe->res < 0 && cqe->res != -ENOBUFS) { + t->cmd_inflight--; + t->state |= UBLKS_T_STOPPING; + } else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) { + t->cmd_inflight--; + ublk_batch_queue_fetch(t, q, buf_idx); + } +} + +static void __ublk_batch_commit_io_cmds(struct ublk_thread *t, + struct batch_commit_buf *cb) +{ + struct io_uring_sqe *sqe; + unsigned short buf_idx; + unsigned short nr_elem = cb->done; + + /* nothing to commit */ + if (!nr_elem) { + ublk_free_commit_buf(t, cb->buf_idx); + return; + } + + ublk_io_alloc_sqes(t, &sqe, 1); + buf_idx = cb->buf_idx; + sqe->addr = (__u64)cb->elem; + sqe->len = nr_elem * t->commit_buf_elem_size; + + /* commit isn't per-queue command */ + ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); +} + +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) { + struct batch_commit_buf *cb = &t->commit[i]; + + if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX) + __ublk_batch_commit_io_cmds(t, cb); + } + +} + +static void __ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb, + unsigned short buf_idx) +{ + /* so far only support 1:1 queue/thread mapping */ + cb->buf_idx = buf_idx; + cb->elem = ublk_get_commit_buf(t, buf_idx); + cb->done = 0; + cb->count = t->commit_buf_size / + t->commit_buf_elem_size; +} + +/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */ +static void ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb) +{ + unsigned short buf_idx = ublk_alloc_commit_buf(t); + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + ublk_assert(!ublk_batch_commit_prepared(cb)); + + __ublk_batch_init_commit(t, cb, buf_idx); +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) + t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX; +} + +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + unsigned q_t_idx = ublk_queue_idx_in_thread(t, q); + struct batch_commit_buf *cb = &t->commit[q_t_idx]; + struct ublk_batch_elem *elem; + struct ublk_io *io = &q->ios[tag]; + + if (!ublk_batch_commit_prepared(cb)) + ublk_batch_init_commit(t, cb); + + ublk_assert(q->q_id == cb->q_id); + + elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size); + elem->tag = tag; + elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); + elem->result = res; + + if 
(!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64) (uintptr_t) io->buf_addr; + + cb->done += 1; + ublk_assert(cb->done <= cb->count); +} + +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues) +{ + int i, j; + + /* + * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations. + * + * This algorithm distributes queues across threads (and threads across queues) + * in a balanced round-robin fashion to ensure even load distribution. + * + * Examples: + * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3] + * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1] + * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping) + * + * Phase 1: Mark which queues each thread handles (boolean mapping) + */ + for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) { + q_thread_map[j % nthreads][i % queues] = 1; + } + + /* + * Phase 2: Convert boolean mapping to sequential indices within each thread. + * + * Transform from: q_thread_map[thread][queue] = 1 (handles queue) + * To: q_thread_map[thread][queue] = N (queue index within thread) + * + * This allows each thread to know the local index of each queue it handles, + * which is essential for buffer allocation and management. For example: + * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2 + * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2 + */ + for (j = 0; j < nthreads; j++) { + unsigned char seq = 1; + + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + q_thread_map[j][i] = seq++; + } + } + +#if 0 + for (j = 0; j < nthreads; j++) { + printf("thread %0d: ", j); + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + printf("%03u ", i); + } + printf("\n"); + } + printf("\n"); + for (j = 0; j < nthreads; j++) { + for (i = 0; i < queues; i++) { + printf("%03u ", q_thread_map[j][i]); + } + printf("\n"); + } +#endif +} diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index 01580a6f8519..530f9877c9dd 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -12,11 +12,11 @@ void backing_file_tgt_deinit(struct ublk_dev *dev) } } -int backing_file_tgt_init(struct ublk_dev *dev) +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; - assert(dev->nr_fds == 1); + ublk_assert(dev->nr_fds == 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) { char *file = dev->tgt.backing_file[i]; @@ -25,7 +25,7 @@ int backing_file_tgt_init(struct ublk_dev *dev) ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); - fd = open(file, O_RDWR | O_DIRECT); + fd = open(file, O_RDWR | (i < nr_direct ? 
O_DIRECT : 0)); if (fd < 0) { ublk_err("%s: backing file %s can't be opened: %s\n", __func__, file, strerror(errno)); diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index b227bd78b252..3b897f69c014 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -33,6 +33,7 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, .dev_sectors = dev_size >> 9, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); return 0; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 269d5f124e06..228af2580ac6 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; - assert(0); + ublk_assert(0); } static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, @@ -35,8 +35,23 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, unsigned auto_zc = ublk_queue_use_auto_zc(q); enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc); struct ublk_io *io = ublk_get_io(q, tag); + __u64 offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + unsigned short buf_index = ublk_io_buf_idx(t, q, tag); + + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { + ublk_io_alloc_sqes(t, sqe, 1); + /* Use second backing file for integrity data */ + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 2), + io->integrity_buf, + ublk_integrity_len(q, len), + ublk_integrity_len(q, offset)); + sqe[0]->flags = IOSQE_FIXED_FILE; + /* tgt_data = 1 indicates integrity I/O */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 1, q->q_id, 1); + } if (!zc || auto_zc) { ublk_io_alloc_sqes(t, sqe, 1); @@ -45,34 +60,34 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, addr, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); if (auto_zc) - sqe[0]->buf_index = tag; + sqe[0]->buf_index = buf_index; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - return 1; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 1; } ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, - iod->nr_sectors << 9, - iod->start_sector << 9); - sqe[1]->buf_index = tag; + len, + offset); + sqe[1]->buf_index = buf_index; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); - return 2; + return 
!!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; } static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag) @@ -119,12 +134,17 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); - if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { - if (!io->result) - io->result = cqe->res; - if (cqe->res < 0) - ublk_err("%s: io failed op %x user_data %lx\n", - __func__, op, cqe->user_data); + if (cqe->res < 0) { + io->result = cqe->res; + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } else if (op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + __s32 data_len = user_data_to_tgt_data(cqe->user_data) + ? ublk_integrity_data_len(q, cqe->res) + : cqe->res; + + if (!io->result || data_len < io->result) + io->result = data_len; } /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ @@ -135,9 +155,30 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, ublk_complete_io(t, q, tag, io->result); } +static int ublk_loop_memset_file(int fd, __u8 byte, size_t len) +{ + off_t offset = 0; + __u8 buf[4096]; + + memset(buf, byte, sizeof(buf)); + while (len) { + int ret = pwrite(fd, buf, min(len, sizeof(buf)), offset); + + if (ret < 0) + return -errno; + if (!ret) + return -EIO; + + len -= ret; + offset += ret; + } + return 0; +} + static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { unsigned long long bytes; + unsigned long blocks; int ret; struct ublk_params p = { .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, @@ -154,19 +195,39 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) }, }; + ublk_set_integrity_params(ctx, &p); if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } - ret = backing_file_tgt_init(dev); + /* Use O_DIRECT only for data file */ + ret = backing_file_tgt_init(dev, 1); if (ret) return ret; - if (dev->tgt.nr_backing_files != 1) + /* Expect a second file for integrity data */ + if (dev->tgt.nr_backing_files != 1 + !!ctx->metadata_size) return -EINVAL; - bytes = dev->tgt.backing_file_size[0]; + blocks = dev->tgt.backing_file_size[0] >> p.basic.logical_bs_shift; + if (ctx->metadata_size) { + unsigned long metadata_blocks = + dev->tgt.backing_file_size[1] / ctx->metadata_size; + unsigned long integrity_len; + + /* Ensure both data and integrity data fit in backing files */ + blocks = min(blocks, metadata_blocks); + integrity_len = blocks * ctx->metadata_size; + /* + * Initialize PI app tag and ref tag to 0xFF + * to disable bio-integrity-auto checks + */ + ret = ublk_loop_memset_file(dev->fds[2], 0xFF, integrity_len); + if (ret) + return ret; + } + bytes = blocks << p.basic.logical_bs_shift; dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; dev->tgt.params = p; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f197ad9cc262..e1c3b3c55e56 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -3,6 +3,7 @@ * Description: uring_cmd based ublk */ +#include #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -107,6 +108,15 @@ static int ublk_ctrl_stop_dev(struct ublk_dev *dev) return __ublk_ctrl_cmd(dev, &data); } +static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_TRY_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, 
&data); +} + static int ublk_ctrl_start_dev(struct ublk_dev *dev, int daemon_pid) { @@ -415,14 +425,18 @@ static void ublk_queue_deinit(struct ublk_queue *q) if (q->io_cmd_buf) munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); - for (i = 0; i < nr_ios; i++) + for (i = 0; i < nr_ios; i++) { free(q->ios[i].buf_addr); + free(q->ios[i].integrity_buf); + } } static void ublk_thread_deinit(struct ublk_thread *t) { io_uring_unregister_buffers(&t->ring); + ublk_batch_free_buf(t); + io_uring_unregister_ring_fd(&t->ring); if (t->ring.ring_fd > 0) { @@ -432,19 +446,22 @@ static void ublk_thread_deinit(struct ublk_thread *t) } } -static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) +static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, + __u8 metadata_size) { struct ublk_dev *dev = q->dev; int depth = dev->dev_info.queue_depth; int i; - int cmd_buf_size, io_buf_size; + int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; + pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); q->tgt_ops = dev->tgt.ops; q->flags = 0; q->q_depth = depth; q->flags = dev->dev_info.flags; q->flags |= extra_flags; + q->metadata_size = metadata_size; /* Cache fd in queue for fast path access */ q->ublk_fd = dev->fds[0]; @@ -460,11 +477,23 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) } io_buf_size = dev->dev_info.max_io_buf_bytes; + integrity_size = ublk_integrity_len(q, io_buf_size); for (i = 0; i < q->q_depth; i++) { q->ios[i].buf_addr = NULL; q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; q->ios[i].tag = i; + if (integrity_size) { + q->ios[i].integrity_buf = malloc(integrity_size); + if (!q->ios[i].integrity_buf) { + ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n", + dev->dev_info.dev_id, q->q_id, i, + integrity_size); + goto fail; + } + } + + if (ublk_queue_no_buf(q)) continue; @@ -491,6 +520,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; + /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ + if (ublk_dev_batch_io(dev)) + cq_depth += dev->dev_info.queue_depth * 2; + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER | @@ -505,15 +538,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); - ret = io_uring_register_buffers_sparse( - &t->ring, max_nr_ios_per_thread); + + t->nr_bufs = max_nr_ios_per_thread; + } else { + t->nr_bufs = 0; + } + + if (ublk_dev_batch_io(dev)) + ublk_batch_prepare(t); + + if (t->nr_bufs) { + ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); if (ret) { - ublk_err("ublk dev %d thread %d register spare buffers failed %d", + ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", dev->dev_info.dev_id, t->idx, ret); goto fail; } } + if (ublk_dev_batch_io(dev)) { + ret = ublk_batch_alloc_buf(t); + if (ret) { + ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", + dev->dev_info.dev_id, t->idx, ret); + goto fail; + } + } + io_uring_register_ring_fd(&t->ring); if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { @@ -579,16 +630,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -static void ublk_set_auto_buf_reg(const struct 
ublk_queue *q, +static void ublk_set_auto_buf_reg(const struct ublk_thread *t, + const struct ublk_queue *q, struct io_uring_sqe *sqe, unsigned short tag) { struct ublk_auto_buf_reg buf = {}; if (q->tgt_ops->buf_index) - buf.index = q->tgt_ops->buf_index(q, tag); + buf.index = q->tgt_ops->buf_index(t, q, tag); else - buf.index = q->ios[tag].buf_index; + buf.index = ublk_io_buf_idx(t, q, tag); if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; @@ -607,13 +659,13 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) __u8 ublk_op = ublksrv_get_op(iod); __u32 len = iod->nr_sectors << 9; void *addr = io->buf_addr; + ssize_t copied; if (ublk_op != match_ublk_op) return; while (len) { __u32 copy_len = min(len, UBLK_USER_COPY_LEN); - ssize_t copied; if (ublk_op == UBLK_IO_OP_WRITE) copied = pread(q->ublk_fd, addr, copy_len, off); @@ -626,6 +678,20 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) off += copy_len; len -= copy_len; } + + if (!(iod->op_flags & UBLK_IO_F_INTEGRITY)) + return; + + len = ublk_integrity_len(q, iod->nr_sectors << 9); + off = ublk_user_copy_offset(q->q_id, io->tag); + off |= UBLKSRV_IO_INTEGRITY_FLAG; + if (ublk_op == UBLK_IO_OP_WRITE) + copied = pread(q->ublk_fd, io->integrity_buf, len, off); + else if (ublk_op == UBLK_IO_OP_READ) + copied = pwrite(q->ublk_fd, io->integrity_buf, len, off); + else + assert(0); + assert(copied == (ssize_t)len); } int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) @@ -690,7 +756,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) cmd->addr = 0; if (ublk_queue_use_auto_zc(q)) - ublk_set_auto_buf_reg(q, sqe[0], io->tag); + ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); @@ -779,13 +845,15 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, unsigned tag = user_data_to_tag(cqe->user_data); struct ublk_io *io = &q->ios[tag]; + t->cmd_inflight--; + if (!fetch) { t->state |= UBLKS_T_STOPPING; io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; } if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); + ublk_assert(tag < q->q_depth); if (ublk_queue_use_user_copy(q)) ublk_user_copy(io, UBLK_IO_OP_WRITE); @@ -813,28 +881,30 @@ static void ublk_handle_cqe(struct ublk_thread *t, { struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); - struct ublk_queue *q = &dev->q[q_id]; unsigned cmd_op = user_data_to_op(cqe->user_data); - if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->flags); + if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) + ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, + cqe->res, cqe->user_data, t->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), - cmd_op, is_target_io(cqe->user_data), + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " + "data %lx target %d/%d) stopping %d\n", + __func__, cqe->res, t->idx, q_id, + user_data_to_tag(cqe->user_data), + cmd_op, cqe->user_data, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(t, q, cqe); + ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); 
return; } - t->cmd_inflight--; - - ublk_handle_uring_cmd(t, q, cqe); + if (ublk_thread_batch_io(t)) + ublk_batch_compl_cmd(t, cqe); + else + ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -866,7 +936,13 @@ static int ublk_process_io(struct ublk_thread *t) return -ENODEV; ret = io_uring_submit_and_wait(&t->ring, 1); - reapped = ublk_reap_events_uring(t); + if (ublk_thread_batch_io(t)) { + ublk_batch_prep_commit(t); + reapped = ublk_reap_events_uring(t); + ublk_batch_commit_io_cmds(t); + } else { + reapped = ublk_reap_events_uring(t); + } ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", ret, reapped, (t->state & UBLKS_T_STOPPING), @@ -882,6 +958,7 @@ struct ublk_thread_info { sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; }; static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) @@ -891,6 +968,26 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) info->dev->dev_info.dev_id, info->idx); } +static void ublk_batch_setup_queues(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + struct ublk_queue *q = &t->dev->q[i]; + int ret; + + /* + * Only prepare io commands in the mapped thread context, + * otherwise io command buffer index may not work as expected + */ + if (t->q_map[i] == 0) + continue; + + ret = ublk_batch_queue_prep_io_cmds(t, q); + ublk_assert(ret >= 0); + } +} + static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) { struct ublk_thread t = { @@ -900,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf int dev_id = info->dev->dev_info.dev_id; int ret; + /* Copy per-thread queue mapping into thread-local variable */ + if (info->q_thread_map) + memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map)); + ret = ublk_thread_init(&t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", @@ -911,8 +1012,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", gettid(), dev_id, t.idx); - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(&t); + if (!ublk_thread_batch_io(&t)) { + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(&t); + } else { + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t); + } + do { if (ublk_process_io(&t) < 0) break; @@ -984,6 +1091,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; void *thread_ret; sem_t ready; int ret, i; @@ -1003,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ret) return ret; + if (ublk_dev_batch_io(dev)) { + q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map)); + if (!q_thread_map) { + ret = -ENOMEM; + goto fail; + } + ublk_batch_setup_map(q_thread_map, dev->nthreads, + dinfo->nr_hw_queues); + } + if (ctx->auto_zc_fallback) extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; if (ctx->no_ublk_fixed_fd) @@ -1012,7 +1130,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->q[i].dev = dev; dev->q[i].q_id = i; - ret = ublk_queue_init(&dev->q[i], extra_flags); + ret = 
ublk_queue_init(&dev->q[i], extra_flags, + ctx->metadata_size); if (ret) { ublk_err("ublk dev %d queue %d init queue failed\n", dinfo->dev_id, i); @@ -1025,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) tinfo[i].idx = i; tinfo[i].ready = &ready; tinfo[i].extra_flags = extra_flags; + tinfo[i].q_thread_map = q_thread_map; /* * If threads are not tied 1:1 to queues, setting thread @@ -1044,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); free(affinity_buf); + free(q_thread_map); /* everything is fine now, start us */ if (ctx->recovery) @@ -1214,7 +1335,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } - if (nthreads != nr_queues && !ctx->per_io_tasks) { + if (nthreads != nr_queues && (!ctx->per_io_tasks && + !(ctx->flags & UBLK_F_BATCH_IO))) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); @@ -1394,6 +1516,42 @@ static int cmd_dev_del(struct dev_ctx *ctx) return 0; } +static int cmd_dev_stop(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + if (number < 0) { + ublk_err("%s: device id is required\n", __func__); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + if (ctx->safe_stop) { + ret = ublk_ctrl_try_stop_dev(dev); + if (ret < 0) + ublk_err("%s: try_stop dev %d failed ret %d\n", + __func__, number, ret); + } else { + ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", + __func__, number, ret); + } + +fail: + ublk_ctrl_deinit(dev); + + return ret; +} + static int __cmd_dev_list(struct dev_ctx *ctx) { struct ublk_dev *dev = ublk_ctrl_init(); @@ -1456,6 +1614,10 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_QUIESCE), FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), + FEAT_NAME(UBLK_F_INTEGRITY), + FEAT_NAME(UBLK_F_SAFE_STOP_DEV), + FEAT_NAME(UBLK_F_BATCH_IO), + FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), }; struct ublk_dev *dev; __u64 features = 0; @@ -1551,6 +1713,9 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n"); printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); printf("\t[--nthreads threads] [--per_io_tasks]\n"); + printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " + "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); + printf("\t[--batch|-b] [--no_auto_part_scan]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1583,6 +1748,8 @@ static int cmd_dev_help(char *exe) printf("%s del [-n dev_id] -a \n", exe); printf("\t -a delete all devices -n delete specified device\n\n"); + printf("%s stop -n dev_id [--safe]\n", exe); + printf("\t --safe only stop if device has no active openers\n\n"); printf("%s list [-n dev_id] -a \n", exe); printf("\t -a list all devices, -n list specified device, default -a \n\n"); printf("%s features\n", exe); @@ -1614,6 +1781,15 @@ int main(int argc, char *argv[]) { "nthreads", 1, NULL, 0 }, { "per_io_tasks", 0, NULL, 0 }, { "no_ublk_fixed_fd", 0, NULL, 0 }, + { "integrity_capable", 0, NULL, 0 }, + { 
"integrity_reftag", 0, NULL, 0 }, + { "metadata_size", 1, NULL, 0 }, + { "pi_offset", 1, NULL, 0 }, + { "csum_type", 1, NULL, 0 }, + { "tag_size", 1, NULL, 0 }, + { "safe", 0, NULL, 0 }, + { "batch", 0, NULL, 'b'}, + { "no_auto_part_scan", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1625,6 +1801,7 @@ int main(int argc, char *argv[]) .nr_hw_queues = 2, .dev_id = -1, .tgt_type = "unknown", + .csum_type = LBMD_PI_CSUM_NONE, }; int ret = -EINVAL, i; int tgt_argc = 1; @@ -1636,12 +1813,15 @@ int main(int argc, char *argv[]) opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub", longopts, &option_idx)) != -1) { switch (opt) { case 'a': ctx.all = 1; break; + case 'b': + ctx.flags |= UBLK_F_BATCH_IO; + break; case 'n': ctx.dev_id = strtol(optarg, NULL, 10); break; @@ -1699,6 +1879,32 @@ int main(int argc, char *argv[]) ctx.per_io_tasks = 1; if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd")) ctx.no_ublk_fixed_fd = 1; + if (!strcmp(longopts[option_idx].name, "integrity_capable")) + ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY; + if (!strcmp(longopts[option_idx].name, "integrity_reftag")) + ctx.integrity_flags |= LBMD_PI_CAP_REFTAG; + if (!strcmp(longopts[option_idx].name, "metadata_size")) + ctx.metadata_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "pi_offset")) + ctx.pi_offset = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "csum_type")) { + if (!strcmp(optarg, "ip")) { + ctx.csum_type = LBMD_PI_CSUM_IP; + } else if (!strcmp(optarg, "t10dif")) { + ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF; + } else if (!strcmp(optarg, "nvme")) { + ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME; + } else { + ublk_err("invalid csum_type: %s\n", optarg); + return -EINVAL; + } + } + if (!strcmp(longopts[option_idx].name, "tag_size")) + ctx.tag_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "safe")) + ctx.safe_stop = 1; + if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) + ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; break; case '?': /* @@ -1722,6 +1928,11 @@ int main(int argc, char *argv[]) } } + if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { + ublk_err("per_io_task and F_BATCH_IO conflict\n"); + return -EINVAL; + } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ if (ctx.auto_zc_fallback && !((ctx.flags & UBLK_F_AUTO_BUF_REG) && @@ -1741,6 +1952,28 @@ int main(int argc, char *argv[]) return -EINVAL; } + if (ctx.metadata_size) { + if (!(ctx.flags & UBLK_F_USER_COPY)) { + ublk_err("integrity requires user_copy\n"); + return -EINVAL; + } + + ctx.flags |= UBLK_F_INTEGRITY; + } else if (ctx.integrity_flags || + ctx.pi_offset || + ctx.csum_type != LBMD_PI_CSUM_NONE || + ctx.tag_size) { + ublk_err("integrity parameters require metadata_size\n"); + return -EINVAL; + } + + if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_BATCH_IO) && + (ctx.nthreads > ctx.nr_hw_queues)) { + ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; @@ -1766,6 +1999,8 @@ int main(int argc, char *argv[]) } } else if (!strcmp(cmd, "del")) ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "stop")) + ret = cmd_dev_stop(&ctx); else if (!strcmp(cmd, "list")) { ctx.all = 1; ret = cmd_dev_list(&ctx); diff --git a/tools/testing/selftests/ublk/kublk.h 
b/tools/testing/selftests/ublk/kublk.h index cae2e30f0cdd..02f0c55d006b 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,6 +78,13 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + unsigned int safe_stop:1; + unsigned int no_auto_part_scan:1; + __u32 integrity_flags; + __u8 metadata_size; + __u8 pi_offset; + __u8 csum_type; + __u8 tag_size; int _evtfd; int _shmid; @@ -107,6 +114,7 @@ struct ublk_ctrl_cmd_data { struct ublk_io { char *buf_addr; + void *integrity_buf; #define UBLKS_IO_NEED_FETCH_RQ (1UL << 0) #define UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1) @@ -143,7 +151,8 @@ struct ublk_tgt_ops { void (*usage)(const struct ublk_tgt_ops *ops); /* return buffer index for UBLK_F_AUTO_BUF_REG */ - unsigned short (*buf_index)(const struct ublk_queue *, int tag); + unsigned short (*buf_index)(const struct ublk_thread *t, + const struct ublk_queue *, int tag); }; struct ublk_tgt { @@ -165,23 +174,76 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; -/* borrow one bit of ublk uapi flags, which may never be used */ +/* borrow three bits of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) +#define UBLKS_Q_PREPARED (1ULL << 61) __u64 flags; int ublk_fd; /* cached ublk char device fd */ + __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; + + /* used for prep io commands */ + pthread_spinlock_t lock; +}; + +/* align with `ublk_elem_header` */ +struct ublk_batch_elem { + __u16 tag; + __u16 buf_index; + __s32 result; + __u64 buf_addr; +}; + +struct batch_commit_buf { + unsigned short q_id; + unsigned short buf_idx; + void *elem; + unsigned short done; + unsigned short count; +}; + +struct batch_fetch_buf { + struct io_uring_buf_ring *br; + void *fetch_buf; + unsigned int fetch_buf_size; + unsigned int fetch_buf_off; }; struct ublk_thread { + /* Thread-local copy of queue-to-thread mapping for this thread */ + unsigned char q_map[UBLK_MAX_QUEUES]; + struct ublk_dev *dev; - unsigned idx; + unsigned short idx; + unsigned short nr_queues; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) +#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ unsigned state; unsigned int cmd_inflight; unsigned int io_inflight; + + unsigned short nr_bufs; + + /* the following fields are for BATCH_IO */ + unsigned short commit_buf_start; + unsigned char commit_buf_elem_size; + /* + * We only support a single device, so pre-calculate commit/prep flags + */ + unsigned short cmd_flags; + unsigned int nr_commit_buf; + unsigned int commit_buf_size; + void *commit_buf; +#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) + struct allocator commit_buf_alloc; + struct batch_commit_buf *commit; + /* FETCH_IO_CMDS buffer */ + unsigned short nr_fetch_bufs; + struct batch_fetch_buf *fetch; + struct io_uring ring; }; @@ -202,6 +264,55 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline int __ublk_use_batch_io(__u64 flags) +{ + return flags & UBLK_F_BATCH_IO; +} + +static inline int ublk_queue_batch_io(const struct ublk_queue *q) +{ + return __ublk_use_batch_io(q->flags); +} + +static inline int ublk_dev_batch_io(const struct ublk_dev *dev) +{ + return __ublk_use_batch_io(dev->dev_info.flags); +} + +/* only works when handling a single device in this pthread context */ +static inline int ublk_thread_batch_io(const struct
ublk_thread *t) +{ + return t->state & UBLKS_T_BATCH_IO; +} + +static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, + struct ublk_params *params) +{ + if (!ctx->metadata_size) + return; + + params->types |= UBLK_PARAM_TYPE_INTEGRITY; + params->integrity = (struct ublk_param_integrity) { + .flags = ctx->integrity_flags, + .interval_exp = params->basic.logical_bs_shift, + .metadata_size = ctx->metadata_size, + .pi_offset = ctx->pi_offset, + .csum_type = ctx->csum_type, + .tag_size = ctx->tag_size, + }; +} + +static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len) +{ + /* All targets currently use interval_exp = logical_bs_shift = 9 */ + return (len >> 9) * q->metadata_size; +} + +static inline size_t +ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len) +{ + return (integrity_len / q->metadata_size) << 9; +} static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { @@ -224,9 +335,9 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, { /* we only have 7 bits to encode q_id */ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7"); - assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); + ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); - return tag | (op << 16) | (tgt_data << 24) | + return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; } @@ -357,35 +468,24 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag); + +static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, + const struct ublk_queue *q, + unsigned tag) +{ + if (ublk_queue_batch_io(q)) + return ublk_batch_io_buf_idx(t, q, tag); + return q->ios[tag].buf_index; +} + static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) { return &q->ios[tag]; } -static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int res) -{ - struct ublk_io *io = &q->ios[tag]; - - ublk_mark_io_done(io, res); - - return ublk_queue_io_cmd(t, io); -} - -static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int queued) -{ - if (queued < 0) - ublk_complete_io(t, q, tag, queued); - else { - struct ublk_io *io = ublk_get_io(q, tag); - - t->io_inflight += queued; - io->tgt_ios = queued; - io->result = 0; - } -} - static inline int ublk_completed_tgt_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag) { @@ -421,12 +521,90 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) +{ + return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; +} + +static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, + const struct ublk_queue *q) +{ + unsigned char idx; + + idx = t->q_map[q->q_id]; + ublk_assert(idx != 0); + return idx - 1; +} + +/* + * Each IO's buffer index has to be calculated by this helper for + * UBLKS_T_BATCH_IO + */ +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag) +{ + return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; +} + +/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ 
+int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ +void ublk_batch_start_fetch(struct ublk_thread *t); +/* Handle completion of batch I/O commands (prep/commit) */ +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe); +/* Initialize batch I/O state and calculate buffer parameters */ +void ublk_batch_prepare(struct ublk_thread *t); +/* Allocate and register commit buffers for batch operations */ +int ublk_batch_alloc_buf(struct ublk_thread *t); +/* Free commit buffers and clean up the batch allocator */ +void ublk_batch_free_buf(struct ublk_thread *t); + +/* Prepare a new commit buffer for batching completed I/O operations */ +void ublk_batch_prep_commit(struct ublk_thread *t); +/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ +void ublk_batch_commit_io_cmds(struct ublk_thread *t); +/* Add a completed I/O operation to the current batch commit buffer */ +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res); +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues); + +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + if (ublk_queue_batch_io(q)) { + ublk_batch_complete_io(t, q, tag, res); + return 0; + } else { + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + return ublk_queue_io_cmd(t, io); + } +} + +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(t, q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + t->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; extern const struct ublk_tgt_ops fault_inject_tgt_ops; void backing_file_tgt_deinit(struct ublk_dev *dev); -int backing_file_tgt_init(struct ublk_dev *dev); +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct); #endif diff --git a/tools/testing/selftests/ublk/metadata_size.c b/tools/testing/selftests/ublk/metadata_size.c new file mode 100644 index 000000000000..76ecddf04d25 --- /dev/null +++ b/tools/testing/selftests/ublk/metadata_size.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <fcntl.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <linux/fs.h> + +int main(int argc, char **argv) +{ + struct logical_block_metadata_cap cap = {}; + const char *filename; + int fd; + int result; + + if (argc != 2) { + fprintf(stderr, "Usage: %s BLOCK_DEVICE\n", argv[0]); + return 1; + } + + filename = argv[1]; + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + return 1; + } + + result = ioctl(fd, FS_IOC_GETLBMD_CAP, &cap); + if (result < 0) { + perror("ioctl"); + return 1; + } + + printf("metadata_size: %u\n", cap.lbmd_size); + printf("pi_offset: %u\n", cap.lbmd_pi_offset); + printf("pi_tuple_size: %u\n", cap.lbmd_pi_size); + return 0; +} diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 280043f6b689..7656888f4149 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -36,6 +36,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) .max_segments = 32, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); if
(info->flags & UBLK_F_SUPPORT_ZERO_COPY) dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; @@ -43,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) } static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod, - struct io_uring_sqe *sqe, int q_id) + struct io_uring_sqe *sqe, int q_id, unsigned buf_idx) { unsigned ublk_op = ublksrv_get_op(iod); io_uring_prep_nop(sqe); - sqe->buf_index = tag; + sqe->buf_index = buf_idx; sqe->flags |= IOSQE_FIXED_FILE; sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; sqe->len = iod->nr_sectors << 9; /* injected result */ @@ -60,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; - __setup_nop_io(tag, iod, sqe[1], q->q_id); + __setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx); sqe[1]->flags |= IOSQE_IO_HARDLINK; - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS @@ -85,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - __setup_nop_io(tag, iod, sqe[0], q->q_id); + __setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag)); return 1; } @@ -136,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, * return invalid buffer index for triggering auto buffer register failure, * then UBLK_IO_RES_NEED_REG_BUF handling is covered */ -static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) +static unsigned short ublk_null_buf_index(const struct ublk_thread *t, + const struct ublk_queue *q, int tag) { if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; - return q->ios[tag].buf_index; + return ublk_io_buf_idx(t, q, tag); } const struct ublk_tgt_ops null_tgt_ops = { diff --git a/tools/testing/selftests/ublk/settings b/tools/testing/selftests/ublk/settings new file mode 100644 index 000000000000..682a40f1c8e6 --- /dev/null +++ b/tools/testing/selftests/ublk/settings @@ -0,0 +1 @@ +timeout=150 diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index fd412e1f01c0..dca819f5366e 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf, this->seq = seq; s->nr += 1; } else { - assert(seq == this->seq); - assert(this->start + this->nr_sects == stripe_off); + ublk_assert(seq == this->seq); + ublk_assert(this->start + this->nr_sects == stripe_off); this->nr_sects += nr_sects; } - assert(this->nr_vec < this->cap); + ublk_assert(this->nr_vec < this->cap); this->vec[this->nr_vec].iov_base = (void *)(base + done); this->vec[this->nr_vec++].iov_len = nr_sects << 9; @@ -120,7 +120,7 @@ static inline enum io_uring_op 
stripe_to_uring_op( return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; - assert(0); + ublk_assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, @@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, struct ublk_io *io = ublk_get_io(q, tag); int i, extra = zc ? 2 : 0; void *base = io->buf_addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); io->private_data = s; calculate_stripe_array(conf, iod, s, base); @@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, t->start << 9); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); if (auto_zc || zc) { - sqe[i]->buf_index = tag; + sqe[i]->buf_index = buf_idx; if (zc) sqe[i]->flags |= IOSQE_IO_HARDLINK; } @@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } @@ -298,6 +299,10 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } if ((chunk_size & (chunk_size - 1)) || !chunk_size) { ublk_err("invalid chunk size %u\n", chunk_size); @@ -311,14 +316,14 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) chunk_shift = ilog2(chunk_size); - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, dev->tgt.nr_backing_files); if (ret) return ret; if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) return -EINVAL; - assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh new file mode 100755 index 000000000000..a18fb39af8be --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test basic function of UBLK_F_BATCH_IO" + +_create_backfile 0 256M +_create_backfile 1 256M + +dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then + _cleanup_test "generic" + _show_result $TID 255 +fi + +dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}") +_check_add_dev $TID $? 
+_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh new file mode 100755 index 000000000000..7ca384d11987 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 1_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh new file mode 100755 index 000000000000..aca9cf144b55 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index ea9a5f3eb70a..163a40007910 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -1,6 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Derive TID from script name: test__.sh -> _ +# Can be overridden in test script after sourcing this file +TID=$(basename "$0" .sh) +TID=${TID#test_} + UBLK_SKIP_CODE=4 _have_program() { @@ -10,6 +15,16 @@ _have_program() { return 1 } +# Sleep with awareness of parallel execution. 
+# Usage: _ublk_sleep +_ublk_sleep() { + if [ "${JOBS:-1}" -gt 1 ]; then + sleep "$2" + else + sleep "$1" + fi +} + _get_disk_dev_t() { local dev_id=$1 local dev @@ -43,7 +58,7 @@ _create_backfile() { old_file="${UBLK_BACKFILES[$index]}" [ -f "$old_file" ] && rm -f "$old_file" - new_file=$(mktemp ublk_file_"${new_size}"_XXXXX) + new_file=$(mktemp ${UBLK_TEST_DIR}/ublk_file_"${new_size}"_XXXXX) truncate -s "${new_size}" "${new_file}" UBLK_BACKFILES["$index"]="$new_file" } @@ -60,7 +75,7 @@ _remove_files() { _create_tmp_dir() { local my_file; - my_file=$(mktemp -d ublk_dir_XXXXX) + my_file=$(mktemp -d ${UBLK_TEST_DIR}/ublk_dir_XXXXX) echo "$my_file" } @@ -101,11 +116,6 @@ _check_root() { fi } -_remove_ublk_devices() { - ${UBLK_PROG} del -a - modprobe -r ublk_drv > /dev/null 2>&1 -} - _get_ublk_dev_state() { ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' } @@ -119,8 +129,12 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - UBLK_TMP=$(mktemp ublk_test_XXXXX) + local base_dir=${TMPDIR:-./ublktest-dir} + mkdir -p "$base_dir" + UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX) + UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" + echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } _remove_test_files() @@ -162,9 +176,16 @@ _check_add_dev() } _cleanup_test() { - "${UBLK_PROG}" del -a + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + while read -r dev_id; do + ${UBLK_PROG} del -n "${dev_id}" + done < "${UBLK_TEST_DIR}/.ublk_devs" + rm -f "${UBLK_TEST_DIR}/.ublk_devs" + fi _remove_files + rmdir ${UBLK_TEST_DIR} + echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } _have_feature() @@ -197,10 +218,11 @@ _create_ublk_dev() { fi if [ "$settle" = "yes" ]; then - udevadm settle + udevadm settle --timeout=20 fi if [[ "$dev_id" =~ ^[0-9]+$ ]]; then + echo "$dev_id" >> "${UBLK_TEST_DIR}/.ublk_devs" echo "${dev_id}" else return 255 @@ -220,7 +242,7 @@ _recover_ublk_dev() { local state dev_id=$(_create_ublk_dev "recover" "yes" "$@") - for ((j=0;j<20;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "LIVE" ] && break sleep 1 @@ -240,7 +262,7 @@ __ublk_quiesce_dev() return "$state" fi - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "$exp_state" ] && break sleep 1 @@ -259,7 +281,7 @@ __ublk_kill_daemon() daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") state=$(_get_ublk_dev_state "${dev_id}") - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do [ "$state" == "$exp_state" ] && break kill -9 "$daemon_pid" > /dev/null 2>&1 sleep 1 @@ -268,12 +290,23 @@ __ublk_kill_daemon() echo "$state" } -__remove_ublk_dev_return() { +_ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" + + # Remove from tracking file + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + sed -i "/^${dev_id}$/d" "${UBLK_TEST_DIR}/.ublk_devs" + fi +} + +__remove_ublk_dev_return() { + local dev_id=$1 + + _ublk_del_dev "${dev_id}" local res=$? 
- udevadm settle + udevadm settle --timeout=20 return ${res} } @@ -384,6 +417,16 @@ _ublk_test_top_dir() cd "$(dirname "$0")" && pwd } +METADATA_SIZE_PROG="$(_ublk_test_top_dir)/metadata_size" + +_get_metadata_size() +{ + local dev_id=$1 + local field=$2 + + "$METADATA_SIZE_PROG" "/dev/ublkb$dev_id" | grep "$field" | grep -o "[0-9]*" +} + UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh deleted file mode 100755 index 21a31cd5491a..000000000000 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -TID="generic_01" -ERR_CODE=0 - -if ! _have_program bpftrace; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "null" "sequential io order" - -dev_id=$(_add_ublk_dev -t null) -_check_add_dev $TID $? - -dev_t=$(_get_disk_dev_t "$dev_id") -bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & -btrace_pid=$! -sleep 2 - -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then - _cleanup_test "null" - exit "$UBLK_SKIP_CODE" -fi - -# run fio over this ublk disk -fio --name=write_seq \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=16 \ - --rw=write \ - --size=512M \ - --direct=1 \ - --bs=4k > /dev/null 2>&1 -ERR_CODE=$? -kill "$btrace_pid" -wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" - ERR_CODE=255 -fi -_cleanup_test "null" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 12920768b1a0..46b657143fd6 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_02" ERR_CODE=0 if ! _have_program bpftrace; then @@ -14,7 +13,7 @@ if ! _have_program fio; then exit "$UBLK_SKIP_CODE" fi -_prep_test "null" "sequential io order for MQ" +_prep_test "null" "ublk dispatch won't reorder IO for MQ" dev_id=$(_add_ublk_dev -t null -q 2) _check_add_dev $TID $? @@ -22,15 +21,20 @@ _check_add_dev $TID $? dev_t=$(_get_disk_dev_t "$dev_id") bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & btrace_pid=$! -sleep 2 -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then +# Wait for bpftrace probes to be attached (BEGIN block prints BPFTRACE_READY) +for _ in $(seq 100); do + grep -q "BPFTRACE_READY" "$UBLK_TMP" 2>/dev/null && break + sleep 0.1 +done + +if ! kill -0 "$btrace_pid" 2>/dev/null; then _cleanup_test "null" exit "$UBLK_SKIP_CODE" fi -# run fio over this ublk disk -fio --name=write_seq \ +# run fio over this ublk disk (pinned to CPU 0) +taskset -c 0 fio --name=write_seq \ --filename=/dev/ublkb"${dev_id}" \ --ioengine=libaio --iodepth=16 \ --rw=write \ @@ -40,8 +44,11 @@ fio --name=write_seq \ ERR_CODE=$? 
kill "$btrace_pid" wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" + +# Check for out-of-order completions detected by bpftrace +if grep -q "^out_of_order:" "$UBLK_TMP"; then + echo "I/O reordering detected:" + grep "^out_of_order:" "$UBLK_TMP" ERR_CODE=255 fi _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh index b551aa76cb0d..8934ea926762 100755 --- a/tools/testing/selftests/ublk/test_generic_03.sh +++ b/tools/testing/selftests/ublk/test_generic_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_03" ERR_CODE=0 _prep_test "null" "check dma & segment limits for zero copy" diff --git a/tools/testing/selftests/ublk/test_generic_06.sh b/tools/testing/selftests/ublk/test_generic_06.sh index fd42062b7b76..14a05054fcd8 100755 --- a/tools/testing/selftests/ublk/test_generic_06.sh +++ b/tools/testing/selftests/ublk/test_generic_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_06" ERR_CODE=0 _prep_test "fault_inject" "fast cleanup when all I/Os of one hctx are in server" diff --git a/tools/testing/selftests/ublk/test_generic_07.sh b/tools/testing/selftests/ublk/test_generic_07.sh index cba86451fa5e..8dcfd8978f50 100755 --- a/tools/testing/selftests/ublk/test_generic_07.sh +++ b/tools/testing/selftests/ublk/test_generic_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_07" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_generic_08.sh b/tools/testing/selftests/ublk/test_generic_08.sh index b222f3a77e12..ce88c31d6b9c 100755 --- a/tools/testing/selftests/ublk/test_generic_08.sh +++ b/tools/testing/selftests/ublk/test_generic_08.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_08" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_09.sh b/tools/testing/selftests/ublk/test_generic_09.sh index bb6f77ca5522..744d0cdaa242 100755 --- a/tools/testing/selftests/ublk/test_generic_09.sh +++ b/tools/testing/selftests/ublk/test_generic_09.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_09" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_10.sh b/tools/testing/selftests/ublk/test_generic_10.sh index abc11c3d416b..4b4293b9081f 100755 --- a/tools/testing/selftests/ublk/test_generic_10.sh +++ b/tools/testing/selftests/ublk/test_generic_10.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_10" ERR_CODE=0 if ! _have_feature "UPDATE_SIZE"; then diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh index b4046201b4d9..54b81ddfe9f9 100755 --- a/tools/testing/selftests/ublk/test_generic_12.sh +++ b/tools/testing/selftests/ublk/test_generic_12.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_12" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh index b7aa90b1cb74..922115aa14f4 100755 --- a/tools/testing/selftests/ublk/test_generic_13.sh +++ b/tools/testing/selftests/ublk/test_generic_13.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_13" ERR_CODE=0 _prep_test "null" "check that feature list is complete" diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh new file mode 100755 index 000000000000..3ef367836ac5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "null" "stop --safe command" + +# Check if SAFE_STOP_DEV feature is supported +if ! _have_feature "SAFE_STOP_DEV"; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# Test 1: stop --safe on idle device should succeed +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Device is idle (no openers), stop --safe should succeed +if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then + echo "stop --safe on idle device failed unexpectedly!" + ERR_CODE=255 +fi + +# Clean up device +_ublk_del_dev "${dev_id}" > /dev/null 2>&1 +udevadm settle + +# Test 2: stop --safe on device with active opener should fail +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Open device in background (dd reads indefinitely) +dd if=/dev/ublkb${dev_id} of=/dev/null bs=4k iflag=direct > /dev/null 2>&1 & +dd_pid=$! + +# Give dd time to start +sleep 0.2 + +# Device has active opener, stop --safe should fail with -EBUSY +if ${UBLK_PROG} stop -n "${dev_id}" --safe 2>/dev/null; then + echo "stop --safe on busy device succeeded unexpectedly!" + ERR_CODE=255 +fi + +# Kill dd and clean up +kill $dd_pid 2>/dev/null +wait $dd_pid 2>/dev/null + +# Now device should be idle, regular delete should work +_ublk_del_dev "${dev_id}" +udevadm settle + +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_01.sh b/tools/testing/selftests/ublk/test_integrity_01.sh new file mode 100755 index 000000000000..6713b280a6ff --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_01.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_check_value() { + local name=$1 + local actual=$2 + local expected=$3 + + if [ "$actual" != "$expected" ]; then + echo "$name $actual != $expected" + ERR_CODE=255 + return 1 + fi + return 0 +} + +_test_metadata_only() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_capable_ip() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + _check_add_dev "$TID" $? 
+ + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_reftag_t10dif() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_nvme_csum() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 + + _ublk_del_dev "${dev_id}" +} + +_prep_test "null" "integrity params" + +_test_metadata_only +_test_integrity_capable_ip +_test_integrity_reftag_t10dif +_test_nvme_csum + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_02.sh b/tools/testing/selftests/ublk/test_integrity_02.sh new file mode 100755 index 000000000000..aaf1f52da559 --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_02.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +if ! 
_have_program fio; then + exit $UBLK_SKIP_CODE +fi + +fio_version=$(fio --version) +if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then + echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" + exit $UBLK_SKIP_CODE +fi + +ERR_CODE=0 + +# Global variables set during device setup +dev_id="" +fio_args="" +fio_err="" + +_setup_device() { + _create_backfile 0 256M + _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) + + local integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" + dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") + _check_add_dev "$TID" $? + + # 1M * (64 integrity bytes / 512 data bytes) = 128K + fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" + + fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) +} + +_test_fill_and_verify() { + fio --name fill --rw randwrite $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio fill failed" + ERR_CODE=255 + return 1 + fi + + fio --name verify --rw randread $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio verify failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_reftag() { + local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" + local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" + + # Overwrite 4-byte reftag at offset 56 + 4 = 60 + dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi + + # Reset to 0 + dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd restore corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_data() { + local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" + local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" + + dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args + if [ $? != 0 ]; then + echo "dd corrupted_data failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_test_bad_apptag() { + local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" + + if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! 
grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_prep_test "loop" "end-to-end integrity" + +_setup_device + +_test_fill_and_verify && \ +_test_corrupted_reftag && \ +_test_corrupted_data && \ +_test_bad_apptag + +rm -f "$fio_err" + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 833fa0dbc700..338a235fd82a 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index 874568b3646b..04c52454e2ec 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_02" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index c30f797c6429..6e8f649fe93d 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index b01d75b3214d..9f6774ec0de6 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_04" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with zero copy" diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh index de2141533074..2b8d99e007be 100755 --- a/tools/testing/selftests/ublk/test_loop_05.sh +++ b/tools/testing/selftests/ublk/test_loop_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_06.sh b/tools/testing/selftests/ublk/test_loop_06.sh index 1d1a8a725502..e73f6f4844db 100755 --- a/tools/testing/selftests/ublk/test_loop_06.sh +++ b/tools/testing/selftests/ublk/test_loop_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_06" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_07.sh b/tools/testing/selftests/ublk/test_loop_07.sh index 493f3fb611a5..264d20e7c530 100755 --- a/tools/testing/selftests/ublk/test_loop_07.sh +++ b/tools/testing/selftests/ublk/test_loop_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_07" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with user copy" diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index c2cb8f7a09fe..eebce8076530 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_01" ERR_CODE=0 if ! 
_have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh index 8accd35beb55..654bdff39664 100755 --- a/tools/testing/selftests/ublk/test_null_02.sh +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_03.sh b/tools/testing/selftests/ublk/test_null_03.sh index 0051067b4686..29cd09f06672 100755 --- a/tools/testing/selftests/ublk/test_null_03.sh +++ b/tools/testing/selftests/ublk/test_null_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_part_01.sh b/tools/testing/selftests/ublk/test_part_01.sh new file mode 100755 index 000000000000..8028f6e4b3a5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_part_01.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +format_backing_file() +{ + local backing_file=$1 + + # Create ublk device to write partition table + local tmp_dev=$(_add_ublk_dev -t loop "${backing_file}") + [ $? -ne 0 ] && return 1 + + # Write partition table with sfdisk + sfdisk /dev/ublkb"${tmp_dev}" > /dev/null 2>&1 < /dev/null 2>&1 + udevadm settle + + if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then + "${UBLK_PROG}" del -n "${dev_id}" + return 1 + fi + + "${UBLK_PROG}" del -n "${dev_id}" + return 0 +} + +if ! _have_program sfdisk || ! _have_program blockdev; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_NO_AUTO_PART_SCAN" + +if ! _have_feature "UBLK_F_NO_AUTO_PART_SCAN"; then + _cleanup_test "generic" + exit "$UBLK_SKIP_CODE" +fi + + +# Create and format backing file with partition table +_create_backfile 0 256M +format_backing_file "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test normal auto partition scan +[ "$ERR_CODE" -eq 0 ] && test_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test no auto partition scan with manual scan +[ "$ERR_CODE" -eq 0 ] && test_no_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_part_02.sh similarity index 94% rename from tools/testing/selftests/ublk/test_generic_15.sh rename to tools/testing/selftests/ublk/test_part_02.sh index 76379362e0a2..7d42ab4d6e83 100755 --- a/tools/testing/selftests/ublk/test_generic_15.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_15" ERR_CODE=0 _test_partition_scan_no_hang() @@ -34,7 +33,7 @@ _test_partition_scan_no_hang() # The add command should return quickly because partition scan is async. # Now sleep briefly to let the async partition scan work start and hit # the delay in the fault_inject handler. 
- sleep 1 + _ublk_sleep 1 5 # Kill the ublk daemon while partition scan is potentially blocked # And check state transitions properly @@ -47,13 +46,13 @@ _test_partition_scan_no_hang() if [ "$state" != "${expected_state}" ]; then echo "FAIL: Device state is $state, expected ${expected_state}" ERR_CODE=255 - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 return fi echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" # Clean up the device - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 } _prep_test "partition_scan" "verify async partition scan prevents IO hang" diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_recover_01.sh similarity index 81% rename from tools/testing/selftests/ublk/test_generic_04.sh rename to tools/testing/selftests/ublk/test_recover_01.sh index baf5b156193d..2672f9c40fa8 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_recover_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_04" ERR_CODE=0 ublk_run_recover_test() @@ -26,6 +25,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 & ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_recover_02.sh similarity index 82% rename from tools/testing/selftests/ublk/test_generic_05.sh rename to tools/testing/selftests/ublk/test_recover_02.sh index 7b5083afc02a..bda5064bc31f 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_recover_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_05" ERR_CODE=0 ublk_run_recover_test() @@ -30,6 +29,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 -z & ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_recover_03.sh similarity index 98% rename from tools/testing/selftests/ublk/test_generic_11.sh rename to tools/testing/selftests/ublk/test_recover_03.sh index d1f973c8c645..e0dc0b8fe5d6 100755 --- a/tools/testing/selftests/ublk/test_generic_11.sh +++ b/tools/testing/selftests/ublk/test_recover_03.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_11" ERR_CODE=0 ublk_run_quiesce_recover() diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_recover_04.sh similarity index 98% rename from tools/testing/selftests/ublk/test_generic_14.sh rename to tools/testing/selftests/ublk/test_recover_04.sh index cd9b44b97c24..178443394ca5 100755 --- a/tools/testing/selftests/ublk/test_generic_14.sh +++ b/tools/testing/selftests/ublk/test_recover_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_14" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 7d3150f057d4..a9322ce496e9 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_01" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index 4bdd921081e5..6c114194f9c9 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh index 3ed4c9b2d8c0..4e81ca0db758 100755 --- a/tools/testing/selftests/ublk/test_stress_03.sh +++ b/tools/testing/selftests/ublk/test_stress_03.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_03" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh index efa8dc33234b..6c6f44b172bc 100755 --- a/tools/testing/selftests/ublk/test_stress_04.sh +++ b/tools/testing/selftests/ublk/test_stress_04.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_04" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh index 68a194144302..7e9324de2030 100755 --- a/tools/testing/selftests/ublk/test_stress_05.sh +++ b/tools/testing/selftests/ublk/test_stress_05.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh index 37188ec2e1f7..c72e5d0b14be 100755 --- a/tools/testing/selftests/ublk/test_stress_06.sh +++ b/tools/testing/selftests/ublk/test_stress_06.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_07.sh b/tools/testing/selftests/ublk/test_stress_07.sh index fb061fc26d36..04c2764d5238 100755 --- a/tools/testing/selftests/ublk/test_stress_07.sh +++ b/tools/testing/selftests/ublk/test_stress_07.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh new file mode 100755 index 000000000000..37f7d204879a --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +ERR_CODE=0 + +ublk_io_and_remove() +{ + run_io_and_remove "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and remove device(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_remove 8G -t null -q 4 -b & +ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh new file mode 100755 index 000000000000..53c1e3b2ab30 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +ERR_CODE=0 + +ublk_io_and_kill_daemon() +{ + run_io_and_kill_daemon "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and kill ublk server(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_kill_daemon 8G -t null -q 4 -z -b & +ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh index 4e4f0fdf3c9b..3bc821aadad8 100755 --- a/tools/testing/selftests/ublk/test_stripe_01.sh +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh index 5820ab2efba4..4a7d2b21a6bf 100755 --- a/tools/testing/selftests/ublk/test_stripe_02.sh +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_02" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh index 20b977e27814..a1c159d54e53 100755 --- a/tools/testing/selftests/ublk/test_stripe_03.sh +++ b/tools/testing/selftests/ublk/test_stripe_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_04.sh b/tools/testing/selftests/ublk/test_stripe_04.sh index 1b51ed2f1d84..0c30bd6c2b3b 100755 --- a/tools/testing/selftests/ublk/test_stripe_04.sh +++ b/tools/testing/selftests/ublk/test_stripe_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_04" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on zero copy" diff --git a/tools/testing/selftests/ublk/test_stripe_05.sh b/tools/testing/selftests/ublk/test_stripe_05.sh index 05d71951d710..6ddfa88ad226 100755 --- a/tools/testing/selftests/ublk/test_stripe_05.sh +++ b/tools/testing/selftests/ublk/test_stripe_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_06.sh b/tools/testing/selftests/ublk/test_stripe_06.sh index d06cac7626e2..a2c7bf4cc613 100755 --- a/tools/testing/selftests/ublk/test_stripe_06.sh +++ b/tools/testing/selftests/ublk/test_stripe_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_06" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on user copy" diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt index b2f60a92b118..9d36ba35468f 100644 --- a/tools/testing/selftests/ublk/trace/seq_io.bt +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -2,23 +2,52 @@ $1: dev_t $2: RWBS $3: strlen($2) + + Track request order between block_io_start and block_rq_complete. + Sequence starts at 1 so 0 means "never seen". On first valid + completion, sync complete_seq to handle probe attachment races. + block_rq_complete listed first to reduce missed completion window. 
 */
+
 BEGIN
 {
-	@last_rw[$1, str($2)] = (uint64)0;
+	@start_seq = (uint64)1;
+	@complete_seq = (uint64)0;
+	@out_of_order = (uint64)0;
+	@start_order[0] = (uint64)0;
+	delete(@start_order[0]);
+	printf("BPFTRACE_READY\n");
 }
+
 tracepoint:block:block_rq_complete
+/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/
 {
-	$dev = $1;
-	if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) {
-		$last = @last_rw[$dev, str($2)];
-		if ((uint64)args.sector != $last) {
-			printf("io_out_of_order: exp %llu actual %llu\n",
-				args.sector, $last);
+	$expected = @start_order[args.sector];
+	if ($expected > 0) {
+		if (@complete_seq == 0) {
+			@complete_seq = $expected;
 		}
-		@last_rw[$dev, str($2)] = (args.sector + args.nr_sector);
+		if ($expected != @complete_seq) {
+			printf("out_of_order: sector %llu started at seq %llu but completed at seq %llu\n",
+				args.sector, $expected, @complete_seq);
+			@out_of_order = @out_of_order + 1;
+		}
+		delete(@start_order[args.sector]);
+		@complete_seq = @complete_seq + 1;
 	}
 }
-END {
-	clear(@last_rw);
+tracepoint:block:block_io_start
+/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/
+{
+	@start_order[args.sector] = @start_seq;
+	@start_seq = @start_seq + 1;
+}
+
+END {
+	printf("total_start: %llu total_complete: %llu out_of_order: %llu\n",
+		@start_seq - 1, @complete_seq, @out_of_order);
+	clear(@start_order);
+	clear(@start_seq);
+	clear(@complete_seq);
+	clear(@out_of_order);
 }
diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h
index a852e0b7153e..aab522f26167 100644
--- a/tools/testing/selftests/ublk/utils.h
+++ b/tools/testing/selftests/ublk/utils.h
@@ -21,6 +21,60 @@
 #define round_up(val, rnd) \
 	(((val) + ((rnd) - 1)) & ~((rnd) - 1))
 
+/* small sized & per-thread allocator */
+struct allocator {
+	unsigned int size;
+	cpu_set_t *set;
+};
+
+static inline int allocator_init(struct allocator *a, unsigned size)
+{
+	a->set = CPU_ALLOC(size);
+	a->size = size;
+
+	if (a->set)
+		return 0;
+	return -ENOMEM;
+}
+
+static inline void allocator_deinit(struct allocator *a)
+{
+	CPU_FREE(a->set);
+	a->set = NULL;
+	a->size = 0;
+}
+
+static inline int allocator_get(struct allocator *a)
+{
+	int i;
+
+	for (i = 0; i < a->size; i += 1) {
+		size_t set_size = CPU_ALLOC_SIZE(a->size);
+
+		if (!CPU_ISSET_S(i, set_size, a->set)) {
+			CPU_SET_S(i, set_size, a->set);
+			return i;
+		}
+	}
+
+	return -1;
+}
+
+static inline void allocator_put(struct allocator *a, int i)
+{
+	size_t set_size = CPU_ALLOC_SIZE(a->size);
+
+	if (i >= 0 && i < a->size)
+		CPU_CLR_S(i, set_size, a->set);
+}
+
+static inline int allocator_get_val(struct allocator *a, int i)
+{
+	size_t set_size = CPU_ALLOC_SIZE(a->size);
+
+	return CPU_ISSET_S(i, set_size, a->set);
+}
+
 static inline unsigned int ilog2(unsigned int x)
 {
 	if (x == 0)
@@ -43,6 +97,7 @@ static inline void ublk_err(const char *fmt, ...)
 
 	va_start(ap, fmt);
 	vfprintf(stderr, fmt, ap);
+	va_end(ap);
 }
 
 static inline void ublk_log(const char *fmt, ...)
@@ -52,6 +107,7 @@ static inline void ublk_log(const char *fmt, ...)
 
 		va_start(ap, fmt);
 		vfprintf(stdout, fmt, ap);
+		va_end(ap);
 	}
 }
 
@@ -62,7 +118,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...)
 
 		va_start(ap, fmt);
 		vfprintf(stdout, fmt, ap);
+		va_end(ap);
 	}
 }
 
+#define ublk_assert(x) do { \
+	if (!(x)) { \
+		ublk_err("%s %d: assert!\n", __func__, __LINE__); \
+		assert(x); \
+	} \
+} while (0)
+
 #endif
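
Illustrative note (not part of the patch): a minimal sketch of how the allocator helpers added to utils.h above might be used to hand out small integer slot IDs. The include path, the slot count of 8, and the variable names are assumptions made for this example; the real selftests may set things up differently.

	/* sketch only: exercises allocator_init/get/get_val/put/deinit from utils.h */
	#define _GNU_SOURCE		/* CPU_ALLOC()/CPU_*_S() in utils.h need this */
	#include <stdio.h>
	#include <errno.h>
	#include <sched.h>
	#include "utils.h"		/* assumed to be on the include path */

	int main(void)
	{
		struct allocator slots;
		int a, b;

		/* bitmap-backed allocator with 8 slots; returns -ENOMEM on failure */
		if (allocator_init(&slots, 8))
			return 1;

		a = allocator_get(&slots);	/* lowest free index, or -1 when exhausted */
		b = allocator_get(&slots);
		printf("got %d and %d, slot %d busy: %d\n",
		       a, b, a, allocator_get_val(&slots, a));

		allocator_put(&slots, a);	/* mark the index free again */
		allocator_put(&slots, b);
		allocator_deinit(&slots);
		return 0;
	}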