block: fix potential deadlock while running nr_hw_queue update
Move scheduler tags (sched_tags) allocation and deallocation outside both the
->elevator_lock and the ->freeze_lock when updating nr_hw_queues. This breaks
the dependency chain from the percpu allocator lock to the elevator lock and
prevents the potential deadlock observed in the reported lockdep splat [1].

This commit introduces batch allocation and deallocation helpers for
sched_tags, which are used from the __blk_mq_update_nr_hw_queues routine while
iterating through the tagset. With this change, all sched_tags memory
management is handled entirely outside the ->elevator_lock and the
->freeze_lock context, thereby eliminating the lock dependency that could
otherwise manifest during nr_hw_queues updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/

Reported-by: Stefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250730074614.2537382-4-nilay@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
commit 04225d13ae (parent f5a6604f7a)
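For orientation, here is a condensed sketch of how __blk_mq_update_nr_hw_queues() orders these steps after the change. It is assembled only from the hunks below and is illustrative rather than a literal excerpt: error handling, queue freezing/unfreezing and the actual hardware queue remapping are omitted.

        /* Condensed sketch of the new flow (illustrative only). */
        struct xarray elv_tbl, et_tbl;
        unsigned int memflags = memalloc_noio_save();

        /* Step 1: pre-allocate sched_tags for every queue that has an
         * elevator, before any queue is frozen and before ->elevator_lock
         * is taken. */
        xa_init(&et_tbl);
        if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
                goto out_memalloc_restore;

        xa_init(&elv_tbl);

        /* Step 2: freeze queues, update nr_hw_queues, then reattach the
         * elevator; blk_mq_elv_switch_back() looks up the pre-allocated
         * tags by queue id and hands them to elv_update_nr_hw_queues()
         * instead of allocating them under the locks. */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);

        /* Step 3: tear down the lookup tables; tags that were not consumed
         * by an elevator switch have already been freed by
         * elv_update_nr_hw_queues(). */
        xa_destroy(&elv_tbl);
        xa_destroy(&et_tbl);
out_memalloc_restore:
        memalloc_noio_restore(memflags);

The point of this ordering is that every sched_tags allocation and free now happens before the queues are frozen or after they are unfrozen, so the percpu allocator lock is never taken while ->elevator_lock or ->freeze_lock is held.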
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -427,6 +427,32 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
 	kfree(et);
 }
 
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (unlikely(!et))
+				WARN_ON_ONCE(1);
+			else
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+}
+
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues)
 {
@@ -477,6 +503,45 @@ out:
 	return NULL;
 }
 
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+			if (!et)
+				goto out_unwind;
+			if (xa_insert(et_table, q->id, et, gfp))
+				goto out_free_tags;
+		}
+	}
+	return 0;
+out_free_tags:
+	blk_mq_free_sched_tags(et, set);
+out_unwind:
+	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (et)
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+	return -ENOMEM;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
 		struct elevator_tags *et)
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
 
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
 void blk_mq_free_sched_tags(struct elevator_tags *et,
 		struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set);
 
 static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4974,12 +4974,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
  * Switch back to the elevator type stored in the xarray.
  */
 static void blk_mq_elv_switch_back(struct request_queue *q,
-		struct xarray *elv_tbl)
+		struct xarray *elv_tbl, struct xarray *et_tbl)
 {
 	struct elevator_type *e = xa_load(elv_tbl, q->id);
+	struct elevator_tags *t = xa_load(et_tbl, q->id);
 
 	/* The elv_update_nr_hw_queues unfreezes the queue. */
-	elv_update_nr_hw_queues(q, e);
+	elv_update_nr_hw_queues(q, e, t);
 
 	/* Drop the reference acquired in blk_mq_elv_switch_none. */
 	if (e)
@@ -5031,7 +5032,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	int prev_nr_hw_queues = set->nr_hw_queues;
 	unsigned int memflags;
 	int i;
-	struct xarray elv_tbl;
+	struct xarray elv_tbl, et_tbl;
 
 	lockdep_assert_held(&set->tag_list_lock);
 
@@ -5044,6 +5045,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
 	memflags = memalloc_noio_save();
 
+	xa_init(&et_tbl);
+	if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
+		goto out_memalloc_restore;
+
 	xa_init(&elv_tbl);
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5087,7 +5092,7 @@ fallback:
 switch_back:
 	/* The blk_mq_elv_switch_back unfreezes queue for us. */
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
-		blk_mq_elv_switch_back(q, &elv_tbl);
+		blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_sysfs_register_hctxs(q);
@@ -5098,7 +5103,8 @@ switch_back:
 	}
 
 	xa_destroy(&elv_tbl);
-
+	xa_destroy(&et_tbl);
+out_memalloc_restore:
 	memalloc_noio_restore(memflags);
 
 	/* Free the excess tags when nr_hw_queues shrink. */
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,6 +12,7 @@
 #include "blk-crypto-internal.h"
 
 struct elevator_type;
+struct elevator_tags;
 
 /*
  * Default upper limit for the software max_sectors limit used for regular I/Os.
@@ -330,7 +331,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
 
 bool blk_insert_flush(struct request *rq);
 
-void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *t);
 void elevator_set_default(struct request_queue *q);
 void elevator_set_none(struct request_queue *q);
 
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -705,7 +705,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  * The I/O scheduler depends on the number of hardware queues, this forces a
  * reattachment when nr_hw_queues changes.
  */
-void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *t)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 	struct elv_change_ctx ctx = {};
@@ -715,25 +716,21 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
 
 	if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
 		ctx.name = e->elevator_name;
-		ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
-		if (!ctx.et) {
-			WARN_ON_ONCE(1);
-			goto unfreeze;
-		}
+		ctx.et = t;
+
 		mutex_lock(&q->elevator_lock);
 		/* force to reattach elevator after nr_hw_queue is updated */
 		ret = elevator_switch(q, &ctx);
 		mutex_unlock(&q->elevator_lock);
 	}
-unfreeze:
 	blk_mq_unfreeze_queue_nomemrestore(q);
 	if (!ret)
 		WARN_ON_ONCE(elevator_change_done(q, &ctx));
 	/*
 	 * Free sched tags if it's allocated but we couldn't switch elevator.
 	 */
-	if (ctx.et && !ctx.new)
-		blk_mq_free_sched_tags(ctx.et, set);
+	if (t && !ctx.new)
+		blk_mq_free_sched_tags(t, set);
 }
 
 /*