From 70904263512a74a3b8941dd9e6e515ca6fc57821 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Fri, 14 Jul 2023 11:11:06 +0100 Subject: [PATCH 1/6] blk-mq: Fix stall due to recursive flush plug We have seen rare IO stalls as follows: * blk_mq_plug_issue_direct() is entered with an mq_list containing two requests. * For the first request, it sets last == false and enters the driver's queue_rq callback. * The driver queue_rq callback indirectly calls schedule() which calls blk_flush_plug(). This may happen if the driver has the BLK_MQ_F_BLOCKING flag set and is allowed to sleep in ->queue_rq. * blk_flush_plug() handles the remaining request in the mq_list. mq_list is now empty. * The original call to queue_rq resumes (with last == false). * The loop in blk_mq_plug_issue_direct() terminates because there are no remaining requests in mq_list. The IO is now stalled because the last request submitted to the driver had last == false and there was no subsequent call to commit_rqs(). Fix this by returning early in blk_mq_flush_plug_list() if rq_count is 0 which it will be in the recursive case, rather than checking if the mq_list is empty. At the same time, adjust one of the callers to skip the mq_list empty check as it is not necessary. Fixes: dc5fc361d891 ("block: attempt direct issue of plug list") Signed-off-by: Ross Lagerwall Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230714101106.3635611-1-ross.lagerwall@citrix.com Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- block/blk-mq.c | 9 ++++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 99d8b9812b18..90de50082146 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1144,8 +1144,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); - if (!rq_list_empty(plug->mq_list)) - blk_mq_flush_plug_list(plug, from_schedule); + blk_mq_flush_plug_list(plug, from_schedule); /* * Unconditionally flush out cached requests, even if the unplug * event came from schedule. Since we know hold references to the diff --git a/block/blk-mq.c b/block/blk-mq.c index d50b1d62a3d9..b04ff6f56926 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2754,7 +2754,14 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request *rq; - if (rq_list_empty(plug->mq_list)) + /* + * We may have been called recursively midway through handling + * plug->mq_list via a schedule() in the driver's queue_rq() callback. + * To avoid mq_list changing under our feet, clear rq_count early and + * bail out specifically if rq_count is 0 rather than checking + * whether the mq_list is empty. + */ + if (plug->rq_count == 0) return; plug->rq_count = 0; From 3641c90c4e369c8d0af5483e879174400a152cf8 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 20 Jul 2023 17:55:12 +0800 Subject: [PATCH 2/6] blk-mq: delete dead struct blk_mq_hw_ctx->queued field This counter is not used anywhere, so delete it. Signed-off-by: Chengming Zhou Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230720095512.1403123-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b96e00499f9e..495ca198775f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -397,8 +397,6 @@ struct blk_mq_hw_ctx { */ struct blk_mq_tags *sched_tags; - /** @queued: Number of queued requests. */ - unsigned long queued; /** @run: Number of dispatched requests. */ unsigned long run; From 013adcbef165c3eaf73e297b7482290593815ab8 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 20 Jul 2023 20:14:41 +0800 Subject: [PATCH 3/6] blk-iocost: skip empty flush bio in iocost The flush bio may have data, may have no data (empty flush), we couldn't calculate cost for empty flush bio. So we'd better just skip it for now. Another side effect is that empty flush bio's bio_end_sector() is 0, cause iocg->cursor reset to 0, may break the cost calculation of other bios. This isn't good enough, since flush bio still consume the device bandwidth, but flush request is special, can be merged randomly in the flush state machine, we don't know how to calculate cost for it for now. Its completion time also has flaws, which may include the pre-flush or post-flush completion time, but I don't know if we need to fix that and how to fix it. Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230720121441.1408522-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-iocost.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9dfcf540f400..dd64e2066f01 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2516,6 +2516,10 @@ static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, u64 seek_pages = 0; u64 cost = 0; + /* Can't calculate cost for empty bio */ + if (!bio->bi_iter.bi_size) + goto out; + switch (bio_op(bio)) { case REQ_OP_READ: coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; From 106397376c0369fcc01c58dd189ff925a2724a57 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Fri, 21 Jul 2023 17:57:15 +0800 Subject: [PATCH 4/6] sbitmap: fix batching wakeup Current code supposes that it is enough to provide forward progress by just waking up one wait queue after one completion batch is done. Unfortunately this way isn't enough, cause waiter can be added to wait queue just after it is woken up. Follows one example(64 depth, wake_batch is 8) 1) all 64 tags are active 2) in each wait queue, there is only one single waiter 3) each time one completion batch(8 completions) wakes up just one waiter in each wait queue, then immediately one new sleeper is added to this wait queue 4) after 64 completions, 8 waiters are wakeup, and there are still 8 waiters in each wait queue 5) after another 8 active tags are completed, only one waiter can be wakeup, and the other 7 can't be waken up anymore. Turns out it isn't easy to fix this problem, so simply wakeup enough waiters for single batch. Cc: Kemeng Shi Cc: Chengming Zhou Cc: Jan Kara Signed-off-by: David Jeffery Signed-off-by: Ming Lei Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20230721095715.232728-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- lib/sbitmap.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index eff4e42c425a..d0a5081dfd12 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -550,7 +550,7 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { - int i, wake_index; + int i, wake_index, woken; if (!atomic_read(&sbq->ws_active)) return; @@ -567,13 +567,12 @@ static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) */ wake_index = sbq_index_inc(wake_index); - /* - * It is sufficient to wake up at least one waiter to - * guarantee forward progress. - */ - if (waitqueue_active(&ws->wait) && - wake_up_nr(&ws->wait, nr)) - break; + if (waitqueue_active(&ws->wait)) { + woken = wake_up_nr(&ws->wait, nr); + if (woken == nr) + break; + nr -= woken; + } } if (wake_index != atomic_read(&sbq->wake_index)) From 23881aec85f3219e8462e87c708815ee2cd82358 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Thu, 20 Jul 2023 11:30:32 -0300 Subject: [PATCH 5/6] loop: deprecate autoloading callback loop_probe() The 'probe' callback in __register_blkdev() is only used under the CONFIG_BLOCK_LEGACY_AUTOLOAD deprecation guard. The loop_probe() function is only used for that callback, so guard it too, accordingly. See commit fbdee71bb5d8 ("block: deprecate autoloading based on dev_t"). Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230720143033.841001-2-mfo@canonical.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 37511d2b2caf..6e56c3faacac 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2093,6 +2093,7 @@ static void loop_remove(struct loop_device *lo) put_disk(lo->lo_disk); } +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static void loop_probe(dev_t dev) { int idx = MINOR(dev) >> part_shift; @@ -2101,6 +2102,9 @@ static void loop_probe(dev_t dev) return; loop_add(idx); } +#else +#define loop_probe NULL +#endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */ static int loop_control_remove(int idx) { From bb5faa99f0ce40756ab7bbbce4f16c01ca5ebd5a Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Thu, 20 Jul 2023 11:30:33 -0300 Subject: [PATCH 6/6] loop: do not enforce max_loop hard limit by (new) default Problem: The max_loop parameter is used for 2 different purposes: 1) initial number of loop devices to pre-create on init 2) maximum number of loop devices to add on access/open() Historically, its default value (zero) caused 1) to create non-zero number of devices (CONFIG_BLK_DEV_LOOP_MIN_COUNT), and no hard limit on 2) to add devices with autoloading. However, the default value changed in commit 85c50197716c ("loop: Fix the max_loop commandline argument treatment when it is set to 0") to CONFIG_BLK_DEV_LOOP_MIN_COUNT, for max_loop=0 not to pre-create devices. That does improve 1), but unfortunately it breaks 2), as the default behavior changed from no-limit to hard-limit. Example: For example, this userspace code broke for N >= CONFIG, if the user relied on the default value 0 for max_loop: mknod("/dev/loopN"); open("/dev/loopN"); // now fails with ENXIO Though affected users may "fix" it with (loop.)max_loop=0, this means to require a kernel parameter change on stable kernel update (that commit Fixes: an old commit in stable). Solution: The original semantics for the default value in 2) can be applied if the parameter is not set (ie, default behavior). This still keeps the intended function in 1) and 2) if set, and that commit's intended improvement in 1) if max_loop=0. Before 85c50197716c: - default: 1) CONFIG devices 2) no limit - max_loop=0: 1) CONFIG devices 2) no limit - max_loop=X: 1) X devices 2) X limit After 85c50197716c: - default: 1) CONFIG devices 2) CONFIG limit (*) - max_loop=0: 1) 0 devices (*) 2) no limit - max_loop=X: 1) X devices 2) X limit This commit: - default: 1) CONFIG devices 2) no limit (*) - max_loop=0: 1) 0 devices 2) no limit - max_loop=X: 1) X devices 2) X limit Future: The issue/regression from that commit only affects code under the CONFIG_BLOCK_LEGACY_AUTOLOAD deprecation guard, thus the fix too is contained under it. Once that deprecated functionality/code is removed, the purpose 2) of max_loop (hard limit) is no longer in use, so the module parameter description can be changed then. Tests: Linux 6.4-rc7 CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 CONFIG_BLOCK_LEGACY_AUTOLOAD=y - default (original) # ls -1 /dev/loop* /dev/loop-control /dev/loop0 ... /dev/loop7 # ./test-loop open: /dev/loop8: No such device or address - default (patched) # ls -1 /dev/loop* /dev/loop-control /dev/loop0 ... /dev/loop7 # ./test-loop # - max_loop=0 (original & patched): # ls -1 /dev/loop* /dev/loop-control # ./test-loop # - max_loop=8 (original & patched): # ls -1 /dev/loop* /dev/loop-control /dev/loop0 ... /dev/loop7 # ./test-loop open: /dev/loop8: No such device or address - max_loop=0 (patched; CONFIG_BLOCK_LEGACY_AUTOLOAD is not set) # ls -1 /dev/loop* /dev/loop-control # ./test-loop open: /dev/loop8: No such device or address Fixes: 85c50197716c ("loop: Fix the max_loop commandline argument treatment when it is set to 0") Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230720143033.841001-3-mfo@canonical.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6e56c3faacac..637c5bda2387 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1775,14 +1775,43 @@ static const struct block_device_operations lo_fops = { /* * If max_loop is specified, create that many devices upfront. * This also becomes a hard limit. If max_loop is not specified, + * the default isn't a hard limit (as before commit 85c50197716c + * changed the default value from 0 for max_loop=0 reasons), just * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module * init time. Loop devices can be requested on-demand with the * /dev/loop-control interface, or be instantiated by accessing * a 'dead' device node. */ static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT; -module_param(max_loop, int, 0444); + +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD +static bool max_loop_specified; + +static int max_loop_param_set_int(const char *val, + const struct kernel_param *kp) +{ + int ret; + + ret = param_set_int(val, kp); + if (ret < 0) + return ret; + + max_loop_specified = true; + return 0; +} + +static const struct kernel_param_ops max_loop_param_ops = { + .set = max_loop_param_set_int, + .get = param_get_int, +}; + +module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); +#else +module_param(max_loop, int, 0444); +MODULE_PARM_DESC(max_loop, "Initial number of loop devices"); +#endif + module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); @@ -2098,7 +2127,7 @@ static void loop_probe(dev_t dev) { int idx = MINOR(dev) >> part_shift; - if (max_loop && idx >= max_loop) + if (max_loop_specified && max_loop && idx >= max_loop) return; loop_add(idx); } @@ -2285,6 +2314,9 @@ module_exit(loop_exit); static int __init max_loop_setup(char *str) { max_loop = simple_strtol(str, NULL, 0); +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD + max_loop_specified = true; +#endif return 1; }