From 284b94be1925dbe035ce5218d8b5c197321262c7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 26 Sep 2019 06:23:54 +0800 Subject: [PATCH 1/7] blk-mq: move lockdep_assert_held() into elevator_exit Commit c48dac137a62 ("block: don't hold q->sysfs_lock in elevator_init_mq") removes q->sysfs_lock from elevator_init_mq(), but forgot to deal with lockdep_assert_held() called in blk_mq_sched_free_requests() which is run in failure path of elevator_init_mq(). blk_mq_sched_free_requests() is called in the following 3 functions: elevator_init_mq() elevator_exit() blk_cleanup_queue() In blk_cleanup_queue(), blk_mq_sched_free_requests() is followed exactly by 'mutex_lock(&q->sysfs_lock)'. So moving the lockdep_assert_held() from blk_mq_sched_free_requests() into elevator_exit() for fixing the report by syzbot. Reported-by: syzbot+da3b7677bb913dc1b737@syzkaller.appspotmail.com Fixed: c48dac137a62 ("block: don't hold q->sysfs_lock in elevator_init_mq") Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 2 -- block/blk.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c9d183d6c499..ca22afd47b3d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -555,8 +555,6 @@ void blk_mq_sched_free_requests(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - lockdep_assert_held(&q->sysfs_lock); - queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); diff --git a/block/blk.h b/block/blk.h index ed347f7a97b1..25773d668ec0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -194,6 +194,8 @@ void elv_unregister_queue(struct request_queue *q); static inline void elevator_exit(struct request_queue *q, struct elevator_queue *e) { + lockdep_assert_held(&q->sysfs_lock); + blk_mq_sched_free_requests(q); __elevator_exit(q, e); } From b89f625e28d44552083f43752f62d8621ded0a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 23 Sep 2019 23:12:09 +0800 Subject: [PATCH 2/7] block: don't release queue's sysfs lock during switching elevator cecf5d87ff20 ("block: split .sysfs_lock into two locks") starts to release & acquire sysfs_lock before registering/un-registering elevator queue during switching elevator for avoiding potential deadlock from showing & storing 'queue/iosched' attributes and removing elevator's kobject. Turns out there isn't such deadlock because 'q->sysfs_lock' isn't required in .show & .store of queue/iosched's attributes, and just elevator's sysfs lock is acquired in elv_iosched_store() and elv_iosched_show(). So it is safe to hold queue's sysfs lock when registering/un-registering elevator queue. The biggest issue is that commit cecf5d87ff20 assumes that concurrent write on 'queue/scheduler' can't happen. However, this assumption isn't true, because kernfs_fop_write() only guarantees that concurrent write aren't called on the same open file, but the write could be from different open on the file. So we can't release & re-acquire queue's sysfs lock during switching elevator, otherwise use-after-free on elevator could be triggered. Fixes the issue by not releasing queue's sysfs lock during switching elevator. Fixes: cecf5d87ff20 ("block: split .sysfs_lock into two locks") Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Greg KH Cc: Mike Snitzer Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 13 ++++--------- block/elevator.c | 31 +------------------------------ 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b82736c781c5..962fc0c44381 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -989,13 +989,11 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); } - /* - * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator - * switch won't happen at all. - */ + mutex_lock(&q->sysfs_lock); if (q->elevator) { ret = elv_register_queue(q, false); if (ret) { + mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); @@ -1005,7 +1003,6 @@ int blk_register_queue(struct gendisk *disk) has_elevator = true; } - mutex_lock(&q->sysfs_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); @@ -1062,12 +1059,10 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - /* - * q->kobj has been removed, so it is safe to check if elevator - * exists without holding q->sysfs_lock. - */ + mutex_lock(&q->sysfs_lock); if (q->elevator) elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); diff --git a/block/elevator.c b/block/elevator.c index bba10e83478a..5437059c9261 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -503,9 +503,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); - mutex_lock(&q->sysfs_lock); e->registered = 1; - mutex_unlock(&q->sysfs_lock); } return error; } @@ -523,11 +521,9 @@ void elv_unregister_queue(struct request_queue *q) kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); - mutex_lock(&q->sysfs_lock); e->registered = 0; /* Re-enable throttling in case elevator disabled it */ wbt_enable_default(q); - mutex_unlock(&q->sysfs_lock); } } @@ -590,32 +586,11 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock); if (q->elevator) { - if (q->elevator->registered) { - mutex_unlock(&q->sysfs_lock); - - /* - * Concurrent elevator switch can't happen becasue - * sysfs write is always exclusively on same file. - * - * Also the elevator queue won't be freed after - * sysfs_lock is released becasue kobject_del() in - * blk_unregister_queue() waits for completion of - * .store & .show on its attributes. - */ + if (q->elevator->registered) elv_unregister_queue(q); - mutex_lock(&q->sysfs_lock); - } ioc_clear_queue(q); elevator_exit(q, q->elevator); - - /* - * sysfs_lock may be dropped, so re-check if queue is - * unregistered. If yes, don't switch to new elevator - * any more - */ - if (!blk_queue_registered(q)) - return 0; } ret = blk_mq_init_sched(q, new_e); @@ -623,11 +598,7 @@ int elevator_switch_mq(struct request_queue *q, goto out; if (new_e) { - mutex_unlock(&q->sysfs_lock); - ret = elv_register_queue(q, true); - - mutex_lock(&q->sysfs_lock); if (ret) { elevator_exit(q, q->elevator); goto out; From 25d41e4aadb0788b4fae8a8fca90f437b9ebd727 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 16:02:07 -0700 Subject: [PATCH 3/7] iocost: better trace vrate changes vrate_adj tracepoint traces vrate changes; however, it does so only when busy_level is non-zero. busy_level turning to zero can sometimes be as interesting an event. This patch also enables vrate_adj tracepoint on other vrate related events - busy_level changes and non-zero nr_lagging. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 3b39deb8b9f8..32d4d6d86a99 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1343,7 +1343,7 @@ static void ioc_timer_fn(struct timer_list *timer) u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; - int i; + int prev_busy_level, i; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); @@ -1531,6 +1531,7 @@ skip_surplus_transfers: * and experiencing shortages but not surpluses, we're too stingy * and should increase vtime rate. */ + prev_busy_level = ioc->busy_level; if (rq_wait_pct > RQ_WAIT_BUSY_PCT || missed_ppm[READ] > ppm_rthr || missed_ppm[WRITE] > ppm_wthr) { @@ -1592,6 +1593,10 @@ skip_surplus_transfers: atomic64_set(&ioc->vtime_rate, vrate); ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); + } else if (ioc->busy_level != prev_busy_level || nr_lagging) { + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + &missed_ppm, rq_wait_pct, nr_lagging, + nr_shortages, nr_surpluses); } ioc_refresh_params(ioc, false); From 7cd806a9a953f234b9865c30028f47fd738ce375 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 16:03:09 -0700 Subject: [PATCH 4/7] iocost: improve nr_lagging handling Some IOs may span multiple periods. As latencies are collected on completion, the inbetween periods won't register them and may incorrectly decide to increase vrate. nr_lagging tracks these IOs to avoid those situations. Currently, whenever there are IOs which are spanning from the previous period, busy_level is reset to 0 if negative thus suppressing vrate increase. This has the following two problems. * When latency target percentiles aren't set, vrate adjustment should only be governed by queue depth depletion; however, the current code keeps nr_lagging active which pulls in latency results and can keep down vrate unexpectedly. * When lagging condition is detected, it resets the entire negative busy_level. This turned out to be way too aggressive on some devices which sometimes experience extended latencies on a small subset of commands. In addition, a lagging IO will be accounted as latency target miss on completion anyway and resetting busy_level amplifies its impact unnecessarily. This patch fixes the above two problems by disabling nr_lagging counting when latency target percentiles aren't set and blocking vrate increases when there are lagging IOs while leaving busy_level as-is. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 32d4d6d86a99..10160de62e3b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1407,7 +1407,8 @@ static void ioc_timer_fn(struct timer_list *timer) * comparing vdone against period start. If lagging behind * IOs from past periods, don't increase vrate. */ - if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) && + if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && + !atomic_read(&iocg_to_blkg(iocg)->use_delay) && time_after64(vtime, vdone) && time_after64(vtime, now.vnow - MAX_LAGGING_PERIODS * period_vtime) && @@ -1537,21 +1538,23 @@ skip_surplus_transfers: missed_ppm[WRITE] > ppm_wthr) { ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; - } else if (nr_lagging) { - ioc->busy_level = max(ioc->busy_level, 0); - } else if (nr_shortages && !nr_surpluses && - rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && + } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { - ioc->busy_level = min(ioc->busy_level, 0); - ioc->busy_level--; + /* take action iff there is contention */ + if (nr_shortages && !nr_lagging) { + ioc->busy_level = min(ioc->busy_level, 0); + /* redistribute surpluses first */ + if (!nr_surpluses) + ioc->busy_level--; + } } else { ioc->busy_level = 0; } ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level) { + if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { u64 vrate = atomic64_read(&ioc->vtime_rate); u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; From 7afcccafa59fb63b58f863a6c5e603a34625955b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 16:03:35 -0700 Subject: [PATCH 5/7] iocost: bump up default latency targets for hard disks The default hard disk param sets latency targets at 50ms. As the default target percentiles are zero, these don't directly regulate vrate; however, they're still used to calculate the period length - 100ms in this case. This is excessively low. A SATA drive with QD32 saturated with random IOs can easily reach avg completion latency of several hundred msecs. A period duration which is substantially lower than avg completion latency can lead to wildly fluctuating vrate. Let's bump up the default latency targets to 250ms so that the period duration is sufficiently long. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 10160de62e3b..2a3db80c1dce 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -529,8 +529,8 @@ struct iocg_wake_ctx { static const struct ioc_params autop[] = { [AUTOP_HDD] = { .qos = { - [QOS_RLAT] = 50000, /* 50ms */ - [QOS_WLAT] = 50000, + [QOS_RLAT] = 250000, /* 250ms */ + [QOS_WLAT] = 250000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, From 2af2783f2ea4f273598557b247e320509247df80 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Tue, 17 Sep 2019 20:04:27 +0800 Subject: [PATCH 6/7] rq-qos: get rid of redundant wbt_update_limits() We have updated limits after calling wbt_set_min_lat(). No need to update again. Reviewed-by: Bob Liu Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 962fc0c44381..46f5198be017 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -482,7 +482,6 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); - wbt_update_limits(q); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); From 8d6996630c03d7ceeabe2611378fea5ca1c3f1b3 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 27 Sep 2019 16:19:55 +0800 Subject: [PATCH 7/7] block: fix null pointer dereference in blk_mq_rq_timed_out() We got a null pointer deference BUG_ON in blk_mq_rq_timed_out() as following: [ 108.825472] BUG: kernel NULL pointer dereference, address: 0000000000000040 [ 108.827059] PGD 0 P4D 0 [ 108.827313] Oops: 0000 [#1] SMP PTI [ 108.827657] CPU: 6 PID: 198 Comm: kworker/6:1H Not tainted 5.3.0-rc8+ #431 [ 108.829503] Workqueue: kblockd blk_mq_timeout_work [ 108.829913] RIP: 0010:blk_mq_check_expired+0x258/0x330 [ 108.838191] Call Trace: [ 108.838406] bt_iter+0x74/0x80 [ 108.838665] blk_mq_queue_tag_busy_iter+0x204/0x450 [ 108.839074] ? __switch_to_asm+0x34/0x70 [ 108.839405] ? blk_mq_stop_hw_queue+0x40/0x40 [ 108.839823] ? blk_mq_stop_hw_queue+0x40/0x40 [ 108.840273] ? syscall_return_via_sysret+0xf/0x7f [ 108.840732] blk_mq_timeout_work+0x74/0x200 [ 108.841151] process_one_work+0x297/0x680 [ 108.841550] worker_thread+0x29c/0x6f0 [ 108.841926] ? rescuer_thread+0x580/0x580 [ 108.842344] kthread+0x16a/0x1a0 [ 108.842666] ? kthread_flush_work+0x170/0x170 [ 108.843100] ret_from_fork+0x35/0x40 The bug is caused by the race between timeout handle and completion for flush request. When timeout handle function blk_mq_rq_timed_out() try to read 'req->q->mq_ops', the 'req' have completed and reinitiated by next flush request, which would call blk_rq_init() to clear 'req' as 0. After commit 12f5b93145 ("blk-mq: Remove generation seqeunce"), normal requests lifetime are protected by refcount. Until 'rq->ref' drop to zero, the request can really be free. Thus, these requests cannot been reused before timeout handle finish. However, flush request has defined .end_io and rq->end_io() is still called even if 'rq->ref' doesn't drop to zero. After that, the 'flush_rq' can be reused by the next flush request handle, resulting in null pointer deference BUG ON. We fix this problem by covering flush request with 'rq->ref'. If the refcount is not zero, flush_end_io() return and wait the last holder recall it. To record the request status, we add a new entry 'rq_status', which will be used in flush_end_io(). Cc: Christoph Hellwig Cc: Keith Busch Cc: Bart Van Assche Cc: stable@vger.kernel.org # v4.18+ Reviewed-by: Ming Lei Reviewed-by: Bob Liu Signed-off-by: Yufen Yu ------- v2: - move rq_status from struct request to struct blk_flush_queue v3: - remove unnecessary '{}' pair. v4: - let spinlock to protect 'fq->rq_status' v5: - move rq_status after flush_running_idx member of struct blk_flush_queue Signed-off-by: Jens Axboe --- block/blk-flush.c | 10 ++++++++++ block/blk-mq.c | 5 ++++- block/blk.h | 7 +++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index aedd9320e605..1eec9cbe5a0a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -214,6 +214,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + + if (!refcount_dec_and_test(&flush_rq->ref)) { + fq->rq_status = error; + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + return; + } + + if (fq->rq_status != BLK_STS_OK) + error = fq->rq_status; + hctx = flush_rq->mq_hctx; if (!q->elevator) { blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); diff --git a/block/blk-mq.c b/block/blk-mq.c index 29275f5a996f..6e3b15f70cd7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -918,7 +918,10 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, */ if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - if (refcount_dec_and_test(&rq->ref)) + + if (is_flush_rq(rq, hctx)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); return true; diff --git a/block/blk.h b/block/blk.h index 25773d668ec0..47fba9362e60 100644 --- a/block/blk.h +++ b/block/blk.h @@ -19,6 +19,7 @@ struct blk_flush_queue { unsigned int flush_queue_delayed:1; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; + blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; @@ -47,6 +48,12 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +static inline bool +is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) +{ + return hctx->fq->flush_rq == req; +} + struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q);