From 65fac0d54f374625b43a9d6ad1f2c212bd41f518 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 26 Jul 2022 20:22:24 +0800 Subject: [PATCH 1/6] blk-mq: fix io hung due to missing commit_rqs Currently, in virtio_scsi, if 'bd->last' is not set to true while dispatching request, such io will stay in driver's queue, and driver will wait for block layer to dispatch more rqs. However, if block layer failed to dispatch more rq, it should trigger commit_rqs to inform driver. There is a problem in blk_mq_try_issue_list_directly() that commit_rqs won't be called: // assume that queue_depth is set to 1, list contains two rq blk_mq_try_issue_list_directly blk_mq_request_issue_directly // dispatch first rq // last is false __blk_mq_try_issue_directly blk_mq_get_dispatch_budget // succeed to get first budget __blk_mq_issue_directly scsi_queue_rq cmd->flags |= SCMD_LAST virtscsi_queuecommand kick = (sc->flags & SCMD_LAST) != 0 // kick is false, first rq won't issue to disk queued++ blk_mq_request_issue_directly // dispatch second rq __blk_mq_try_issue_directly blk_mq_get_dispatch_budget // failed to get second budget ret == BLK_STS_RESOURCE blk_mq_request_bypass_insert // errors is still 0 if (!list_empty(list) || errors && ...) // won't pass, commit_rqs won't be called In this situation, first rq relied on second rq to dispatch, while second rq relied on first rq to complete, thus they will both hung. Fix the problem by also treat 'BLK_STS_*RESOURCE' as 'errors' since it means that request is not queued successfully. Same problem exists in blk_mq_dispatch_rq_list(), 'BLK_STS_*RESOURCE' can't be treated as 'errors' here, fix the problem by calling commit_rqs if queue_rq return 'BLK_STS_*RESOURCE'. Fixes: d666ba98f849 ("blk-mq: add mq_ops->commit_rqs()") Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220726122224.1790882-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3c1e6b6d991d..c96c8c4f751b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1931,7 +1931,8 @@ out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ - if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued) + if ((!list_empty(list) || errors || needs_resource || + ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued) q->mq_ops->commit_rqs(hctx); /* * Any items that need requeuing? Stuff them into hctx->dispatch, @@ -2660,6 +2661,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); if (ret != BLK_STS_OK) { + errors++; if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_mq_request_bypass_insert(rq, false, @@ -2667,7 +2669,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, break; } blk_mq_end_request(rq, ret); - errors++; } else queued++; } From c490a0b5a4f36da3918181a8acdc6991d967c5f3 Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Tue, 23 Aug 2022 21:38:10 +0530 Subject: [PATCH 2/6] loop: Check for overflow while configuring loop The userspace can configure a loop using an ioctl call, wherein a configuration of type loop_config is passed (see lo_ioctl()'s case on line 1550 of drivers/block/loop.c). This proceeds to call loop_configure() which in turn calls loop_set_status_from_info() (see line 1050 of loop.c), passing &config->info which is of type loop_info64*. This function then sets the appropriate values, like the offset. loop_device has lo_offset of type loff_t (see line 52 of loop.c), which is typdef-chained to long long, whereas loop_info64 has lo_offset of type __u64 (see line 56 of include/uapi/linux/loop.h). The function directly copies offset from info to the device as follows (See line 980 of loop.c): lo->lo_offset = info->lo_offset; This results in an overflow, which triggers a warning in iomap_iter() due to a call to iomap_iter_done() which has: WARN_ON_ONCE(iter->iomap.offset > iter->pos); Thus, check for negative value during loop_set_status_from_info(). Bug report: https://syzkaller.appspot.com/bug?id=c620fe14aac810396d3c3edc9ad73848bf69a29e Reported-and-tested-by: syzbot+a8e049cd3abd342936b6@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Siddh Raman Pant Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220823160810.181275-1-code@siddh.me Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e3c0ba93c1a3..ad92192c7d61 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -979,6 +979,11 @@ loop_set_status_from_info(struct loop_device *lo, lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; + + /* loff_t vars have been assigned __u64 */ + if (lo->lo_offset < 0 || lo->lo_sizelimit < 0) + return -EOVERFLOW; + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; lo->lo_flags = info->lo_flags; From 265ad47a40da581be77172b4a8e1fb72b2bd914a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 10 Aug 2022 11:20:12 -0700 Subject: [PATCH 3/6] md/raid10: Fix the data type of an r10_sync_page_io() argument Fix the following sparse warning: drivers/md/raid10.c:2647:60: sparse: sparse: incorrect type in argument 5 (different base types) @@ expected restricted blk_opf_t [usertype] opf @@ got int rw @@ This patch does not change any functionality since REQ_OP_READ = READ = 0 and since REQ_OP_WRITE = WRITE = 1. Cc: Rong A Chen Cc: Jens Axboe Cc: Paul Menzel Fixes: 4ce4c73f662b ("md/core: Combine two sync_page_io() arguments") Reported-by: kernel test robot Signed-off-by: Bart Van Assche Signed-off-by: Song Liu --- drivers/md/raid10.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9117fcdee1be..64d6e4cd8a3a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2639,18 +2639,18 @@ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) } static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, - int sectors, struct page *page, int rw) + int sectors, struct page *page, enum req_op op) { sector_t first_bad; int bad_sectors; if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) - && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) + && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) return -1; - if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) /* success */ return 1; - if (rw == WRITE) { + if (op == REQ_OP_WRITE) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, @@ -2780,7 +2780,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s, conf->tmppage, WRITE) + s, conf->tmppage, REQ_OP_WRITE) == 0) { /* Well, this device is dead */ pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", @@ -2814,8 +2814,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 switch (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, - s, conf->tmppage, - READ)) { + s, conf->tmppage, REQ_OP_READ)) { case 0: /* Well, this device is dead */ pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", From 5e8daf906f890560df430d30617c692a794acb73 Mon Sep 17 00:00:00 2001 From: David Sloan Date: Thu, 11 Aug 2022 11:14:13 -0600 Subject: [PATCH 4/6] md: Flush workqueue md_rdev_misc_wq in md_alloc() A race condition still exists when removing and re-creating md devices in test cases. However, it is only seen on some setups. The race condition was tracked down to a reference still being held to the kobject by the rdev in the md_rdev_misc_wq which will be released in rdev_delayed_delete(). md_alloc() waits for previous deletions by waiting on the md_misc_wq, but the md_rdev_misc_wq may still be holding a reference to a recently removed device. To fix this, also flush the md_rdev_misc_wq in md_alloc(). Signed-off-by: David Sloan [logang@deltatee.com: rewrote commit message] Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index afaf36b2f6ab..71d221601bf8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5620,6 +5620,7 @@ struct mddev *md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); + flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); From 1d258758cf06a0734482989911d184dd5837ed4e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 17 Aug 2022 20:05:13 +0800 Subject: [PATCH 5/6] Revert "md-raid: destroy the bitmap after destroying the thread" This reverts commit e151db8ecfb019b7da31d076130a794574c89f6f. Because it obviously breaks clustered raid as noticed by Neil though it fixed KASAN issue for dm-raid, let's revert it and fix KASAN issue in next commit. [1]. https://lore.kernel.org/linux-raid/a6657e08-b6a7-358b-2d2a-0ac37d49d23a@linux.dev/T/#m95ac225cab7409f66c295772483d091084a6d470 Fixes: e151db8ecfb0 ("md-raid: destroy the bitmap after destroying the thread") Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 71d221601bf8..107c4c953c35 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6239,11 +6239,11 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; + md_bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ if (mddev->event_work.func) flush_workqueue(md_misc_wq); - md_bitmap_destroy(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); From 0dd84b319352bb8ba64752d4e45396d8b13e6018 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 17 Aug 2022 20:05:14 +0800 Subject: [PATCH 6/6] md: call __md_stop_writes in md_stop From the link [1], we can see raid1d was running even after the path raid_dtr -> md_stop -> __md_stop. Let's stop write first in destructor to align with normal md-raid to fix the KASAN issue. [1]. https://lore.kernel.org/linux-raid/CAPhsuW5gc4AakdGNdF8ubpezAuDLFOYUO_sfMZcec6hQFm8nhg@mail.gmail.com/T/#m7f12bf90481c02c6d2da68c64aeed4779b7df74a Fixes: 48df498daf62 ("md: move bitmap_destroy to the beginning of __md_stop") Reported-by: Mikulas Patocka Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 107c4c953c35..729be2c5296c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6261,6 +6261,7 @@ void md_stop(struct mddev *mddev) /* stop the array and free an attached data structures. * This is called from dm-raid */ + __md_stop_writes(mddev); __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set);