From ae8650b45d1837aae117fa147aeef69540bb3fe8 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Fri, 19 Jul 2024 15:15:04 +0800
Subject: [PATCH 001/120] blk-cgroup: check for pd_(alloc|free)_fn in
 blkcg_activate_policy()

Currently all policies implement pd_(alloc|free)_fn; however, this is not
necessary for ioprio, which only works on the blkcg level, not the blkg
level. There are no functional changes; this prepares for cleaning up the
activation of the ioprio policy.

Signed-off-by: Yu Kuai
Reviewed-by: Christoph Hellwig
Acked-by: Tejun Heo
Link: https://lore.kernel.org/r/20240719071506.158075-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 69e70964398c..fa6ec02ab3cf 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1554,6 +1554,14 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	if (blkcg_policy_enabled(q, pol))
 		return 0;
 
+	/*
+	 * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
+	 * for example, ioprio. Such policy will work on blkcg level, not disk
+	 * level, and don't need to be activated.
+	 */
+	if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
+		return -EINVAL;
+
 	if (queue_is_mq(q))
 		blk_mq_freeze_queue(q);
 retry:
@@ -1733,9 +1741,12 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 			goto err_unlock;
 	}
 
-	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
+	/*
+	 * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
+	 * without pd_alloc_fn/pd_free_fn can't be activated.
+	 */
 	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
-	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
 		goto err_unlock;
 
 	/* register @pol */

From d0e92795997c8dce7ec804791e359a93211f9719 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Fri, 19 Jul 2024 15:15:05 +0800
Subject: [PATCH 002/120] blk-ioprio: remove ioprio_blkcg_from_bio()

Currently, if the config option is enabled, ioprio is always enabled by
default from blkcg_init_disk(), so there is no point in checking from the
blkg whether the policy is enabled in ioprio_blkcg_from_bio(). Remove the
helper and get the blkcg directly from the bio.
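For illustration — a minimal sketch (derived from the diff below, with the
blkg association assumed to be done by bio_associate_blkg()) of what the
per-bio lookup reduces to once the policy is known to be registered:

	/*
	 * Sketch: blkcg_set_ioprio() can go straight from the bio to the
	 * per-cgroup ioprio data, since the cpd exists for every blkcg
	 * once the policy is registered.
	 */
	struct ioprio_blkcg *blkcg =
		blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg);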
Signed-off-by: Yu Kuai
Reviewed-by: Christoph Hellwig
Acked-by: Tejun Heo
Reviewed-by: Bart Van Assche
Link: https://lore.kernel.org/r/20240719071506.158075-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/blk-ioprio.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 4051fada01f1..ae52b418e984 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -84,16 +84,6 @@ ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
 	return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
 }
 
-static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
-{
-	struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);
-
-	if (!pd)
-		return NULL;
-
-	return blkcg_to_ioprio_blkcg(pd->blkg->blkcg);
-}
-
 static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
 {
 	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));
@@ -186,7 +176,7 @@ static struct blkcg_policy ioprio_policy = {
 
 void blkcg_set_ioprio(struct bio *bio)
 {
-	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
+	struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg);
 	u16 prio;
 
 	if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)

From 79c6c60a6c1f5128b31ab80b7ac1c4d8255342ac Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Fri, 19 Jul 2024 15:15:06 +0800
Subject: [PATCH 003/120] blk-ioprio: remove per-disk structure

ioprio works on the blk-cgroup level: all disks in the same cgroup share
the same setting, and struct ioprio_blkg doesn't contain anything. Hence
registering the policy is enough, because cpd_alloc/free_fn are handled
for each blk-cgroup, and there is no need to activate the policy per
disk. Remove blk_ioprio_init/exit and ioprio_alloc/free_pd.

Signed-off-by: Yu Kuai
Reviewed-by: Christoph Hellwig
Acked-by: Tejun Heo
Reviewed-by: Bart Van Assche
Link: https://lore.kernel.org/r/20240719071506.158075-4-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c |  8 --------
 block/blk-ioprio.c | 45 ---------------------------------------------
 block/blk-ioprio.h |  9 ---------
 3 files changed, 62 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fa6ec02ab3cf..e68c725cf8d9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1458,7 +1458,6 @@ int blkcg_init_disk(struct gendisk *disk)
 	struct request_queue *q = disk->queue;
 	struct blkcg_gq *new_blkg, *blkg;
 	bool preloaded;
-	int ret;
 
 	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
 	if (!new_blkg)
@@ -1478,15 +1477,8 @@ int blkcg_init_disk(struct gendisk *disk)
 	if (preloaded)
 		radix_tree_preload_end();
 
-	ret = blk_ioprio_init(disk);
-	if (ret)
-		goto err_destroy_all;
-
 	return 0;
 
-err_destroy_all:
-	blkg_destroy_all(disk);
-	return ret;
 err_unlock:
 	spin_unlock_irq(&q->queue_lock);
 	if (preloaded)
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index ae52b418e984..8fff7ccc0ac7 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -49,14 +49,6 @@ static const char *policy_name[] = {
 
 static struct blkcg_policy ioprio_policy;
 
-/**
- * struct ioprio_blkg - Per (cgroup, request queue) data.
- * @pd: blkg_policy_data structure.
- */
-struct ioprio_blkg {
-	struct blkg_policy_data pd;
-};
-
 /**
  * struct ioprio_blkcg - Per cgroup data.
  * @cpd: blkcg_policy_data structure.
@@ -67,11 +59,6 @@ struct ioprio_blkcg {
 	enum prio_policy prio_policy;
 };
 
-static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
-{
-	return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL;
-}
-
 static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
 {
 	return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
@@ -108,25 +95,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }
 
-static struct blkg_policy_data *
-ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp)
-{
-	struct ioprio_blkg *ioprio_blkg;
-
-	ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp);
-	if (!ioprio_blkg)
-		return NULL;
-
-	return &ioprio_blkg->pd;
-}
-
-static void ioprio_free_pd(struct blkg_policy_data *pd)
-{
-	struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd);
-
-	kfree(ioprio_blkg);
-}
-
 static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
 {
 	struct ioprio_blkcg *blkcg;
@@ -169,9 +137,6 @@ static struct blkcg_policy ioprio_policy = {
 
 	.cpd_alloc_fn	= ioprio_alloc_cpd,
 	.cpd_free_fn	= ioprio_free_cpd,
-
-	.pd_alloc_fn	= ioprio_alloc_pd,
-	.pd_free_fn	= ioprio_free_pd,
 };
 
 void blkcg_set_ioprio(struct bio *bio)
@@ -209,16 +174,6 @@ void blkcg_set_ioprio(struct bio *bio)
 	bio->bi_ioprio = prio;
 }
 
-void blk_ioprio_exit(struct gendisk *disk)
-{
-	blkcg_deactivate_policy(disk, &ioprio_policy);
-}
-
-int blk_ioprio_init(struct gendisk *disk)
-{
-	return blkcg_activate_policy(disk, &ioprio_policy);
-}
-
 static int __init ioprio_init(void)
 {
 	return blkcg_policy_register(&ioprio_policy);
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
index b6afb8e80de0..9265143f9bc9 100644
--- a/block/blk-ioprio.h
+++ b/block/blk-ioprio.h
@@ -9,17 +9,8 @@ struct request_queue;
 struct bio;
 
 #ifdef CONFIG_BLK_CGROUP_IOPRIO
-int blk_ioprio_init(struct gendisk *disk);
-void blk_ioprio_exit(struct gendisk *disk);
 void blkcg_set_ioprio(struct bio *bio);
 #else
-static inline int blk_ioprio_init(struct gendisk *disk)
-{
-	return 0;
-}
-static inline void blk_ioprio_exit(struct gendisk *disk)
-{
-}
 static inline void blkcg_set_ioprio(struct bio *bio)
 {
 }

From 23a55f4492fcf868d068da31a2cd30c15f46207d Mon Sep 17 00:00:00 2001
From: Ofir Gal
Date: Thu, 18 Jul 2024 11:45:12 +0300
Subject: [PATCH 004/120] net: introduce helper sendpages_ok()

Network drivers use sendpage_ok() to check the first page of an iterator
in order to disable MSG_SPLICE_PAGES, but the iterator can represent a
list of contiguous pages.

When MSG_SPLICE_PAGES is enabled, skb_splice_from_iter() is used; it
requires all pages in the iterator to be sendable, so each page needs to
be checked.

Introduce a helper, sendpages_ok(), which returns true if all the
contiguous pages are sendable. Drivers that want to send contiguous pages
with MSG_SPLICE_PAGES may use this helper to check whether the page list
is OK. If the helper does not return true, the driver should remove the
MSG_SPLICE_PAGES flag.

Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ofir Gal
Acked-by: Jakub Kicinski
Link: https://lore.kernel.org/r/20240718084515.3833733-2-ofir.gal@volumez.com
Signed-off-by: Jens Axboe
---
 include/linux/net.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/include/linux/net.h b/include/linux/net.h
index 688320b79fcc..b75bc534c1b3 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -322,6 +322,25 @@ static inline bool sendpage_ok(struct page *page)
 	return !PageSlab(page) && page_count(page) >= 1;
 }
 
+/*
+ * Check sendpage_ok on contiguous pages.
+ */
+static inline bool sendpages_ok(struct page *page, size_t len, size_t offset)
+{
+	struct page *p = page + (offset >> PAGE_SHIFT);
+	size_t count = 0;
+
+	while (count < len) {
+		if (!sendpage_ok(p))
+			return false;
+
+		p++;
+		count += PAGE_SIZE;
+	}
+
+	return true;
+}
+
 int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
 		   size_t num, size_t len);
 int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,

From 6af7331a70b4888df43ec1d7e1803ae2c43b6981 Mon Sep 17 00:00:00 2001
From: Ofir Gal
Date: Thu, 18 Jul 2024 11:45:13 +0300
Subject: [PATCH 005/120] nvme-tcp: use sendpages_ok() instead of sendpage_ok()

Currently nvme_tcp_try_send_data() uses sendpage_ok() to decide whether
to disable MSG_SPLICE_PAGES, but it only checks the first page of the
iterator, and the iterator may represent contiguous pages.

MSG_SPLICE_PAGES enables skb_splice_from_iter(), which checks all the
pages it sends with sendpage_ok(). When nvme_tcp_try_send_data() sends an
iterator whose first page is sendable but one of the other pages isn't,
skb_splice_from_iter() warns and aborts the data transfer.

Use the new helper sendpages_ok() to decide whether to disable
MSG_SPLICE_PAGES, which solves the issue.

Reviewed-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Ofir Gal
Link: https://lore.kernel.org/r/20240718084515.3833733-3-ofir.gal@volumez.com
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a2a47d3ab99f..9ea6be0b0392 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1051,7 +1051,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 		else
 			msg.msg_flags |= MSG_MORE;
 
-		if (!sendpage_ok(page))
+		if (!sendpages_ok(page, len, offset))
 			msg.msg_flags &= ~MSG_SPLICE_PAGES;
 
 		bvec_set_page(&bvec, page, len, offset);

From 7960af373ade3b39e10106ef415e43a1d2aa48c6 Mon Sep 17 00:00:00 2001
From: Ofir Gal
Date: Thu, 18 Jul 2024 11:45:14 +0300
Subject: [PATCH 006/120] drbd: use sendpages_ok() instead of sendpage_ok()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently _drbd_send_page() uses sendpage_ok() to decide whether to
enable MSG_SPLICE_PAGES, but it only checks the first page of the
iterator, and the iterator may represent contiguous pages.

MSG_SPLICE_PAGES enables skb_splice_from_iter(), which checks all the
pages it sends with sendpage_ok(). When _drbd_send_page() sends an
iterator whose first page is sendable but one of the other pages isn't,
skb_splice_from_iter() warns and aborts the data transfer.

Use the new helper sendpages_ok() to decide whether to enable
MSG_SPLICE_PAGES, which solves the issue.

Acked-by: Christophe Böhmwalder
Signed-off-by: Ofir Gal
Link: https://lore.kernel.org/r/20240718084515.3833733-4-ofir.gal@volumez.com
Signed-off-by: Jens Axboe
---
 drivers/block/drbd/drbd_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a9e49b212341..449123eb54bf 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1550,7 +1550,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa
 	 * put_page(); and would cause either a VM_BUG directly, or
 	 * __page_cache_release a page that would actually still be referenced
 	 * by someone, leading to some obscure delayed Oops somewhere else.
	 */
-	if (!drbd_disable_sendpage && sendpage_ok(page))
+	if (!drbd_disable_sendpage && sendpages_ok(page, len, offset))
 		msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES;
 
 	drbd_update_congested(peer_device->connection);

From 7543ae2269a855683a39af57048035f44bc8ef9c Mon Sep 17 00:00:00 2001
From: Wouter Verhelst
Date: Thu, 25 Jul 2024 18:45:36 +0200
Subject: [PATCH 007/120] nbd: add support for rotational devices

The NBD protocol defines the flag NBD_FLAG_ROTATIONAL to signal that the
export in use should be treated as a rotational device. Add support for
that flag to the kernel driver.

Signed-off-by: Wouter Verhelst
Reviewed-by: Eric Blake
Link: https://lore.kernel.org/r/20240725164536.1275851-1-w@uter.be
Signed-off-by: Jens Axboe
---
 drivers/block/nbd.c      | 3 +++
 include/uapi/linux/nbd.h | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 41a90150b501..5b1811b1ba5f 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -350,6 +350,9 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
 		lim.features |= BLK_FEAT_WRITE_CACHE;
 		lim.features &= ~BLK_FEAT_FUA;
 	}
+	if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
+		lim.features |= BLK_FEAT_ROTATIONAL;
+
 	lim.logical_block_size = blksize;
 	lim.physical_block_size = blksize;
 	error = queue_limits_commit_update(nbd->disk->queue, &lim);
diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index 80ce0ef43afd..d75215f2c675 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -51,8 +51,9 @@ enum {
 #define NBD_FLAG_READ_ONLY	(1 << 1) /* device is read-only */
 #define NBD_FLAG_SEND_FLUSH	(1 << 2) /* can flush writeback cache */
 #define NBD_FLAG_SEND_FUA	(1 << 3) /* send FUA (forced unit access) */
-/* there is a gap here to match userspace */
+#define NBD_FLAG_ROTATIONAL	(1 << 4) /* device is rotational */
 #define NBD_FLAG_SEND_TRIM	(1 << 5) /* send trim/discard */
+/* there is a gap here to match userspace */
 #define NBD_FLAG_CAN_MULTI_CONN	(1 << 8) /* Server supports multiple connections per export. */
 
 /* values for cmd flags in the upper 16 bits of request type */

From f48ada402d2f1e46fa241bcc6725bdde70725e15 Mon Sep 17 00:00:00 2001
From: YueHaibing
Date: Fri, 2 Aug 2024 17:51:47 +0800
Subject: [PATCH 008/120] drbd: Remove unused extern declarations

Commit b411b3637fa7 ("The DRBD driver") declared but never implemented
drbd_read_remote(), is_valid_ar_handle() and drbd_set_recv_tcq(), and
commit 668700b40a7c ("drbd: Create a dedicated workqueue for sending acks
on the control connection") never implemented drbd_send_ping_wf().

Commit 2451fc3b2bd3 ("drbd: Removed the BIO_RW_BARRIER support form the
receiver/epoch code") left the w_e_reissue() declaration unused, and
commit 8fe605513ab4 ("drbd: Rename drbdd_init() -> drbd_receiver()")
renamed drbdd_init() but left the unused declaration behind. Also,
drbd_asender() was removed in commit 1c03e52083c8 ("drbd: Rename asender
to ack_receiver").
Signed-off-by: YueHaibing
Link: https://lore.kernel.org/r/20240802095147.2788218-1-yuehaibing@huawei.com
Signed-off-by: Jens Axboe
---
 drivers/block/drbd/drbd_int.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 94dc0a235919..d2937bca1fe4 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -297,10 +297,6 @@ struct drbd_epoch {
 	unsigned long flags;
 };
 
-/* Prototype declaration of function defined in drbd_receiver.c */
-int drbdd_init(struct drbd_thread *);
-int drbd_asender(struct drbd_thread *);
-
 /* drbd_epoch flag bits */
 enum {
 	DE_HAVE_BARRIER_NUMBER,
@@ -1390,9 +1386,6 @@ extern void conn_free_crypto(struct drbd_connection *connection);
 extern void do_submit(struct work_struct *ws);
 extern void __drbd_make_request(struct drbd_device *, struct bio *);
 void drbd_submit_bio(struct bio *bio);
-extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
-extern int is_valid_ar_handle(struct drbd_request *, sector_t);
-
 
 /* drbd_nl.c */
 
@@ -1474,7 +1467,6 @@ extern int w_resync_timer(struct drbd_work *, int);
 extern int w_send_write_hint(struct drbd_work *, int);
 extern int w_send_dblock(struct drbd_work *, int);
 extern int w_send_read_req(struct drbd_work *, int);
-extern int w_e_reissue(struct drbd_work *, int);
 extern int w_restart_disk_io(struct drbd_work *, int);
 extern int w_send_out_of_sync(struct drbd_work *, int);
 
@@ -1488,7 +1480,6 @@ extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
 		sector_t start, unsigned int nr_sectors, int flags);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_ack_receiver(struct drbd_thread *thi);
-extern void drbd_send_ping_wf(struct work_struct *ws);
 extern void drbd_send_acks_wf(struct work_struct *ws);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
 extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector,
@@ -1504,7 +1495,6 @@ extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request
 #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
 #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
 extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
-extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
 extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
 extern int drbd_connected(struct drbd_peer_device *);

From 9327b51c9a9c864f5177127e09851da9d78b4943 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 12 Aug 2024 09:36:24 +0800
Subject: [PATCH 009/120] ublk: move zone report data out of request pdu

ublk zoned takes 16 bytes in each request pdu just for handling the
REPORT_ZONE operation, which wastes memory since the request pdu is
allocated statically.

Store the transient zone report data in one global xarray instead, and
remove it once the report zone request is completed. This is reasonable
since report zone runs in the slow path.
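The xarray technique in miniature — a hedged sketch with illustrative
names, keying transient per-request data on the request pointer (the
patch below stores a descriptor that lives on the caller's stack, which
is safe only because it is erased again before the caller returns):

	static DEFINE_XARRAY(descs);

	/* slow path only, so a GFP_KERNEL insert per request is fine */
	static int desc_stash(struct request *req, void *data)
	{
		return xa_insert(&descs, (unsigned long)req, data, GFP_KERNEL);
	}

	static void *desc_find(struct request *req)
	{
		return xa_load(&descs, (unsigned long)req);
	}

	static void desc_drop(struct request *req)
	{
		xa_erase(&descs, (unsigned long)req);
	}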
Fixes: 29802d7ca33b ("ublk: enable zoned storage support")
Cc: Damien Le Moal
Cc: Andreas Hindborg
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20240812013624.587587-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 62 +++++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 890c08792ba8..0e295471318a 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -71,9 +71,6 @@ struct ublk_rq_data {
 	struct llist_node node;
 
 	struct kref ref;
-	__u64 sector;
-	__u32 operation;
-	__u32 nr_zones;
 };
 
 struct ublk_uring_cmd_pdu {
@@ -214,6 +211,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
 
 #ifdef CONFIG_BLK_DEV_ZONED
 
+struct ublk_zoned_report_desc {
+	__u64 sector;
+	__u32 operation;
+	__u32 nr_zones;
+};
+
+static DEFINE_XARRAY(ublk_zoned_report_descs);
+
+static int ublk_zoned_insert_report_desc(const struct request *req,
+		struct ublk_zoned_report_desc *desc)
+{
+	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
+			    desc, GFP_KERNEL);
+}
+
+static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
+		const struct request *req)
+{
+	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
+}
+
+static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
+		const struct request *req)
+{
+	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
+}
+
 static int ublk_get_nr_zones(const struct ublk_device *ub)
 {
 	const struct ublk_param_basic *p = &ub->params.basic;
@@ -308,7 +332,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
 		unsigned int zones_in_request =
 			min_t(unsigned int, remaining_zones, max_zones_per_request);
 		struct request *req;
-		struct ublk_rq_data *pdu;
+		struct ublk_zoned_report_desc desc;
 		blk_status_t status;
 
 		memset(buffer, 0, buffer_length);
@@ -319,20 +343,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
 			goto out;
 		}
 
-		pdu = blk_mq_rq_to_pdu(req);
-		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
-		pdu->sector = sector;
-		pdu->nr_zones = zones_in_request;
+		desc.operation = UBLK_IO_OP_REPORT_ZONES;
+		desc.sector = sector;
+		desc.nr_zones = zones_in_request;
+		ret = ublk_zoned_insert_report_desc(req, &desc);
+		if (ret)
+			goto free_req;
 
 		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
 					GFP_KERNEL);
-		if (ret) {
-			blk_mq_free_request(req);
-			goto out;
-		}
+		if (ret)
+			goto erase_desc;
 
 		status = blk_execute_rq(req, 0);
 		ret = blk_status_to_errno(status);
+erase_desc:
+		ublk_zoned_erase_report_desc(req);
+free_req:
 		blk_mq_free_request(req);
 		if (ret)
 			goto out;
@@ -366,7 +393,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 {
 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
 	struct ublk_io *io = &ubq->ios[req->tag];
-	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
+	struct ublk_zoned_report_desc *desc;
 	u32 ublk_op;
 
 	switch (req_op(req)) {
@@ -389,12 +416,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
 		break;
 	case REQ_OP_DRV_IN:
-		ublk_op = pdu->operation;
+		desc = ublk_zoned_get_report_desc(req);
+		if (!desc)
+			return BLK_STS_IOERR;
+		ublk_op = desc->operation;
 		switch (ublk_op) {
 		case UBLK_IO_OP_REPORT_ZONES:
 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
-			iod->nr_zones = pdu->nr_zones;
-			iod->start_sector = pdu->sector;
+			iod->nr_zones = desc->nr_zones;
+			iod->start_sector = desc->sector;
 			return BLK_STS_OK;
 		default:
 			return BLK_STS_IOERR;

From 49923a0dff59fa6b34aa6cc16dc9eefdbbcd3846 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Mon, 12 Aug 2024 21:10:33 +0300
Subject: [PATCH 010/120] block: delete module stuff from t10-pi

It is not possible to build t10-pi.ko anymore, and this file doesn't even
export functions.

Signed-off-by: Alexey Dobriyan
Reviewed-by: Martin K. Petersen
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/216ccc79-5b80-47b2-b507-990951aa810c@p183
Signed-off-by: Jens Axboe
---
 block/t10-pi.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/block/t10-pi.c b/block/t10-pi.c
index 425e2836f3e1..4a90d200ae7b 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -8,7 +8,6 @@
 #include 
 #include 
 #include 
-#include <linux/module.h>
 #include 
 #include 
 #include "blk.h"
@@ -472,6 +471,3 @@ void blk_integrity_complete(struct request *rq, unsigned int nr_bytes)
 	else
 		t10_pi_type1_complete(rq, nr_bytes);
 }
-
-MODULE_DESCRIPTION("T10 Protection Information module");
-MODULE_LICENSE("GPL");

From a28dc358e28fb0738dd23e401cd7646cb4b0f7f1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Mon, 12 Aug 2024 21:12:10 +0300
Subject: [PATCH 011/120] block: constify ext_pi_ref_escape()

This function doesn't mutate data.

Signed-off-by: Alexey Dobriyan
Reviewed-by: Martin K. Petersen
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/d24611b3-dddf-473a-903d-39290db03b11@p183
Signed-off-by: Jens Axboe
---
 block/t10-pi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/t10-pi.c b/block/t10-pi.c
index 4a90d200ae7b..e7052a728966 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -239,9 +239,9 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
 	}
 }
 
-static bool ext_pi_ref_escape(u8 *ref_tag)
+static bool ext_pi_ref_escape(const u8 ref_tag[6])
 {
-	static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+	static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 	return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
 }

From ca958879ade564daa0e0fa82aeeccf3bc7f73edd Mon Sep 17 00:00:00 2001
From: Chen Ni
Date: Tue, 16 Jul 2024 10:58:52 +0800
Subject: [PATCH 012/120] md: convert comma to semicolon

Replace a comma between expression statements with a semicolon.

Fixes: 5e5702898e93 ("md/raid10: Handle read errors during recovery better.")
Signed-off-by: Chen Ni
Link: https://lore.kernel.org/r/20240716025852.400259-1-nichen@iscas.ac.cn
Signed-off-by: Song Liu
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2a9c4ee982e0..e55e020b5571 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2465,7 +2465,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 			s = PAGE_SIZE >> 9;
 
 		rdev = conf->mirrors[dr].rdev;
-		addr = r10_bio->devs[0].addr + sect,
+		addr = r10_bio->devs[0].addr + sect;
 		ok = sync_page_io(rdev,
 				  addr,
 				  s << 9,

From b2261de75212910e2ca01fa673c8855a535d8c60 Mon Sep 17 00:00:00 2001
From: Yue Haibing
Date: Fri, 16 Aug 2024 17:58:21 +0800
Subject: [PATCH 013/120] blk-cgroup: Remove unused declaration blkg_path()

Commit bb7e5a193d8b ("block, bfq: remove blkg_path()") removed the
implementation but left the declaration behind.
Signed-off-by: Yue Haibing
Link: https://lore.kernel.org/r/20240816095821.877842-1-yuehaibing@huawei.com
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 864fad4a850b..b9e3265c1eb3 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -485,7 +485,6 @@ static inline void blkcg_deactivate_policy(struct gendisk *disk,
 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 		struct blkcg_policy *pol) { return NULL; }
 static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
-static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }

From 4e893ca8117022de68ce1b61c0309e3d17bb8a25 Mon Sep 17 00:00:00 2001
From: Stuart Hayes
Date: Wed, 17 Jul 2024 13:55:50 -0500
Subject: [PATCH 014/120] nvme_core: scan namespaces asynchronously

Use async function calls to make namespace scanning happen in parallel.

Without the patch, NVMe namespaces are scanned serially, so it can take
a long time for all of a controller's namespaces to become available,
especially with a slower (TCP) interface and a large number of
namespaces. It is not uncommon to have large numbers (hundreds or
thousands) of namespaces on nvme-of storage servers.

The time it took for all namespaces to show up after connecting (via
TCP) to a controller with 1002 namespaces was measured on one system:

  network latency   without patch   with patch
                0              6s           1s
             50ms            210s          10s
            100ms            417s          18s

Measurements taken on another system show the effect of the patch on
the time nvme_scan_work() took to complete, when connecting to a linux
nvme-of target with varying numbers of namespaces, on a network of
400us:

  namespaces   without patch   with patch
           1            16ms         14ms
           2            24ms         16ms
           4            49ms         22ms
           8           101ms         33ms
          16           207ms         56ms
         100            1.4s         0.6s
        1000           12.9s         2.0s

On the same system, connecting to a local PCIe NVMe drive (a Samsung
PM1733) instead of a network target:

  namespaces   without patch   with patch
           1            13ms         12ms
           2            41ms         13ms

Signed-off-by: Stuart Hayes
Reviewed-by: Sagi Grimberg
---
 drivers/nvme/host/core.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 053d5b4909cd..b132887429c0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2011-2014, Intel Corporation.
 */
 
+#include <linux/async.h>
 #include 
 #include 
 #include 
@@ -4040,6 +4041,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	}
 }
 
+/**
+ * struct async_scan_info - keeps track of controller & NSIDs to scan
+ * @ctrl:	Controller on which namespaces are being scanned
+ * @next_nsid:	Index of next NSID to scan in ns_list
+ * @ns_list:	Pointer to list of NSIDs to scan
+ *
+ * Note: There is a single async_scan_info structure shared by all instances
+ * of nvme_scan_ns_async() scanning a given controller, so the atomic
+ * operations on next_nsid are critical to ensure each instance scans a unique
+ * NSID.
+ */
+struct async_scan_info {
+	struct nvme_ctrl *ctrl;
+	atomic_t next_nsid;
+	__le32 *ns_list;
+};
+
+static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
+{
+	struct async_scan_info *scan_info = data;
+	int idx;
+	u32 nsid;
+
+	idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
+	nsid = le32_to_cpu(scan_info->ns_list[idx]);
+
+	nvme_scan_ns(scan_info->ctrl, nsid);
+}
+
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 					unsigned nsid)
 {
@@ -4066,11 +4096,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 	__le32 *ns_list;
 	u32 prev = 0;
 	int ret = 0, i;
+	ASYNC_DOMAIN(domain);
+	struct async_scan_info scan_info;
 
 	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
 	if (!ns_list)
 		return -ENOMEM;
 
+	scan_info.ctrl = ctrl;
+	scan_info.ns_list = ns_list;
 	for (;;) {
 		struct nvme_command cmd = {
 			.identify.opcode	= nvme_admin_identify,
@@ -4086,19 +4120,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 			goto free;
 		}
 
+		atomic_set(&scan_info.next_nsid, 0);
 		for (i = 0; i < nr_entries; i++) {
 			u32 nsid = le32_to_cpu(ns_list[i]);
 
 			if (!nsid)	/* end of the list? */
 				goto out;
-			nvme_scan_ns(ctrl, nsid);
+			async_schedule_domain(nvme_scan_ns_async, &scan_info,
+						&domain);
 			while (++prev < nsid)
 				nvme_ns_remove_by_nsid(ctrl, prev);
 		}
+		async_synchronize_full_domain(&domain);
 	}
 out:
 	nvme_remove_invalid_namespaces(ctrl, prev);
free:
+	async_synchronize_full_domain(&domain);
 	kfree(ns_list);
 	return ret;
 }

From 79559c75332458985ab8a21f11b08bf7c9b833b0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:18 +0200
Subject: [PATCH 015/120] nvme-keyring: restrict match length for version '1'
 identifiers

TP8018 introduced a new TLS PSK identifier version (version 1), which
appended a PSK hash value to the existing identifier (cf NVMe TCP
specification v1.1, section 3.6.1.3 'TLS PSK and PSK Identity
Derivation').

An original (version 0) identifier has the form:

  NVMe0<type><hmac> <hostnqn> <subnqn>

and a version 1 identifier has the form:

  NVMe1<type><hmac> <hostnqn> <subnqn> <PSK hash>

Modify the lookup algorithm to compare only the first part of the
identifier (excluding the hash value) to handle both version 0 and
version 1 identifiers. And since the spec declares 'version 0'
identifiers obsolete, the lookup algorithm is also modified to prefer
version 1 identifiers.
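For illustration of why taking match_len from the lookup string works
(hash value and NQNs below are hypothetical, shortened for readability):
the identity built by the snprintf() in the diff never contains the
trailing hash, so a version 1 key matches on its prefix:

	/*
	 * key->description: "NVMe1R01 nqn.x:host nqn.x:subsys Ws6jCxAq..."
	 * lookup identity:  "NVMe1R01 nqn.x:host nqn.x:subsys"
	 *
	 * match_len = strlen(lookup identity), so the memcmp() in
	 * nvme_tls_psk_match() compares only the prefix and ignores the
	 * appended PSK hash value.
	 */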
Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/common/keyring.c | 36 +++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index 6f7e7a8fa5ae..05e89307c8aa 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -36,14 +36,12 @@ static bool nvme_tls_psk_match(const struct key *key,
 		pr_debug("%s: no key description\n", __func__);
 		return false;
 	}
-	match_len = strlen(key->description);
-	pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len);
-
 	if (!match_data->raw_data) {
 		pr_debug("%s: no match data\n", __func__);
 		return false;
 	}
 	match_id = match_data->raw_data;
+	match_len = strlen(match_id);
 	pr_debug("%s: match '%s' '%s' len %zd\n", __func__,
 		 match_id, key->description, match_len);
 	return !memcmp(key->description, match_id, match_len);
@@ -71,7 +69,7 @@ static struct key_type nvme_tls_psk_key_type = {
 
 static struct key *nvme_tls_psk_lookup(struct key *keyring,
 		const char *hostnqn, const char *subnqn,
-		int hmac, bool generated)
+		u8 hmac, u8 psk_ver, bool generated)
 {
 	char *identity;
 	size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11;
@@ -82,8 +80,8 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
 	if (!identity)
 		return ERR_PTR(-ENOMEM);
 
-	snprintf(identity, identity_len, "NVMe0%c%02d %s %s",
-		 generated ? 'G' : 'R', hmac, hostnqn, subnqn);
+	snprintf(identity, identity_len, "NVMe%u%c%02u %s %s",
+		 psk_ver, generated ? 'G' : 'R', hmac, hostnqn, subnqn);
 
 	if (!keyring)
 		keyring = nvme_keyring;
@@ -107,21 +105,38 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
 /*
  * NVMe PSK priority list
  *
- * 'Retained' PSKs (ie 'generated == false')
- * should be preferred to 'generated' PSKs,
- * and SHA-384 should be preferred to SHA-256.
+ * 'Retained' PSKs (ie 'generated == false') should be preferred to 'generated'
+ * PSKs, PSKs with hash (psk_ver 1) should be preferred to PSKs without hash
+ * (psk_ver 0), and SHA-384 should be preferred to SHA-256.
 */
 static struct nvme_tls_psk_priority_list {
 	bool generated;
+	u8 psk_ver;
 	enum nvme_tcp_tls_cipher cipher;
 } nvme_tls_psk_prio[] = {
 	{ .generated = false,
+	  .psk_ver = 1,
 	  .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
 	{ .generated = false,
+	  .psk_ver = 1,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+	{ .generated = false,
+	  .psk_ver = 0,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+	{ .generated = false,
+	  .psk_ver = 0,
 	  .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
 	{ .generated = true,
+	  .psk_ver = 1,
 	  .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
 	{ .generated = true,
+	  .psk_ver = 1,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+	{ .generated = true,
+	  .psk_ver = 0,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+	{ .generated = true,
+	  .psk_ver = 0,
 	  .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
 };
 
@@ -137,10 +152,11 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
 
 	for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) {
 		bool generated = nvme_tls_psk_prio[prio].generated;
+		u8 ver = nvme_tls_psk_prio[prio].psk_ver;
 		enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher;
 
 		tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn,
-					      cipher, generated);
+					      cipher, ver, generated);
 		if (!IS_ERR(tls_key)) {
 			tls_key_id = tls_key->serial;
 			key_put(tls_key);

From 363895767fbfa05891b0b4d9e06ebde7a10c6a07 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:19 +0200
Subject: [PATCH 016/120] nvme-tcp: sanitize TLS key handling

There is a difference between TLS configured (ie the user has
provisioned/requested a key) and TLS enabled (ie the connection is
encrypted with TLS). This becomes important for secure concatenation,
where the initial authentication is run on an unencrypted connection
(ie with TLS configured, but not enabled), and then the queue is reset
to run over TLS (ie TLS configured _and_ enabled).

So, to differentiate between those two states, store the generated key
in opts->tls_key (as we're using the same TLS key for all queues), the
key serial of the resulting TLS handshake in ctrl->tls_pskid (to signal
that TLS on the admin queue is enabled), and a simple flag for the
queues to indicate that TLS has been enabled.
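A compact summary of where each piece of TLS state lives after this
patch (a sketch of the fields touched in the diff below, not an
exhaustive list of their users):

	/*
	 * opts->tls_key      - key the user configured; shared by all
	 *                      queues of the controller
	 * ctrl->tls_pskid    - serial of the PSK negotiated on the admin
	 *                      queue; nonzero means TLS is enabled there
	 * queue->tls_enabled - set per queue once the TLS handshake has
	 *                      completed; cleared when the queue is stopped
	 */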
Signed-off-by: Hannes Reinecke
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c  |  1 -
 drivers/nvme/host/nvme.h  |  2 +-
 drivers/nvme/host/sysfs.c |  4 +--
 drivers/nvme/host/tcp.c   | 53 +++++++++++++++++++++++++++++----------
 4 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b132887429c0..93aa4c10adb7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4715,7 +4715,6 @@ static void nvme_free_ctrl(struct device *dev)
 
 	if (!subsys || ctrl->instance != subsys->instance)
 		ida_free(&nvme_instance_ida, ctrl->instance);
-	key_put(ctrl->tls_key);
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
 	cleanup_srcu_struct(&ctrl->srcu);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f900e44243ae..af22368800e7 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -373,7 +373,7 @@ struct nvme_ctrl {
 	struct nvme_dhchap_key *ctrl_key;
 	u16 transaction;
 #endif
-	struct key *tls_key;
+	key_serial_t tls_pskid;
 
 	/* Power saving configuration */
 	u64 ps_max_latency_us;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index ba05faaac562..72675b59a7a7 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -670,9 +670,9 @@ static ssize_t tls_key_show(struct device *dev,
 {
 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
-	if (!ctrl->tls_key)
+	if (!ctrl->tls_pskid)
 		return 0;
-	return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key));
+	return sysfs_emit(buf, "%08x", ctrl->tls_pskid);
 }
 static DEVICE_ATTR_RO(tls_key);
 #endif
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9ea6be0b0392..b40e95bee849 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -165,6 +165,7 @@ struct nvme_tcp_queue {
 
 	bool			hdr_digest;
 	bool			data_digest;
+	bool			tls_enabled;
 	struct ahash_request	*rcv_hash;
 	struct ahash_request	*snd_hash;
 	__le32			exp_ddgst;
@@ -213,7 +214,21 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
 	return queue - queue->ctrl->queues;
 }
 
-static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl)
+/*
+ * Check if the queue is TLS encrypted
+ */
+static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue)
+{
+	if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
+		return 0;
+
+	return queue->tls_enabled;
+}
+
+/*
+ * Check if TLS is configured for the controller.
+ */
+static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl)
 {
 	if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
 		return 0;
@@ -368,7 +383,7 @@ static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
 
 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
 {
-	return !nvme_tcp_tls(&queue->ctrl->ctrl) &&
+	return !nvme_tcp_queue_tls(queue) &&
 		nvme_tcp_queue_has_pending(queue);
 }
 
@@ -1427,7 +1442,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
 	memset(&msg, 0, sizeof(msg));
 	iov.iov_base = icresp;
 	iov.iov_len = sizeof(*icresp);
-	if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
+	if (nvme_tcp_queue_tls(queue)) {
 		msg.msg_control = cbuf;
 		msg.msg_controllen = sizeof(cbuf);
 	}
@@ -1439,7 +1454,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
 		goto free_icresp;
 	}
 	ret = -ENOTCONN;
-	if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
+	if (nvme_tcp_queue_tls(queue)) {
 		ctype = tls_get_record_type(queue->sock->sk,
 					    (struct cmsghdr *)cbuf);
 		if (ctype != TLS_RECORD_TYPE_DATA) {
@@ -1587,7 +1602,10 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
 			qid, pskid);
 		queue->tls_err = -ENOKEY;
 	} else {
-		ctrl->ctrl.tls_key = tls_key;
+		queue->tls_enabled = true;
+		if (qid == 0)
+			ctrl->ctrl.tls_pskid = key_serial(tls_key);
+		key_put(tls_key);
 		queue->tls_err = 0;
 	}
 
@@ -1768,7 +1786,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	}
 
 	/* If PSKs are configured try to start TLS */
-	if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) {
+	if (nvme_tcp_tls_configured(nctrl) && pskid) {
 		ret = nvme_tcp_start_tls(nctrl, queue, pskid);
 		if (ret)
 			goto err_init_connect;
@@ -1829,6 +1847,8 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
 	mutex_lock(&queue->queue_lock);
 	if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
 		__nvme_tcp_stop_queue(queue);
+	/* Stopping the queue will disable TLS */
+	queue->tls_enabled = false;
 	mutex_unlock(&queue->queue_lock);
 }
 
@@ -1925,16 +1945,17 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
 	int ret;
 	key_serial_t pskid = 0;
 
-	if (nvme_tcp_tls(ctrl)) {
+	if (nvme_tcp_tls_configured(ctrl)) {
 		if (ctrl->opts->tls_key)
 			pskid = key_serial(ctrl->opts->tls_key);
-		else
+		else {
 			pskid = nvme_tls_psk_default(ctrl->opts->keyring,
 						      ctrl->opts->host->nqn,
 						      ctrl->opts->subsysnqn);
-		if (!pskid) {
-			dev_err(ctrl->device, "no valid PSK found\n");
-			return -ENOKEY;
+			if (!pskid) {
+				dev_err(ctrl->device, "no valid PSK found\n");
+				return -ENOKEY;
+			}
 		}
 	}
 
@@ -1957,13 +1978,14 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 {
 	int i, ret;
 
-	if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) {
+	if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) {
 		dev_err(ctrl->device, "no PSK negotiated\n");
 		return -ENOKEY;
 	}
+
 	for (i = 1; i < ctrl->queue_count; i++) {
 		ret = nvme_tcp_alloc_queue(ctrl, i,
-				key_serial(ctrl->tls_key));
+				ctrl->tls_pskid);
 		if (ret)
 			goto out_free_queues;
 	}
@@ -2144,6 +2166,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
 	if (remove)
 		nvme_unquiesce_admin_queue(ctrl);
 	nvme_tcp_destroy_admin_queue(ctrl, remove);
+	if (ctrl->tls_pskid) {
+		dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n",
+			ctrl->tls_pskid);
+		ctrl->tls_pskid = 0;
+	}
 }
 
 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,

From 5bc46b49c828a6dfaab80b71ecb63fe76a1096d2 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:20 +0200
Subject: [PATCH 017/120] nvme-tcp: check for invalidated or revoked key

key_lookup() will always return a key, even if that key is revoked or
invalidated. So check for invalid keys before continuing.

Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/common/keyring.c | 22 ++++++++++++++++++++++
 drivers/nvme/host/Kconfig     |  1 +
 drivers/nvme/host/fabrics.c   |  2 +-
 drivers/nvme/host/tcp.c       |  2 +-
 include/linux/nvme-keyring.h  |  6 +++++-
 5 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index 6f7e7a8fa5ae..ed5167f942d8 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -20,6 +20,28 @@ key_serial_t nvme_keyring_id(void)
 }
 EXPORT_SYMBOL_GPL(nvme_keyring_id);
 
+static bool nvme_tls_psk_revoked(struct key *psk)
+{
+	return test_bit(KEY_FLAG_REVOKED, &psk->flags) ||
+	       test_bit(KEY_FLAG_INVALIDATED, &psk->flags);
+}
+
+struct key *nvme_tls_key_lookup(key_serial_t key_id)
+{
+	struct key *key = key_lookup(key_id);
+
+	if (IS_ERR(key)) {
+		pr_err("key id %08x not found\n", key_id);
+		return key;
+	}
+	if (nvme_tls_psk_revoked(key)) {
+		pr_err("key id %08x revoked\n", key_id);
+		return ERR_PTR(-EKEYREVOKED);
+	}
+	return key;
+}
+EXPORT_SYMBOL_GPL(nvme_tls_key_lookup);
+
 static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m)
 {
 	seq_puts(m, key->description);
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index a3caef75aa0a..883aaab2d83e 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -109,6 +109,7 @@ config NVME_HOST_AUTH
 	bool "NVMe over Fabrics In-Band Authentication in host side"
 	depends on NVME_CORE
 	select NVME_AUTH
+	select NVME_KEYRING if NVME_TCP_TLS
 	help
 	  This provides support for NVMe over Fabrics In-Band Authentication
 	  in host side.
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index f5f545fa0103..432efcbf9e2f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -665,7 +665,7 @@ static struct key *nvmf_parse_key(int key_id)
 		return ERR_PTR(-EINVAL);
 	}
 
-	key = key_lookup(key_id);
+	key = nvme_tls_key_lookup(key_id);
 	if (IS_ERR(key))
 		pr_err("key id %08x not found\n", key_id);
 	else
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b40e95bee849..89c44413c593 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1596,7 +1596,7 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
 		goto out_complete;
 	}
 
-	tls_key = key_lookup(pskid);
+	tls_key = nvme_tls_key_lookup(pskid);
 	if (IS_ERR(tls_key)) {
 		dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
 			 qid, pskid);
diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index e10333d78dbb..19d2b256180f 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -12,7 +12,7 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn);
 
 key_serial_t nvme_keyring_id(void);
-
+struct key *nvme_tls_key_lookup(key_serial_t key_id);
 #else
 
 static inline key_serial_t nvme_tls_psk_default(struct key *keyring,
@@ -24,5 +24,9 @@ static inline key_serial_t nvme_keyring_id(void)
 {
 	return 0;
 }
+static inline struct key *nvme_tls_key_lookup(key_serial_t key_id)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
 #endif /* !CONFIG_NVME_KEYRING */
 #endif /* _NVME_KEYRING_H */

From c5f2ca52d00de5d0348a758a28b51ddf5e685a89 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:21 +0200
Subject: [PATCH 018/120] nvme: add a newline to the 'tls_key' sysfs attribute

Print a newline for easier userspace handling.

Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 72675b59a7a7..c391ad6c5a88 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -672,7 +672,7 @@ static ssize_t tls_key_show(struct device *dev,
 
 	if (!ctrl->tls_pskid)
 		return 0;
-	return sysfs_emit(buf, "%08x", ctrl->tls_pskid);
+	return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid);
 }
 static DEVICE_ATTR_RO(tls_key);
 #endif

From 1e48b34c9bc79aa36700fccbfdf87e61e4431d2b Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:22 +0200
Subject: [PATCH 019/120] nvme: split off TLS sysfs attributes into a separate
 group

Split off the TLS sysfs attributes into a separate group to improve
readability and to keep all TLS-related handling in one section.
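The mechanism used here is the standard sysfs attribute-group visibility
callback; a generic, self-contained sketch (all identifiers below are
illustrative, not taken from the driver):

	static ssize_t demo_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
	{
		return sysfs_emit(buf, "demo\n");
	}
	static DEVICE_ATTR_RO(demo);

	static struct attribute *demo_attrs[] = {
		&dev_attr_demo.attr,
		NULL,	/* attribute lists are NULL-terminated */
	};

	static umode_t demo_attrs_are_visible(struct kobject *kobj,
					      struct attribute *a, int n)
	{
		/* return 0 to hide an attribute, a->mode to expose it */
		return a->mode;
	}

	static const struct attribute_group demo_group = {
		.attrs		= demo_attrs,
		.is_visible	= demo_attrs_are_visible,
	};

Grouping the TLS attributes this way lets a single is_visible() callback
gate all of them on the transport and TLS options, instead of sprinkling
per-attribute checks through the main attribute group.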
Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/sysfs.c | 62 ++++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index c391ad6c5a88..dc7ceb53147f 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -664,19 +664,6 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
 	nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
 #endif
 
-#ifdef CONFIG_NVME_TCP_TLS
-static ssize_t tls_key_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
-
-	if (!ctrl->tls_pskid)
-		return 0;
-	return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid);
-}
-static DEVICE_ATTR_RO(tls_key);
-#endif
-
 static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reset_controller.attr,
 	&dev_attr_rescan_controller.attr,
@@ -703,9 +690,6 @@ static struct attribute *nvme_dev_attrs[] = {
 #ifdef CONFIG_NVME_HOST_AUTH
 	&dev_attr_dhchap_secret.attr,
 	&dev_attr_dhchap_ctrl_secret.attr,
-#endif
-#ifdef CONFIG_NVME_TCP_TLS
-	&dev_attr_tls_key.attr,
 #endif
 	&dev_attr_adm_passthru_err_log_enabled.attr,
 	NULL
@@ -737,11 +721,6 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 	if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
 		return 0;
 #endif
-#ifdef CONFIG_NVME_TCP_TLS
-	if (a == &dev_attr_tls_key.attr &&
-	    (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")))
-		return 0;
-#endif
 
 	return a->mode;
 }
@@ -752,8 +731,49 @@ const struct attribute_group nvme_dev_attrs_group = {
 };
 EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);
 
+#ifdef CONFIG_NVME_TCP_TLS
+static ssize_t tls_key_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	if (!ctrl->tls_pskid)
+		return 0;
+	return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid);
+}
+static DEVICE_ATTR_RO(tls_key);
+
+static struct attribute *nvme_tls_attrs[] = {
+	&dev_attr_tls_key.attr,
+};
+
+static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
+		struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	if (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp"))
+		return 0;
+
+	if (a == &dev_attr_tls_key.attr &&
+	    !ctrl->opts->tls)
+		return 0;
+
+	return a->mode;
+}
+
+const struct attribute_group nvme_tls_attrs_group = {
+	.attrs		= nvme_tls_attrs,
+	.is_visible	= nvme_tls_attrs_are_visible,
+};
+#endif
+
 const struct attribute_group *nvme_dev_attr_groups[] = {
 	&nvme_dev_attrs_group,
+#ifdef CONFIG_NVME_TCP_TLS
+	&nvme_tls_attrs_group,
+#endif
 	NULL,
 };

From f5eb7397471bbc24d63011f8cb2d422ac606085d Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:23 +0200
Subject: [PATCH 020/120] nvme-sysfs: add 'tls_configured_key' sysfs attribute

There is a difference between the negotiated TLS key (which is always
present for a TLS encrypted connection) and the configured TLS key
(which is specified with the --tls_key command line option). To
differentiate between these two, add a new sysfs attribute
'tls_configured_key' to hold the key specified on the command line.
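For illustration, the resulting sysfs layout might look as follows
(device name and key serials are hypothetical):

	/*
	 * /sys/class/nvme/nvme0/tls_key            - "4aa3fc12\n": PSK in
	 *     use for the current TLS session (ctrl->tls_pskid)
	 * /sys/class/nvme/nvme0/tls_configured_key - "2c0e5491\n": key
	 *     given via --tls_key (opts->tls_key); hidden if none was set
	 *
	 * The two serials can differ, since the negotiated PSK is not
	 * necessarily the configured one.
	 */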
Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/sysfs.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index dc7ceb53147f..2055dad7bc63 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -743,8 +743,19 @@ static ssize_t tls_key_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(tls_key);
 
+static ssize_t tls_configured_key_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct key *key = ctrl->opts->tls_key;
+
+	return sysfs_emit(buf, "%08x\n", key_serial(key));
+}
+static DEVICE_ATTR_RO(tls_configured_key);
+
 static struct attribute *nvme_tls_attrs[] = {
 	&dev_attr_tls_key.attr,
+	&dev_attr_tls_configured_key.attr,
 };
 
 static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
@@ -759,6 +770,9 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
 	if (a == &dev_attr_tls_key.attr &&
 	    !ctrl->opts->tls)
 		return 0;
+	if (a == &dev_attr_tls_configured_key.attr &&
+	    !ctrl->opts->tls_key)
+		return 0;
 
 	return a->mode;
 }

From 02a3688c53d648e027ca9c27423bf864a189d7c7 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:24 +0200
Subject: [PATCH 021/120] nvme-sysfs: add 'tls_keyring' attribute

Add a 'tls_keyring' attribute to display the contents of the --keyring
option from the connect string. Adding this attribute allows us to
recreate the original connect string from sysfs settings.

Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/sysfs.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 2055dad7bc63..eb345551d6fe 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -753,9 +753,20 @@ static ssize_t tls_configured_key_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(tls_configured_key);
 
+static ssize_t tls_keyring_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct key *keyring = ctrl->opts->keyring;
+
+	return sysfs_emit(buf, "%s\n", keyring->description);
+}
+static DEVICE_ATTR_RO(tls_keyring);
+
 static struct attribute *nvme_tls_attrs[] = {
 	&dev_attr_tls_key.attr,
 	&dev_attr_tls_configured_key.attr,
+	&dev_attr_tls_keyring.attr,
 };
 
 static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
@@ -773,6 +784,9 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
 	if (a == &dev_attr_tls_configured_key.attr &&
 	    !ctrl->opts->tls_key)
 		return 0;
+	if (a == &dev_attr_tls_keyring.attr &&
+	    !ctrl->opts->keyring)
+		return 0;
 
 	return a->mode;
 }

From bb2df18958b4287104c38541cf48c4a60c90e721 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:25 +0200
Subject: [PATCH 022/120] nvmet-auth: allow to clear DH-HMAC-CHAP keys

As we can set DH-HMAC-CHAP keys, we should also be able to unset them.
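The new behaviour of nvmet_auth_set_key(), summarized as a sketch
(the secret strings below are illustrative):

	/*
	 * secret = "DHHC-1:01:<base64 key>:" -> parsed and stored, as before
	 * secret = ""                        -> the stored secret is freed
	 *                                       and the key hash reset to 0,
	 *                                       i.e. the key is cleared
	 */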
Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/target/auth.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 8bc3f431c77f..7897d02c681d 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -25,6 +25,18 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
 	unsigned char key_hash;
 	char *dhchap_secret;
 
+	if (!strlen(secret)) {
+		if (set_ctrl) {
+			kfree(host->dhchap_ctrl_secret);
+			host->dhchap_ctrl_secret = NULL;
+			host->dhchap_ctrl_key_hash = 0;
+		} else {
+			kfree(host->dhchap_secret);
+			host->dhchap_secret = NULL;
+			host->dhchap_key_hash = 0;
+		}
+		return 0;
+	}
 	if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1)
 		return -EINVAL;
 	if (key_hash > 3) {

From ff4a0a4088adc6a37293a9cce25bb56ad2f26a16 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 22 Jul 2024 14:02:26 +0200
Subject: [PATCH 023/120] nvme-target: do not check authentication status for
 admin commands twice

nvmet_check_ctrl_status() checks the authentication status, so we don't
need to do that prior to calling it.

Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/target/admin-cmd.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index f7e1156ac7ec..d64480f01f4a 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1005,8 +1005,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 	if (nvme_is_fabrics(cmd))
 		return nvmet_parse_fabrics_admin_cmd(req);
 
-	if (unlikely(!nvmet_check_auth_status(req)))
-		return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR;
 	if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
 		return nvmet_parse_discovery_cmd(req);

From 87599eddc25ac03647ab76221523c6485e7594b1 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET
Date: Sun, 25 Aug 2024 18:22:23 +0200
Subject: [PATCH 024/120] drbd: Remove an unused field in struct drbd_device

'next_barrier_nr' is not used in this driver. Remove it.

It was already part of the original commit b411b3637fa7 ("The DRBD
driver"). Apparently, it has never been used.

Signed-off-by: Christophe JAILLET
Link: https://lore.kernel.org/r/d5322ef88d1d6f544963ee277cb0b427da8dceef.1724602922.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jens Axboe
---
 drivers/block/drbd/drbd_int.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d2937bca1fe4..2a05d955e30b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -860,7 +860,6 @@ struct drbd_device {
 	struct list_head read_ee;   /* [RS]P_DATA_REQUEST being read */
 	struct list_head net_ee;    /* zero-copy network send in progress */
 
-	int next_barrier_nr;
 	struct list_head resync_reads;
 	atomic_t pp_in_use;		/* allocated from page pool */
 	atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */

From 03c3d7c74371a46d967fbf41628874ec04ddda96 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Thu, 15 Aug 2024 22:11:31 +0200
Subject: [PATCH 025/120] nvme-rdma: send cntlid in the RDMA_CM_REQUEST
 Private Data

When sending a RDMA_CM_REQUEST, the NVMe RDMA Transport Specification
allows you to populate the cntlid field in the RDMA_CM_REQUEST Private
Data. The cntlid is returned by the target on completion of the first
RDMA_CM_REQUEST command (which creates the admin queue).
The cntlid field can then be populated by the host when the I/O queues
are created (using additional RDMA_CM_REQUEST commands), such that the
target can perform extra validation for those additional RDMA_CM_REQUEST
commands.

A new error code and error message are also added, such that
nvme_rdma_cm_msg() will display the proper error message if the target
fails the RDMA_CM_REQUEST command because of this extra validation.

Signed-off-by: Niklas Cassel
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/rdma.c  | 2 ++
 include/linux/nvme-rdma.h | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2eb33842f971..e3520a91147b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1876,6 +1876,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
 		 */
 		priv.hrqsize = cpu_to_le16(queue->queue_size);
 		priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
+		/* cntlid should only be set when creating an I/O queue */
+		priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid);
 	}
 
 	ret = rdma_connect_locked(queue->cm_id, &param);
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index eb2f04d636c8..97c5f00b9aa3 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -25,6 +25,7 @@ enum nvme_rdma_cm_status {
 	NVME_RDMA_CM_NO_RSC		= 0x06,
 	NVME_RDMA_CM_INVALID_IRD	= 0x07,
 	NVME_RDMA_CM_INVALID_ORD	= 0x08,
+	NVME_RDMA_CM_INVALID_CNTLID	= 0x09,
 };
 
 static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
@@ -46,6 +47,8 @@ static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
 		return "invalid IRD";
 	case NVME_RDMA_CM_INVALID_ORD:
 		return "Invalid ORD";
+	case NVME_RDMA_CM_INVALID_CNTLID:
+		return "invalid controller ID";
 	default:
 		return "unrecognized reason";
 	}
@@ -64,7 +67,8 @@ struct nvme_rdma_cm_req {
 	__le16		qid;
 	__le16		hrqsize;
 	__le16		hsqsize;
-	u8		rsvd[24];
+	__le16		cntlid;
+	u8		rsvd[22];
 };
 
 /**

From cead0b8991713bdeb5b947758dd62287fcf8da40 Mon Sep 17 00:00:00 2001
From: Anuj Gupta
Date: Mon, 26 Aug 2024 16:09:43 +0530
Subject: [PATCH 026/120] nvme: rename apptag and appmask to lbat and lbatm

Rename apptag and appmask to lbat and lbatm so that they match the
field names used in the NVMe spec.
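For reference, the new names map to the NVMe specification as follows
(a summary, not part of the commit):

	/*
	 * lbat  - Logical Block Application Tag (LBAT)
	 * lbatm - Logical Block Application Tag Mask (LBATM)
	 *
	 * Both sit in Dword 15 of the NVM command set read/write commands,
	 * matching the layout of struct nvme_rw_command below.
	 */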
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Suggested-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/ioctl.c | 4 ++-- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/target/rdma.c | 4 ++-- include/linux/nvme.h | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 93aa4c10adb7..75ab62cb4aa4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -987,8 +987,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1); cmnd->rw.reftag = 0; - cmnd->rw.apptag = 0; - cmnd->rw.appmask = 0; + cmnd->rw.lbat = 0; + cmnd->rw.lbatm = 0; if (ns->head->ms) { /* diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index f1d58e70933f..850f81e08e7d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -260,8 +260,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.control = cpu_to_le16(io.control); c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.lbat = cpu_to_le16(io.apptag); + c.rw.lbatm = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0, 0); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e3520a91147b..15b5e06039a5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1363,8 +1363,8 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1eff8ca6a5f1..1b6264fa5803 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -578,8 +578,8 @@ static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7b2ae2e43544..b58d9405d65e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -987,8 +987,8 @@ struct nvme_rw_command { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum { @@ -1057,8 +1057,8 @@ struct nvme_write_zeroes_cmd { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum nvme_zone_mgmt_action { From 9bce8005ec0dcb23a58300e8522fe4a31da606fa Mon Sep 17 00:00:00 2001 From: Konstantin Ovsepian Date: Thu, 22 Aug 2024 08:41:36 -0700 Subject: [PATCH 027/120] blk_iocost: fix more 
 out of bound shifts

Recently, running UBSAN caught a few out-of-bounds shifts in the
ioc_forgive_debts() function:

UBSAN: shift-out-of-bounds in block/blk-iocost.c:2142:38
shift exponent 80 is too large for 64-bit type 'u64' (aka 'unsigned long long')
...
UBSAN: shift-out-of-bounds in block/blk-iocost.c:2144:30
shift exponent 80 is too large for 64-bit type 'u64' (aka 'unsigned long long')
...
Call Trace:
dump_stack_lvl+0xca/0x130
__ubsan_handle_shift_out_of_bounds+0x22c/0x280
? __lock_acquire+0x6441/0x7c10
ioc_timer_fn+0x6cec/0x7750
? blk_iocost_init+0x720/0x720
? call_timer_fn+0x5d/0x470
call_timer_fn+0xfa/0x470
? blk_iocost_init+0x720/0x720
__run_timer_base+0x519/0x700
...

The actual impact of this issue was not identified, but I propose to
fix the undefined behaviour regardless. The proposed fix prevents the
out-of-bounds shifts by precalculating the shift exponent before using
it in the shift operations, taking the minimum of the actual exponent
and the maximum possible number of bits.

Reported-by: Breno Leitao
Signed-off-by: Konstantin Ovsepian
Acked-by: Tejun Heo
Link: https://lore.kernel.org/r/20240822154137.2627818-1-ovs@ovs.to
Signed-off-by: Jens Axboe
---
 block/blk-iocost.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 690ca99dfaca..5a6098a3db57 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2076,7 +2076,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
 			      struct ioc_now *now)
 {
 	struct ioc_gq *iocg;
-	u64 dur, usage_pct, nr_cycles;
+	u64 dur, usage_pct, nr_cycles, nr_cycles_shift;

 	/* if no debtor, reset the cycle */
 	if (!nr_debtors) {
@@ -2138,10 +2138,12 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
 		old_debt = iocg->abs_vdebt;
 		old_delay = iocg->delay;

+		nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
 		if (iocg->abs_vdebt)
-			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
+			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
+
 		if (iocg->delay)
-			iocg->delay = iocg->delay >> nr_cycles ?: 1;
+			iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;

 		iocg_kick_waitq(iocg, true, now);

From 752a59298ea9c695ec966fc5ba7173897a1ef361 Mon Sep 17 00:00:00 2001
From: Yang Ruibin <11162571@vivo.com>
Date: Tue, 27 Aug 2024 10:27:40 +0800
Subject: [PATCH 028/120] pktcdvd: remove unnecessary debugfs_create_dir() error check

Remove the debugfs_create_dir() error check. It's safe to pass error
pointers to the debugfs API, hence the user isn't supposed to check
the return values.
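As an illustration of why the removed check could never fire, a
minimal hedged sketch (pkt_seq_fops comes from the driver; the
behaviour of the debugfs calls is as described above):

	/* debugfs_create_dir() returns a valid dentry or an ERR_PTR(),
	 * never NULL, so "if (!dir)" cannot detect failure.  An error
	 * pointer can simply be handed to the next call, which checks
	 * for it internally and degrades to a no-op. */
	struct dentry *dir = debugfs_create_dir("pktcdvd", NULL);

	debugfs_create_file("info", 0444, dir, NULL, &pkt_seq_fops);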
Signed-off-by: Yang Ruibin <11162571@vivo.com>
Link: https://lore.kernel.org/r/20240827022741.3410294-1-11162571@vivo.com
Signed-off-by: Jens Axboe
---
 drivers/block/pktcdvd.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 7cece5884b9c..3edb37a41312 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -498,8 +498,6 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
 	if (!pkt_debugfs_root)
 		return;
 	pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root);
-	if (!pd->dfs_d_root)
-		return;

 	pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, pd, &pkt_seq_fops);

From 86ad4cda79e0dade87d4bb0d32e1fe541d4a63e8 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Thu, 1 Aug 2024 20:47:46 +0800
Subject: [PATCH 029/120] md: Don't flush sync_work in md_write_start()

Flushing sync_work may trigger mddev_suspend() if there are spares, and
this should never be done in the IO path, because mddev_suspend() is
used to wait for IO.

This problem was found by code review.

Fixes: bc08041b32ab ("md: suspend array in md_start_sync() if array need reconfiguration")
Cc: stable@vger.kernel.org
Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240801124746.242558-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d3a837506a36..23cc77d51676 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8668,7 +8668,6 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 	BUG_ON(mddev->ro == MD_RDONLY);
 	if (mddev->ro == MD_AUTO_READ) {
 		/* need to switch to read/write */
-		flush_work(&mddev->sync_work);
 		mddev->ro = MD_RDWR;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);

From 2d389a759d02fa30bd686c797ad620b1da9d1320 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Thu, 1 Aug 2024 21:30:08 +0800
Subject: [PATCH 030/120] md/raid1: Clean up local variable 'b' from raid1_read_request()

The local variable is only used once, in the error path where
read_balance() fails to find a valid rdev to read from. Since the rdev
is now guaranteed not to be removed from conf while IO is still
pending, remove the local variable and dereference rdev directly.

While at it, also remove an extra empty line and an unnecessary type
conversion from sector_t (u64) to unsigned long long.

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240801133008.459998-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7acfe7c9dc8d..531ddfc6efbd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1317,7 +1317,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	int max_sectors;
 	int rdisk;
 	bool r1bio_existed = !!r1_bio;
-	char b[BDEVNAME_SIZE];

 	/*
 	 * If r1_bio is set, we are blocking the raid1d thread
 	 * ...
 	 */
 	gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;

-	if (r1bio_existed) {
-		/* Need to get the block device name carefully */
-		struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
-
-		if (rdev)
-			snprintf(b, sizeof(b), "%pg", rdev->bdev);
-		else
-			strcpy(b, "???");
-	}
-
 	/*
 	 * Still need barrier for READ in case that whole
 	 * array is frozen.
@@ -1357,15 +1346,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	 * used and no empty request is available.
 	 */
 	rdisk = read_balance(conf, r1_bio, &max_sectors);
-
 	if (rdisk < 0) {
 		/* couldn't find anywhere to read from */
-		if (r1bio_existed) {
-			pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+		if (r1bio_existed)
+			pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
 					    mdname(mddev),
-					    b,
-					    (unsigned long long)r1_bio->sector);
-		}
+					    conf->mirrors[r1_bio->read_disk].rdev->bdev,
+					    r1_bio->sector);
 		raid_end_bio_io(r1_bio);
 		return;
 	}

From 2db4fa1b7e1897908d5cd11b1e22233ff0c6ba49 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 26 Aug 2024 15:44:11 +0800
Subject: [PATCH 031/120] md/raid1: use md_bitmap_wait_behind_writes() in raid1_read_request()

Use the existing helper instead of open coding it to make the code
cleaner. There are no functional changes; this also avoids
dereferencing bitmap directly, to prepare for introducing a new bitmap.

Note that this patch also exports md_bitmap_wait_behind_writes(), which
is necessary for now; the exported API will be removed in following
patches that convert the bitmap APIs into ops.

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240826074452.1490072-2-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md-bitmap.c | 1 +
 drivers/md/raid1.c     | 7 ++-----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 08232d8dc815..08743dcc70f1 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1851,6 +1851,7 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev)
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 }
+EXPORT_SYMBOL_GPL(md_bitmap_wait_behind_writes);

 void md_bitmap_destroy(struct mddev *mddev)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 531ddfc6efbd..18aaa7247bba 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1311,7 +1311,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
 	struct bio *read_bio;
-	struct bitmap *bitmap = mddev->bitmap;
 	const enum req_op op = bio_op(bio);
 	const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
 	int max_sectors;
@@ -1364,15 +1363,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 		    (unsigned long long)r1_bio->sector,
 		    mirror->rdev->bdev);

-	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
-	    bitmap) {
+	if (test_bit(WriteMostly, &mirror->rdev->flags)) {
 		/*
 		 * Reading from a write-mostly device must take care not to
 		 * over-take any writes that are 'behind'
 		 */
 		mddev_add_trace_msg(mddev, "raid1 wait behind writes");
-		wait_event(bitmap->behind_wait,
-			   atomic_read(&bitmap->behind_writes) == 0);
+		md_bitmap_wait_behind_writes(mddev);
 	}

 	if (max_sectors < bio_sectors(bio)) {

From 38f287d7e495ae00d4481702f44ff7ca79f5c9bc Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 26 Aug 2024 15:44:12 +0800
Subject: [PATCH 032/120] md/md-bitmap: replace md_bitmap_status() with a new helper md_bitmap_get_stats()

There are no functional changes, and the new helper will be used in
multiple places in following patches to avoid dereferencing bitmap
directly.
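A usage sketch of the new helper (the caller shown here is
hypothetical; the helper and its fields are those introduced below):

	struct md_bitmap_stats stats;

	/* snapshot the counters instead of poking at struct bitmap */
	if (!md_bitmap_get_stats(mddev->bitmap, &stats))
		pr_debug("bitmap: %lu/%lu pages allocated\n",
			 stats.pages - stats.missing_pages, stats.pages);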
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 25 ++++++------------------- drivers/md/md-bitmap.h | 8 +++++++- drivers/md/md.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 08743dcc70f1..66ebe12d80ae 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2094,32 +2094,19 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, } EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); - -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { - unsigned long chunk_kb; struct bitmap_counts *counts; if (!bitmap) - return; + return -ENOENT; counts = &bitmap->counts; + stats->missing_pages = counts->missing_pages; + stats->pages = counts->pages; + stats->file = bitmap->storage.file; - chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - counts->pages - counts->missing_pages, - counts->pages, - (counts->pages - counts->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->storage.file) { - seq_printf(seq, ", file: "); - seq_file_path(seq, bitmap->storage.file, " \t\n"); - } - - seq_printf(seq, "\n"); + return 0; } int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index bb9eb418780a..b60418e3f573 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -234,6 +234,12 @@ struct bitmap { int cluster_slot; /* Slot offset for clustered env */ }; +struct md_bitmap_stats { + unsigned long missing_pages; + unsigned long pages; + struct file *file; +}; + /* the bitmap API */ /* these are used only by md/bitmap */ @@ -244,7 +250,7 @@ void md_bitmap_destroy(struct mddev *mddev); void md_bitmap_print_sb(struct bitmap *bitmap); void md_bitmap_update_sb(struct bitmap *bitmap); -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap); +int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats); int md_bitmap_setallbits(struct bitmap *bitmap); void md_bitmap_write_all(struct bitmap *bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 23cc77d51676..22731d2717ba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8370,6 +8370,33 @@ static void md_seq_stop(struct seq_file *seq, void *v) spin_unlock(&all_mddevs_lock); } +static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) +{ + struct md_bitmap_stats stats; + unsigned long used_pages; + unsigned long chunk_kb; + int err; + + err = md_bitmap_get_stats(mddev->bitmap, &stats); + if (err) + return; + + chunk_kb = mddev->bitmap_info.chunksize >> 10; + used_pages = stats.pages - stats.missing_pages; + + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", + used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, + chunk_kb ? 
"KB" : "B"); + + if (stats.file) { + seq_puts(seq, ", file: "); + seq_file_path(seq, stats.file, " \t\n"); + } + + seq_putc(seq, '\n'); +} + static int md_seq_show(struct seq_file *seq, void *v) { struct mddev *mddev; @@ -8453,7 +8480,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - md_bitmap_status(seq, mddev->bitmap); + md_bitmap_status(seq, mddev); seq_printf(seq, "\n"); } From 968153812215d68c27c0c9d90da6ec2f6d17a606 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:13 +0800 Subject: [PATCH 033/120] md: use new helper md_bitmap_get_stats() in update_array_info() There are no functional changes, avoid dereferencing bitmap directly to prepare inventing a new bitmap. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-4-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 22731d2717ba..0cca8d3acfe3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7577,15 +7577,17 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) if (rv) md_bitmap_destroy(mddev); } else { - /* remove the bitmap */ - if (!mddev->bitmap) { - rv = -ENOENT; + struct md_bitmap_stats stats; + + rv = md_bitmap_get_stats(mddev->bitmap, &stats); + if (rv) goto err; - } - if (mddev->bitmap->storage.file) { + + if (stats.file) { rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { From d004442f46ccae9ea90fdda7a2b0516f1d42b88e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:14 +0800 Subject: [PATCH 034/120] md/md-bitmap: add 'events_cleared' into struct md_bitmap_stats Also add a new helper to get events_cleared to avoid dereferencing bitmap directly to prepare inventing a new bitmap. 
Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240826074452.1490072-5-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md-bitmap.c |  2 ++
 drivers/md/md-bitmap.h |  1 +
 drivers/md/md.c        | 16 ++++++++++++++--
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 66ebe12d80ae..95afc22bd255 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2104,6 +2104,8 @@ int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats)
 	counts = &bitmap->counts;
 	stats->missing_pages = counts->missing_pages;
 	stats->pages = counts->pages;
+
+	stats->events_cleared = bitmap->events_cleared;
 	stats->file = bitmap->storage.file;

 	return 0;
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index b60418e3f573..751dca2366c3 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -235,6 +235,7 @@ struct bitmap {
 };

 struct md_bitmap_stats {
+	u64 events_cleared;
 	unsigned long missing_pages;
 	unsigned long pages;
 	struct file *file;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0cca8d3acfe3..cde6663c7fcb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1372,6 +1372,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 	return ret;
 }

+static u64 md_bitmap_events_cleared(struct mddev *mddev)
+{
+	struct md_bitmap_stats stats;
+	int err;
+
+	err = md_bitmap_get_stats(mddev->bitmap, &stats);
+	if (err)
+		return 0;
+
+	return stats.events_cleared;
+}
+
 /*
  * validate_super for 0.90.0
  * note: we are not using "freshest" for 0.9 superblock
@@ -1464,7 +1476,7 @@
 	/* if adding to array with a bitmap, then we can accept an
 	 * older device ... but not too old.
 	 */
-	if (ev1 < mddev->bitmap->events_cleared)
+	if (ev1 < md_bitmap_events_cleared(mddev))
 		return 0;
 	if (ev1 < mddev->events)
 		set_bit(Bitmap_sync, &rdev->flags);
@@ -1991,7 +2003,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
 	/* If adding to array with a bitmap, then we can accept an
 	 * older device, but not too old.
 	 */
-	if (ev1 < mddev->bitmap->events_cleared)
+	if (ev1 < md_bitmap_events_cleared(mddev))
 		return 0;
 	if (ev1 < mddev->events)
 		set_bit(Bitmap_sync, &rdev->flags);

From 82697ccf7e495c1ba81e315c2886d6220ff84c2c Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 26 Aug 2024 15:44:15 +0800
Subject: [PATCH 035/120] md/md-cluster: fix sparse warnings for __le64

drivers/md/md-cluster.c:1220:22: warning: incorrect type in assignment (different base types)
drivers/md/md-cluster.c:1220:22:    expected unsigned long my_sync_size
drivers/md/md-cluster.c:1220:22:    got restricted __le64 [usertype] sync_size
drivers/md/md-cluster.c:1252:35: warning: incorrect type in assignment (different base types)
drivers/md/md-cluster.c:1252:35:    expected unsigned long sync_size
drivers/md/md-cluster.c:1252:35:    got restricted __le64 [usertype] sync_size
drivers/md/md-cluster.c:1253:41: warning: restricted __le64 degrades to integer

Fix the warnings by using le64_to_cpu() to convert __le64 to integer.
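The conversion pattern, as a two-line hedged sketch (sb is a
bitmap_super_t as in the hunks below; on-disk fields are
little-endian):

	__le64 raw = sb->sync_size;                 /* on-disk byte order */
	unsigned long sync_size = le64_to_cpu(raw); /* CPU byte order */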
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-6-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1d0db62f0351..e642361b6526 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1217,7 +1217,7 @@ static int cluster_check_sync_size(struct mddev *mddev) struct dlm_lock_resource *bm_lockres; sb = kmap_atomic(bitmap->storage.sb_page); - my_sync_size = sb->sync_size; + my_sync_size = le64_to_cpu(sb->sync_size); kunmap_atomic(sb); for (i = 0; i < node_num; i++) { @@ -1249,8 +1249,8 @@ static int cluster_check_sync_size(struct mddev *mddev) sb = kmap_atomic(bitmap->storage.sb_page); if (sync_size == 0) - sync_size = sb->sync_size; - else if (sync_size != sb->sync_size) { + sync_size = le64_to_cpu(sb->sync_size); + else if (sync_size != le64_to_cpu(sb->sync_size)) { kunmap_atomic(sb); md_bitmap_free(bitmap); return -1; From ec6bb299c7c3dd4ca1724d13d5f5fae3ee54fc65 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:16 +0800 Subject: [PATCH 036/120] md/md-bitmap: add 'sync_size' into struct md_bitmap_stats To avoid dereferencing bitmap directly in md-cluster to prepare inventing a new bitmap. BTW, also fix following checkpatch warnings: WARNING: Deprecated use of 'kmap_atomic', prefer 'kmap_local_page' instead WARNING: Deprecated use of 'kunmap_atomic', prefer 'kunmap_local' instead Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-7-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 ++++++ drivers/md/md-bitmap.h | 1 + drivers/md/md-cluster.c | 34 ++++++++++++++++++++-------------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95afc22bd255..33812543d984 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2097,10 +2097,15 @@ EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { struct bitmap_counts *counts; + bitmap_super_t *sb; if (!bitmap) return -ENOENT; + sb = kmap_local_page(bitmap->storage.sb_page); + stats->sync_size = le64_to_cpu(sb->sync_size); + kunmap_local(sb); + counts = &bitmap->counts; stats->missing_pages = counts->missing_pages; stats->pages = counts->pages; @@ -2110,6 +2115,7 @@ int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) return 0; } +EXPORT_SYMBOL_GPL(md_bitmap_get_stats); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 751dca2366c3..a43a75575769 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -237,6 +237,7 @@ struct bitmap { struct md_bitmap_stats { u64 events_cleared; unsigned long missing_pages; + unsigned long sync_size; unsigned long pages; struct file *file; }; diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index e642361b6526..e3faf752f0b1 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1207,18 +1207,21 @@ out: */ static int cluster_check_sync_size(struct mddev *mddev) { - int i, rv; - bitmap_super_t *sb; - unsigned long my_sync_size, sync_size = 0; - int node_num = mddev->bitmap_info.nodes; int current_slot = md_cluster_ops->slot_number(mddev); + int node_num = mddev->bitmap_info.nodes; struct bitmap *bitmap = mddev->bitmap; - char str[64]; struct 
dlm_lock_resource *bm_lockres; + struct md_bitmap_stats stats; + unsigned long sync_size = 0; + unsigned long my_sync_size; + char str[64]; + int i, rv; - sb = kmap_atomic(bitmap->storage.sb_page); - my_sync_size = le64_to_cpu(sb->sync_size); - kunmap_atomic(sb); + rv = md_bitmap_get_stats(bitmap, &stats); + if (rv) + return rv; + + my_sync_size = stats.sync_size; for (i = 0; i < node_num; i++) { if (i == current_slot) @@ -1247,15 +1250,18 @@ static int cluster_check_sync_size(struct mddev *mddev) md_bitmap_update_sb(bitmap); lockres_free(bm_lockres); - sb = kmap_atomic(bitmap->storage.sb_page); - if (sync_size == 0) - sync_size = le64_to_cpu(sb->sync_size); - else if (sync_size != le64_to_cpu(sb->sync_size)) { - kunmap_atomic(sb); + rv = md_bitmap_get_stats(bitmap, &stats); + if (rv) { + md_bitmap_free(bitmap); + return rv; + } + + if (sync_size == 0) { + sync_size = stats.sync_size; + } else if (sync_size != stats.sync_size) { md_bitmap_free(bitmap); return -1; } - kunmap_atomic(sb); md_bitmap_free(bitmap); } From 10bc2ac10597ebc0b25afbc72fa4284565548e36 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:17 +0800 Subject: [PATCH 037/120] md/md-bitmap: add 'file_pages' into struct md_bitmap_stats There are no functional changes, avoid dereferencing bitmap directly to prepare inventing a new bitmap. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-8-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 7 +++++-- drivers/md/md-bitmap.h | 1 + drivers/md/md.c | 17 +++++++++++------ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 33812543d984..ba83b99d6185 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2096,6 +2096,7 @@ EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { + struct bitmap_storage *storage; struct bitmap_counts *counts; bitmap_super_t *sb; @@ -2110,9 +2111,11 @@ int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) stats->missing_pages = counts->missing_pages; stats->pages = counts->pages; - stats->events_cleared = bitmap->events_cleared; - stats->file = bitmap->storage.file; + storage = &bitmap->storage; + stats->file_pages = storage->file_pages; + stats->file = storage->file; + stats->events_cleared = bitmap->events_cleared; return 0; } EXPORT_SYMBOL_GPL(md_bitmap_get_stats); diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index a43a75575769..870125670087 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -237,6 +237,7 @@ struct bitmap { struct md_bitmap_stats { u64 events_cleared; unsigned long missing_pages; + unsigned long file_pages; unsigned long sync_size; unsigned long pages; struct file *file; diff --git a/drivers/md/md.c b/drivers/md/md.c index cde6663c7fcb..06c0918111e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2335,7 +2335,6 @@ super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* All necessary checks on new >= old have been done */ - struct bitmap *bitmap; if (new_offset >= rdev->data_offset) return 1; @@ -2352,11 +2351,17 @@ super_1_allow_new_offset(struct md_rdev *rdev, */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - bitmap = rdev->mddev->bitmap; - if (bitmap && !rdev->mddev->bitmap_info.file && - rdev->sb_start + rdev->mddev->bitmap_info.offset + - bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) - return 0; + + if 
(!rdev->mddev->bitmap_info.file) {
+		struct md_bitmap_stats stats;
+		int err;
+
+		err = md_bitmap_get_stats(rdev->mddev->bitmap, &stats);
+		if (!err && rdev->sb_start + rdev->mddev->bitmap_info.offset +
+		    stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
+			return 0;
+	}
+
 	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
 		return 0;

From a0e7744a460ba5ca91f8d6fc4a696ee345b5baa9 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 26 Aug 2024 15:44:18 +0800
Subject: [PATCH 038/120] md/md-bitmap: add 'behind_writes' and 'behind_wait' into struct md_bitmap_stats

There are no functional changes; this avoids dereferencing bitmap
directly, to prepare for introducing a new bitmap.

Also fix the following checkpatch warning by using wq_has_sleeper():

WARNING: waitqueue_active without comment

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240826074452.1490072-9-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md-bitmap.c |  2 ++
 drivers/md/md-bitmap.h |  3 +++
 drivers/md/raid1.c     | 13 ++++++++-----
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ba83b99d6185..918510f36e33 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2115,6 +2115,8 @@ int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats)
 	stats->file_pages = storage->file_pages;
 	stats->file = storage->file;

+	stats->behind_writes = atomic_read(&bitmap->behind_writes);
+	stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait);
 	stats->events_cleared = bitmap->events_cleared;
 	return 0;
 }
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 870125670087..909a661383c6 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -236,6 +236,9 @@ struct bitmap {

 struct md_bitmap_stats {
 	u64 events_cleared;
+	int behind_writes;
+	bool behind_wait;
+
 	unsigned long missing_pages;
 	unsigned long file_pages;
 	unsigned long sync_size;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 18aaa7247bba..b6c4e44f9b4b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1590,16 +1590,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			continue;

 		if (first_clone) {
+			unsigned long max_write_behind =
+				mddev->bitmap_info.max_write_behind;
+			struct md_bitmap_stats stats;
+			int err;
+
 			/* do behind I/O ?
 			 * Not if there are too many, or cannot
 			 * allocate memory, or a reader on WriteMostly
 			 * is waiting for behind writes to flush */
-			if (bitmap && write_behind &&
-			    (atomic_read(&bitmap->behind_writes)
-			     < mddev->bitmap_info.max_write_behind) &&
-			    !waitqueue_active(&bitmap->behind_wait)) {
+			err = md_bitmap_get_stats(bitmap, &stats);
+			if (!err && write_behind && !stats.behind_wait &&
+			    stats.behind_writes < max_write_behind)
 				alloc_behind_master_bio(r1_bio, bio);
-			}

 			md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
 					     test_bit(R1BIO_BehindIO, &r1_bio->state));

From 9e4481ce0e55b4ef9795845d8b6770e3f6f4b24d Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 26 Aug 2024 15:44:19 +0800
Subject: [PATCH 039/120] md/md-cluster: use helper md_bitmap_get_stats() to get pages in resize_bitmaps()

Use the existing helper instead of open coding it, and avoid
dereferencing bitmap directly, to prepare for introducing a new bitmap.

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240826074452.1490072-10-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md-cluster.c | 20 +++++++++++++-------
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index e3faf752f0b1..76febdc5d7f6 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1143,13 +1143,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size)
 static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
 {
-	struct bitmap_counts *counts;
-	char str[64];
-	struct dlm_lock_resource *bm_lockres;
 	struct bitmap *bitmap = mddev->bitmap;
-	unsigned long my_pages = bitmap->counts.pages;
+	struct md_bitmap_stats stats;
+	unsigned long my_pages;
 	int i, rv;

+	rv = md_bitmap_get_stats(bitmap, &stats);
+	if (rv)
+		return rv;
+
+	my_pages = stats.pages;
 	/*
 	 * We need to ensure all the nodes can grow to a larger
 	 * bitmap size before make the reshaping.
@@ -1159,6 +1162,10 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
 		return rv;

 	for (i = 0; i < mddev->bitmap_info.nodes; i++) {
+		struct dlm_lock_resource *bm_lockres;
+		struct bitmap_counts *counts;
+		char str[64];
+
 		if (i == md_cluster_ops->slot_number(mddev))
 			continue;
@@ -1170,6 +1177,9 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
 		}
 		counts = &bitmap->counts;

+		rv = md_bitmap_get_stats(bitmap, &stats);
+		if (rv)
+			goto out;
 		/*
 		 * If we can hold the bitmap lock of one node then
 		 * the slot is not occupied, update the pages.
@@ -1186,7 +1196,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
 			counts->pages = my_pages;
 		lockres_free(bm_lockres);

-		if (my_pages != counts->pages)
+		if (my_pages != stats.pages)
 			/*
 			 * Let's revert the bitmap size if one node
 			 * can't resize bitmap

From 27832ad3f7f0f5080d472fa8621ff92166ca9fac Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 26 Aug 2024 15:44:20 +0800
Subject: [PATCH 040/120] md/md-bitmap: add a new helper md_bitmap_set_pages()

Currently md-cluster sets bitmap->counts.pages directly; add a helper
to do this and avoid dereferencing bitmap directly.

Note that after this patch the bitmap is no longer dereferenced
directly, and following patches will move the structure inside
md-bitmap.c.
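Sketch of the intended call-site change (taken from the md-cluster
hunks below):

	/* before: md-cluster reached into the counts directly */
	counts->pages = my_pages;

	/* after: through the helper, so struct bitmap can become private */
	md_bitmap_set_pages(bitmap, my_pages);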
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-11-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 ++++++ drivers/md/md-bitmap.h | 1 + drivers/md/md-cluster.c | 4 +--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 918510f36e33..015997b4b835 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2094,6 +2094,12 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, } EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); +void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages) +{ + bitmap->counts.pages = pages; +} +EXPORT_SYMBOL_GPL(md_bitmap_set_pages); + int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { struct bitmap_storage *storage; diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 909a661383c6..c4d64311c0e8 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -285,6 +285,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); +void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages); void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 76febdc5d7f6..59f7fbca783b 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1163,7 +1163,6 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz for (i = 0; i < mddev->bitmap_info.nodes; i++) { struct dlm_lock_resource *bm_lockres; - struct bitmap_counts *counts; char str[64]; if (i == md_cluster_ops->slot_number(mddev)) @@ -1175,7 +1174,6 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz bitmap = NULL; goto out; } - counts = &bitmap->counts; rv = md_bitmap_get_stats(bitmap, &stats); if (rv) @@ -1193,7 +1191,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - counts->pages = my_pages; + md_bitmap_set_pages(bitmap, my_pages); lockres_free(bm_lockres); if (my_pages != stats.pages) From 7add9db6ba3e9bd12d2be97abbc13f3881a515db Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:21 +0800 Subject: [PATCH 041/120] md/md-bitmap: introduce struct bitmap_operations The structure is empty for now, and will be used in later patches to merge in bitmap operations, so that bitmap implementation won't be exposed. 
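The direction of travel, sketched (the llbitmap name is a hypothetical
placeholder for a future alternative bitmap, not part of this patch):

	/* each bitmap flavour would supply its own ops table ... */
	static struct bitmap_operations llbitmap_ops = {
		/* hypothetical: filled in by the alternative implementation */
	};

	/* ... and mddev_set_bitmap_ops() would choose which table to
	 * install, so md core only ever calls mddev->bitmap_ops */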
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-12-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 8 ++++++++ drivers/md/md-bitmap.h | 4 ++++ drivers/md/md.c | 1 + drivers/md/md.h | 1 + 4 files changed, 14 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 015997b4b835..69d9c959fe49 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2714,3 +2714,11 @@ const struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; + +static struct bitmap_operations bitmap_ops = { +}; + +void mddev_set_bitmap_ops(struct mddev *mddev) +{ + mddev->bitmap_ops = &bitmap_ops; +} diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index c4d64311c0e8..14c21ab42f9e 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -246,7 +246,11 @@ struct md_bitmap_stats { struct file *file; }; +struct bitmap_operations { +}; + /* the bitmap API */ +void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot); diff --git a/drivers/md/md.c b/drivers/md/md.c index 06c0918111e3..f86e6911318f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -772,6 +772,7 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); diff --git a/drivers/md/md.h b/drivers/md/md.h index a0d6827dced9..e56193f71ab4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -536,6 +536,7 @@ struct mddev { int sync_checkers; /* # of threads checking writes_pending */ struct bitmap *bitmap; /* the bitmap for the device */ + struct bitmap_operations *bitmap_ops; struct { struct file *file; /* the bitmap file */ loff_t offset; /* offset from superblock of From 7545d385ec7e4c0d5e86e7cde4fe3fb8f4555fb9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:22 +0800 Subject: [PATCH 042/120] md/md-bitmap: simplify md_bitmap_create() + md_bitmap_load() Other than internal api get_bitmap_from_slot(), all other places will set returned bitmap to mddev->bitmap. So move the setting of mddev->bitmap into md_bitmap_create() to simplify code. 
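Sketched before/after for a caller such as set_bitmap_file() (the
shapes mirror the md.c hunks below):

	/* before: every caller unpacked the ERR_PTR and assigned the result */
	bitmap = md_bitmap_create(mddev, -1);
	if (IS_ERR(bitmap))
		err = PTR_ERR(bitmap);
	else
		mddev->bitmap = bitmap;

	/* after: the constructor assigns mddev->bitmap itself */
	err = md_bitmap_create(mddev, -1);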
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-13-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 23 +++++++++++++++-------- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 30 +++++++++--------------------- 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 69d9c959fe49..2f3ff8e64121 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1879,7 +1879,7 @@ void md_bitmap_destroy(struct mddev *mddev) * if this returns an error, bitmap_destroy must be called to do clean up * once mddev->bitmap is set */ -struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) +static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; @@ -1966,6 +1966,17 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) return ERR_PTR(err); } +int md_bitmap_create(struct mddev *mddev, int slot) +{ + struct bitmap *bitmap = __bitmap_create(mddev, slot); + + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + mddev->bitmap = bitmap; + return 0; +} + int md_bitmap_load(struct mddev *mddev) { int err = 0; @@ -2030,7 +2041,7 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) int rv = 0; struct bitmap *bitmap; - bitmap = md_bitmap_create(mddev, slot); + bitmap = __bitmap_create(mddev, slot); if (IS_ERR(bitmap)) { rv = PTR_ERR(bitmap); return ERR_PTR(rv); @@ -2384,7 +2395,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } else { /* No bitmap, OK to set a location */ long long offset; - struct bitmap *bitmap; if (strncmp(buf, "none", 4) == 0) /* nothing to be done */; @@ -2411,13 +2421,10 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } mddev->bitmap_info.offset = offset; - bitmap = md_bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) { - rv = PTR_ERR(bitmap); + rv = md_bitmap_create(mddev, -1); + if (rv) goto out; - } - mddev->bitmap = bitmap; rv = md_bitmap_load(mddev); if (rv) { mddev->bitmap_info.offset = 0; diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 14c21ab42f9e..d66f447f4be6 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -253,7 +253,7 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -struct bitmap *md_bitmap_create(struct mddev *mddev, int slot); +int md_bitmap_create(struct mddev *mddev, int slot); int md_bitmap_load(struct mddev *mddev); void md_bitmap_flush(struct mddev *mddev); void md_bitmap_destroy(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index f86e6911318f..7a041640c313 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6224,16 +6224,10 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - struct bitmap *bitmap; - - bitmap = md_bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) { - err = PTR_ERR(bitmap); + err = md_bitmap_create(mddev, -1); + if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); - } else - mddev->bitmap = bitmap; - } if (err) goto bitmap_abort; @@ -7288,14 +7282,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - struct bitmap *bitmap; - - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; + err = md_bitmap_create(mddev, -1); + if (!err) err = md_bitmap_load(mddev); - } else - 
err = PTR_ERR(bitmap); + if (err) { md_bitmap_destroy(mddev); fd = -1; @@ -7304,6 +7294,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) md_bitmap_destroy(mddev); } } + if (fd < 0) { struct file *f = mddev->bitmap_info.file; if (f) { @@ -7572,7 +7563,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) goto err; } if (info->state & (1<bitmap) { rv = -EEXIST; @@ -7586,12 +7576,10 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - bitmap = md_bitmap_create(mddev, -1); - if (!IS_ERR(bitmap)) { - mddev->bitmap = bitmap; + rv = md_bitmap_create(mddev, -1); + if (!rv) rv = md_bitmap_load(mddev); - } else - rv = PTR_ERR(bitmap); + if (rv) md_bitmap_destroy(mddev); } else { From 04c80e649512f2c24f99052440cc808163eff40c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:23 +0800 Subject: [PATCH 043/120] md/md-bitmap: merge md_bitmap_create() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-14-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 5 +++-- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 2f3ff8e64121..c534382f0b57 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1966,7 +1966,7 @@ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) return ERR_PTR(err); } -int md_bitmap_create(struct mddev *mddev, int slot) +static int bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap = __bitmap_create(mddev, slot); @@ -2421,7 +2421,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } mddev->bitmap_info.offset = offset; - rv = md_bitmap_create(mddev, -1); + rv = bitmap_create(mddev, -1); if (rv) goto out; @@ -2723,6 +2723,7 @@ const struct attribute_group md_bitmap_group = { }; static struct bitmap_operations bitmap_ops = { + .create = bitmap_create, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index d66f447f4be6..f4c4925102b6 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -247,13 +247,13 @@ struct md_bitmap_stats { }; struct bitmap_operations { + int (*create)(struct mddev *mddev, int slot); }; /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -int md_bitmap_create(struct mddev *mddev, int slot); int md_bitmap_load(struct mddev *mddev); void md_bitmap_flush(struct mddev *mddev); void md_bitmap_destroy(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 7a041640c313..7e774d4aaadd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6224,7 +6224,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = md_bitmap_create(mddev, -1); + err = mddev->bitmap_ops->create(mddev, -1); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -7282,7 +7282,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = md_bitmap_create(mddev, -1); + err = mddev->bitmap_ops->create(mddev, -1); if (!err) err = md_bitmap_load(mddev); 
@@ -7576,7 +7576,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = md_bitmap_create(mddev, -1); + rv = mddev->bitmap_ops->create(mddev, -1); if (!rv) rv = md_bitmap_load(mddev); From e1e490805958617327be14eaf0ed31d71adc2c54 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:24 +0800 Subject: [PATCH 044/120] md/md-bitmap: merge md_bitmap_load() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-15-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/dm-raid.c | 4 +++- drivers/md/md-bitmap.c | 6 +++--- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 7 ++++--- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 0c3323e0adb2..c3e201fde4c5 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3949,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) /* Try loading the bitmap unless "raid0", which does not have one */ if (!rs_is_raid0(rs) && !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { - r = md_bitmap_load(&rs->md); + struct mddev *mddev = &rs->md; + + r = mddev->bitmap_ops->load(mddev); if (r) DMERR("Failed to load bitmap"); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c534382f0b57..c236754df66e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1977,7 +1977,7 @@ static int bitmap_create(struct mddev *mddev, int slot) return 0; } -int md_bitmap_load(struct mddev *mddev) +static int bitmap_load(struct mddev *mddev) { int err = 0; sector_t start = 0; @@ -2033,7 +2033,6 @@ int md_bitmap_load(struct mddev *mddev) out: return err; } -EXPORT_SYMBOL_GPL(md_bitmap_load); /* caller need to free returned bitmap with md_bitmap_free() */ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) @@ -2425,7 +2424,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (rv) goto out; - rv = md_bitmap_load(mddev); + rv = bitmap_load(mddev); if (rv) { mddev->bitmap_info.offset = 0; md_bitmap_destroy(mddev); @@ -2724,6 +2723,7 @@ const struct attribute_group md_bitmap_group = { static struct bitmap_operations bitmap_ops = { .create = bitmap_create, + .load = bitmap_load, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index f4c4925102b6..f5b04b61d9e9 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -248,13 +248,13 @@ struct md_bitmap_stats { struct bitmap_operations { int (*create)(struct mddev *mddev, int slot); + int (*load)(struct mddev *mddev); }; /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -int md_bitmap_load(struct mddev *mddev); void md_bitmap_flush(struct mddev *mddev); void md_bitmap_destroy(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 7e774d4aaadd..9c63be7aa6a5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6316,7 +6316,8 @@ int do_md_run(struct mddev *mddev) err = md_run(mddev); if (err) goto out; - err = md_bitmap_load(mddev); + + err = mddev->bitmap_ops->load(mddev); if (err) { md_bitmap_destroy(mddev); goto out; @@ -7284,7 +7285,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { err = 
mddev->bitmap_ops->create(mddev, -1); if (!err) - err = md_bitmap_load(mddev); + err = mddev->bitmap_ops->load(mddev); if (err) { md_bitmap_destroy(mddev); @@ -7578,7 +7579,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_space; rv = mddev->bitmap_ops->create(mddev, -1); if (!rv) - rv = md_bitmap_load(mddev); + rv = mddev->bitmap_ops->load(mddev); if (rv) md_bitmap_destroy(mddev); From a2bd70319290d80127dc4257b8c17df3f027c15d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:25 +0800 Subject: [PATCH 045/120] md/md-bitmap: merge md_bitmap_destroy() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-16-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 7 ++++--- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 15 ++++++++------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c236754df66e..dc898db266d0 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1853,7 +1853,7 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_bitmap_wait_behind_writes); -void md_bitmap_destroy(struct mddev *mddev) +static void bitmap_destroy(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -2384,7 +2384,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) goto out; } - md_bitmap_destroy(mddev); + bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; @@ -2427,7 +2427,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) rv = bitmap_load(mddev); if (rv) { mddev->bitmap_info.offset = 0; - md_bitmap_destroy(mddev); + bitmap_destroy(mddev); goto out; } } @@ -2724,6 +2724,7 @@ const struct attribute_group md_bitmap_group = { static struct bitmap_operations bitmap_ops = { .create = bitmap_create, .load = bitmap_load, + .destroy = bitmap_destroy, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index f5b04b61d9e9..c8d27b91241b 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -249,6 +249,7 @@ struct md_bitmap_stats { struct bitmap_operations { int (*create)(struct mddev *mddev, int slot); int (*load)(struct mddev *mddev); + void (*destroy)(struct mddev *mddev); }; /* the bitmap API */ @@ -256,7 +257,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ void md_bitmap_flush(struct mddev *mddev); -void md_bitmap_destroy(struct mddev *mddev); void md_bitmap_print_sb(struct bitmap *bitmap); void md_bitmap_update_sb(struct bitmap *bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c63be7aa6a5..eeb17eb1fc02 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6297,7 +6297,7 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; module_put(pers->owner); - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6319,7 +6319,7 @@ int do_md_run(struct mddev *mddev) err = mddev->bitmap_ops->load(mddev); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); goto out; } @@ -6505,7 +6505,8 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct 
md_personality *pers = mddev->pers; - md_bitmap_destroy(mddev); + + mddev->bitmap_ops->destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -7288,11 +7289,11 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = mddev->bitmap_ops->load(mddev); if (err) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); fd = -1; } } else if (fd < 0) { - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } } @@ -7582,7 +7583,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = mddev->bitmap_ops->load(mddev); if (rv) - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); } else { struct md_bitmap_stats stats; @@ -7609,7 +7610,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - md_bitmap_destroy(mddev); + mddev->bitmap_ops->destroy(mddev); mddev->bitmap_info.offset = 0; } } From ca925302e841ff0a0598b283f87c472d92b389f3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:26 +0800 Subject: [PATCH 046/120] md/md-bitmap: merge md_bitmap_flush() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-17-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 ++---- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index dc898db266d0..0035162fe6f3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1773,10 +1773,7 @@ void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long } } -/* - * flush out any pending updates - */ -void md_bitmap_flush(struct mddev *mddev) +static void bitmap_flush(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; long sleep; @@ -2725,6 +2722,7 @@ static struct bitmap_operations bitmap_ops = { .create = bitmap_create, .load = bitmap_load, .destroy = bitmap_destroy, + .flush = bitmap_flush, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index c8d27b91241b..c0858665554e 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -250,13 +250,13 @@ struct bitmap_operations { int (*create)(struct mddev *mddev, int slot); int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); + void (*flush)(struct mddev *mddev); }; /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -void md_bitmap_flush(struct mddev *mddev); void md_bitmap_print_sb(struct bitmap *bitmap); void md_bitmap_update_sb(struct bitmap *bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index eeb17eb1fc02..b422acd4b2a4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6463,7 +6463,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - md_bitmap_flush(mddev); + + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || From a0240e3ec753d3caf29edbaf6cf5685c7b447a2a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:27 +0800 Subject: [PATCH 047/120] md/md-bitmap: make md_bitmap_print_sb() internal md_bitmap_print_sb() is only used inside md-bitmap.c, hence make it static, 
also rename it to bitmap_print_sb. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-18-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 5 ++--- drivers/md/md-bitmap.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0035162fe6f3..86444cbee56e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -512,8 +512,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap) } EXPORT_SYMBOL(md_bitmap_update_sb); -/* print out the bitmap file superblock */ -void md_bitmap_print_sb(struct bitmap *bitmap) +static void bitmap_print_sb(struct bitmap *bitmap) { bitmap_super_t *sb; @@ -760,7 +759,7 @@ out_no_sb: bitmap->mddev->bitmap_info.space > sectors_reserved) bitmap->mddev->bitmap_info.space = sectors_reserved; } else { - md_bitmap_print_sb(bitmap); + bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index c0858665554e..5d5811c89b77 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -258,7 +258,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -void md_bitmap_print_sb(struct bitmap *bitmap); void md_bitmap_update_sb(struct bitmap *bitmap); int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats); From fe59b34676b4ec6b48a7b436d3422fc9317e047a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:28 +0800 Subject: [PATCH 048/120] md/md-bitmap: merge md_bitmap_update_sb() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-19-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 15 ++++++++------- drivers/md/md-bitmap.h | 3 ++- drivers/md/md-cluster.c | 2 +- drivers/md/md.c | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 86444cbee56e..23c7d4be3ffd 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -472,7 +472,7 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap) /* update the event counter and sync the superblock to disk */ -void md_bitmap_update_sb(struct bitmap *bitmap) +static void bitmap_update_sb(struct bitmap *bitmap) { bitmap_super_t *sb; @@ -510,7 +510,6 @@ void md_bitmap_update_sb(struct bitmap *bitmap) write_sb_page(bitmap, bitmap->storage.sb_index, bitmap->storage.sb_page, 1); } -EXPORT_SYMBOL(md_bitmap_update_sb); static void bitmap_print_sb(struct bitmap *bitmap) { @@ -892,7 +891,7 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store) static void md_bitmap_file_kick(struct bitmap *bitmap) { if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); if (bitmap->storage.file) { pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", @@ -1792,7 +1791,7 @@ static void bitmap_flush(struct mddev *mddev) md_bitmap_daemon_work(mddev); if (mddev->bitmap_info.external) md_super_wait(mddev); - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); } /* @@ -2022,7 +2021,7 @@ static int bitmap_load(struct mddev *mddev) mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); if (test_bit(BITMAP_WRITE_ERROR, 
&bitmap->flags)) err = -EIO; @@ -2083,7 +2082,7 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, } if (clear_bits) { - md_bitmap_update_sb(bitmap); + bitmap_update_sb(bitmap); /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ for (i = 0; i < bitmap->storage.file_pages; i++) @@ -2578,7 +2577,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev_create_serial_pool(mddev, rdev); } if (old_mwb != backlog) - md_bitmap_update_sb(mddev->bitmap); + bitmap_update_sb(mddev->bitmap); mddev_unlock_and_resume(mddev); return len; @@ -2722,6 +2721,8 @@ static struct bitmap_operations bitmap_ops = { .load = bitmap_load, .destroy = bitmap_destroy, .flush = bitmap_flush, + + .update_sb = bitmap_update_sb, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 5d5811c89b77..ca0d8696136f 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -251,6 +251,8 @@ struct bitmap_operations { int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); void (*flush)(struct mddev *mddev); + + void (*update_sb)(struct bitmap *bitmap); }; /* the bitmap API */ @@ -258,7 +260,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -void md_bitmap_update_sb(struct bitmap *bitmap); int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats); int md_bitmap_setallbits(struct bitmap *bitmap); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 59f7fbca783b..ca30881556bd 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1255,7 +1255,7 @@ static int cluster_check_sync_size(struct mddev *mddev) bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - md_bitmap_update_sb(bitmap); + mddev->bitmap_ops->update_sb(bitmap); lockres_free(bm_lockres); rv = md_bitmap_get_stats(bitmap, &stats); diff --git a/drivers/md/md.c b/drivers/md/md.c index b422acd4b2a4..5f9df96eab51 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2838,7 +2838,7 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ @@ -10002,7 +10002,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (ret) pr_info("md-cluster: resize failed\n"); else - md_bitmap_update_sb(mddev->bitmap); + mddev->bitmap_ops->update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ From 696936838bc18a761ed778910975d51cf2c35e3a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:29 +0800 Subject: [PATCH 049/120] md/md-bitmap: merge md_bitmap_status() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. 
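As a hedged illustration only (toy_write_behind_idle() is an invented
name, not something this patch adds): generic code now reads bitmap
state through the ops table instead of an exported helper, e.g.

	static bool toy_write_behind_idle(struct mddev *mddev)
	{
		struct md_bitmap_stats stats;

		/* ->get_stats() fails when no bitmap is attached */
		if (mddev->bitmap_ops->get_stats(mddev->bitmap, &stats))
			return true;

		/* same field the converted raid1 caller below relies on */
		return stats.behind_writes == 0;
	}

so a replacement bitmap only has to fill in ->get_stats() for all such
callers to keep working.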
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-20-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 4 ++-- drivers/md/md-bitmap.h | 3 +-- drivers/md/md-cluster.c | 8 ++++---- drivers/md/md.c | 11 ++++++----- drivers/md/raid1.c | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 23c7d4be3ffd..b2fbec3997f2 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2105,7 +2105,7 @@ void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages) } EXPORT_SYMBOL_GPL(md_bitmap_set_pages); -int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) +static int bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { struct bitmap_storage *storage; struct bitmap_counts *counts; @@ -2131,7 +2131,6 @@ int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) stats->events_cleared = bitmap->events_cleared; return 0; } -EXPORT_SYMBOL_GPL(md_bitmap_get_stats); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init) @@ -2723,6 +2722,7 @@ static struct bitmap_operations bitmap_ops = { .flush = bitmap_flush, .update_sb = bitmap_update_sb, + .get_stats = bitmap_get_stats, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index ca0d8696136f..1df238cb82f0 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -253,6 +253,7 @@ struct bitmap_operations { void (*flush)(struct mddev *mddev); void (*update_sb)(struct bitmap *bitmap); + int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); }; /* the bitmap API */ @@ -260,8 +261,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats); - int md_bitmap_setallbits(struct bitmap *bitmap); void md_bitmap_write_all(struct bitmap *bitmap); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index ca30881556bd..a5f1135cc1fa 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1148,7 +1148,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz unsigned long my_pages; int i, rv; - rv = md_bitmap_get_stats(bitmap, &stats); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); if (rv) return rv; @@ -1175,7 +1175,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz goto out; } - rv = md_bitmap_get_stats(bitmap, &stats); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); if (rv) goto out; /* @@ -1225,7 +1225,7 @@ static int cluster_check_sync_size(struct mddev *mddev) char str[64]; int i, rv; - rv = md_bitmap_get_stats(bitmap, &stats); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); if (rv) return rv; @@ -1258,7 +1258,7 @@ static int cluster_check_sync_size(struct mddev *mddev) mddev->bitmap_ops->update_sb(bitmap); lockres_free(bm_lockres); - rv = md_bitmap_get_stats(bitmap, &stats); + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); if (rv) { md_bitmap_free(bitmap); return rv; diff --git a/drivers/md/md.c b/drivers/md/md.c index 5f9df96eab51..6c184748f317 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1378,7 +1378,7 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev) struct md_bitmap_stats stats; int err; - err = md_bitmap_get_stats(mddev->bitmap, &stats); + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 
0; @@ -2354,11 +2354,12 @@ super_1_allow_new_offset(struct md_rdev *rdev, return 0; if (!rdev->mddev->bitmap_info.file) { + struct mddev *mddev = rdev->mddev; struct md_bitmap_stats stats; int err; - err = md_bitmap_get_stats(rdev->mddev->bitmap, &stats); - if (!err && rdev->sb_start + rdev->mddev->bitmap_info.offset + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (!err && rdev->sb_start + mddev->bitmap_info.offset + stats.file_pages * (PAGE_SIZE >> 9) > new_offset) return 0; } @@ -7588,7 +7589,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } else { struct md_bitmap_stats stats; - rv = md_bitmap_get_stats(mddev->bitmap, &stats); + rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (rv) goto err; @@ -8388,7 +8389,7 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) unsigned long chunk_kb; int err; - err = md_bitmap_get_stats(mddev->bitmap, &stats); + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b6c4e44f9b4b..d490a9ac484f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1599,7 +1599,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - err = md_bitmap_get_stats(bitmap, &stats); + err = mddev->bitmap_ops->get_stats(bitmap, &stats); if (!err && write_behind && !stats.behind_wait && stats.behind_writes < max_write_behind) alloc_behind_master_bio(r1_bio, bio); From ea076ceb35d66d29fc0a50c15a4b0248c5122d2c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:30 +0800 Subject: [PATCH 050/120] md/md-bitmap: remove md_bitmap_setallbits() md_bitmap_setallbits() is not used, hence can be removed. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-21-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 1df238cb82f0..0bf16f0143ad 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -261,7 +261,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -int md_bitmap_setallbits(struct bitmap *bitmap); void md_bitmap_write_all(struct bitmap *bitmap); void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); From b26313cb96f1b3fd6f07d3243f6cd426c5cbaf39 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:31 +0800 Subject: [PATCH 051/120] md/md-bitmap: merge bitmap_write_all() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. 
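To make the direction concrete, here is a sketch of what swapping in a
different bitmap could eventually look like; every llbitmap_* name
below is invented for illustration and is not part of this patch:

	static void llbitmap_flush(struct mddev *mddev)
	{
		/* flush pending updates in the alternative format */
	}

	static void llbitmap_write_all(struct mddev *mddev)
	{
		/* flag all pages dirty, mirroring bitmap_write_all() */
	}

	static struct bitmap_operations llbitmap_ops = {
		.flush		= llbitmap_flush,
		.write_all	= llbitmap_write_all,
		/* ... the other ops merged in this series ... */
	};

mddev_set_bitmap_ops() would then be the single place that decides
which table a given array uses.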
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-22-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 14 +++++++------- drivers/md/md-bitmap.h | 3 +-- drivers/md/md.c | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b2fbec3997f2..47d074bf292d 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1224,22 +1224,21 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return ret; } -void md_bitmap_write_all(struct bitmap *bitmap) +/* just flag bitmap pages as needing to be written. */ +static void bitmap_write_all(struct mddev *mddev) { - /* We don't actually write all bitmap blocks here, - * just flag them as needing to be written - */ int i; + struct bitmap *bitmap = mddev->bitmap; if (!bitmap || !bitmap->storage.filemap) return; + + /* Only one copy, so nothing needed */ if (bitmap->storage.file) - /* Only one copy, so nothing needed */ return; for (i = 0; i < bitmap->storage.file_pages; i++) - set_page_attr(bitmap, i, - BITMAP_PAGE_NEEDWRITE); + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); bitmap->allclean = 0; } @@ -2720,6 +2719,7 @@ static struct bitmap_operations bitmap_ops = { .load = bitmap_load, .destroy = bitmap_destroy, .flush = bitmap_flush, + .write_all = bitmap_write_all, .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 0bf16f0143ad..89cd60a7bb07 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -251,6 +251,7 @@ struct bitmap_operations { int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); void (*flush)(struct mddev *mddev); + void (*write_all)(struct mddev *mddev); void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); @@ -261,8 +262,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are used only by md/bitmap */ -void md_bitmap_write_all(struct bitmap *bitmap); - void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); /* these are exported */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 6c184748f317..1dc800113b7b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9544,7 +9544,7 @@ static void md_start_sync(struct work_struct *ws) * stored on all devices. So make sure all bitmap pages get written. */ if (spares) - md_bitmap_write_all(mddev->bitmap); + mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? "reshape" : "resync"; From 2d3b130e177f14b461c47880b6e0b338fd6872f5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:32 +0800 Subject: [PATCH 052/120] md/md-bitmap: merge md_bitmap_dirty_bits() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. And while we're here, also fix coding style for bitmap_store(). 
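For reference, the handler being tidied accepts whitespace-separated
chunk indices or 'start-end' ranges, so writing "12-15" to the
attribute ends up calling (sketch, with literal values chosen purely
for illustration):

	mddev->bitmap_ops->dirty_bits(mddev, 12, 15);

and bitmap_dirty_bits() then converts each chunk to a sector with
chunk << counts.chunkshift before setting the in-memory and on-disk
bits.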
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-23-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 9 ++++++++- drivers/md/md-bitmap.h | 6 ++---- drivers/md/md.c | 14 ++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 47d074bf292d..6448f78d50c2 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1753,12 +1753,18 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ -void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) { unsigned long chunk; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return; for (chunk = s; chunk <= e; chunk++) { sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; + md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); if (sec < bitmap->mddev->recovery_cp) @@ -2720,6 +2726,7 @@ static struct bitmap_operations bitmap_ops = { .destroy = bitmap_destroy, .flush = bitmap_flush, .write_all = bitmap_write_all, + .dirty_bits = bitmap_dirty_bits, .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 89cd60a7bb07..875ecbb5b1e4 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -252,6 +252,8 @@ struct bitmap_operations { void (*destroy)(struct mddev *mddev); void (*flush)(struct mddev *mddev); void (*write_all)(struct mddev *mddev); + void (*dirty_bits)(struct mddev *mddev, unsigned long s, + unsigned long e); void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); @@ -260,10 +262,6 @@ struct bitmap_operations { /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); -/* these are used only by md/bitmap */ - -void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - /* these are exported */ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind); diff --git a/drivers/md/md.c b/drivers/md/md.c index 1dc800113b7b..f72fa8e872f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4699,14 +4699,20 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) /* buf should be ... or - ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; + if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; + if (buf == end) + break; } - if (*end && !isspace(*end)) break; - md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + + if (*end && !isspace(*end)) + break; + + mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ From c2257df4108ed872f46c96d6ea6092f17a747632 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:33 +0800 Subject: [PATCH 053/120] md/md-bitmap: merge md_bitmap_startwrite() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. And change the type of 'behind' from int to bool. 
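As a hedged sketch of the pairing this enables (the helper name is
invented; real callers pass their own write-behind decision):

	static void toy_mark_write_pending(struct mddev *mddev,
					   struct bio *bio)
	{
		/* 'false': this toy caller never uses write-behind */
		mddev->bitmap_ops->startwrite(mddev, bio->bi_iter.bi_sector,
					      bio_sectors(bio), false);
	}

The bool parameter also lets call sites such as the raid10 hunk below
pass false instead of a bare 0.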
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-24-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 8 ++++++-- drivers/md/md-bitmap.h | 5 +++-- drivers/md/raid1.c | 5 +++-- drivers/md/raid10.c | 3 ++- drivers/md/raid5.c | 13 +++++-------- 5 files changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 6448f78d50c2..0168dbd5ecb0 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1458,8 +1458,11 @@ __acquires(bitmap->lock) &(bitmap->bp[page].map[pageoff]); } -int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) +static int bitmap_startwrite(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool behind) { + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return 0; @@ -1520,7 +1523,6 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s } return 0; } -EXPORT_SYMBOL(md_bitmap_startwrite); void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int success, int behind) @@ -2728,6 +2730,8 @@ static struct bitmap_operations bitmap_ops = { .write_all = bitmap_write_all, .dirty_bits = bitmap_dirty_bits, + .startwrite = bitmap_startwrite, + .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, }; diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 875ecbb5b1e4..1433d13e447a 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -255,6 +255,9 @@ struct bitmap_operations { void (*dirty_bits)(struct mddev *mddev, unsigned long s, unsigned long e); + int (*startwrite)(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool behind); + void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); }; @@ -263,8 +266,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int success, int behind); int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d490a9ac484f..708687cb28d5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1604,8 +1604,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, stats.behind_writes < max_write_behind) alloc_behind_master_bio(r1_bio, bio); - md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); + mddev->bitmap_ops->startwrite( + mddev, r1_bio->sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); first_clone = 0; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e55e020b5571..8f9172c3329a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1492,7 +1492,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, md_account_bio(mddev, &bio); r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); - md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); + mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors, + false); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c14cf2410365..c24036d1e6da 100644 --- a/drivers/md/raid5.c +++ 
b/drivers/md/raid5.c @@ -3563,8 +3563,8 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, */ set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); - md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0); + conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, + RAID5_STRIPE_SECTORS(conf), false); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -5788,13 +5788,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap) { - for (d = 0; - d < conf->raid_disks - conf->max_degraded; + for (d = 0; d < conf->raid_disks - conf->max_degraded; d++) - md_bitmap_startwrite(mddev->bitmap, - sh->sector, - RAID5_STRIPE_SECTORS(conf), - 0); + mddev->bitmap_ops->startwrite(mddev, sh->sector, + RAID5_STRIPE_SECTORS(conf), false); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); } From 3486015facc030f30d694b92dc18e58073c6c9e0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:34 +0800 Subject: [PATCH 054/120] md/md-bitmap: merge md_bitmap_endwrite() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. And change the type of 'success' and 'behind' from int to bool. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-25-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 9 ++++++--- drivers/md/md-bitmap.h | 4 ++-- drivers/md/raid1.c | 12 +++++++----- drivers/md/raid10.c | 11 ++++++----- drivers/md/raid5-cache.c | 8 ++++---- drivers/md/raid5.c | 18 ++++++++++-------- 6 files changed, 35 insertions(+), 27 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0168dbd5ecb0..4d174ab25339 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1524,11 +1524,14 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, return 0; } -void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind) +static void bitmap_endwrite(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool success, bool behind) { + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return; + if (behind) { if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); @@ -1575,7 +1578,6 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, sectors = 0; } } -EXPORT_SYMBOL(md_bitmap_endwrite); static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded) @@ -2731,6 +2733,7 @@ static struct bitmap_operations bitmap_ops = { .dirty_bits = bitmap_dirty_bits, .startwrite = bitmap_startwrite, + .endwrite = bitmap_endwrite, .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 1433d13e447a..056a80ee500f 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -257,6 +257,8 @@ struct bitmap_operations { int (*startwrite)(struct mddev *mddev, sector_t offset, unsigned long sectors, bool behind); + void (*endwrite)(struct mddev *mddev, sector_t offset, + unsigned long sectors, bool success, bool behind); void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, 
struct md_bitmap_stats *stats); @@ -266,8 +268,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void md_bitmap_close_sync(struct bitmap *bitmap); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 708687cb28d5..b335293ec472 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -411,18 +411,20 @@ static void raid1_end_read_request(struct bio *bio) static void close_write(struct r1bio *r1_bio) { + struct mddev *mddev = r1_bio->mddev; + /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { bio_free_pages(r1_bio->behind_master_bio); bio_put(r1_bio->behind_master_bio); r1_bio->behind_master_bio = NULL; } + /* clear the bitmap if all writes complete successfully */ - md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); - md_write_end(r1_bio->mddev); + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(mddev); } static void r1_bio_write_done(struct r1bio *r1_bio) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8f9172c3329a..ce28150e0464 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -426,12 +426,13 @@ static void raid10_end_read_request(struct bio *bio) static void close_write(struct r10bio *r10_bio) { + struct mddev *mddev = r10_bio->mddev; + /* clear the bitmap if all writes complete successfully */ - md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + false); + md_write_end(mddev); } static void one_write_done(struct r10bio *r10_bio) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 874874fe4fa1..23f2cbcf1a6c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -313,10 +313,10 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + false); } } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c24036d1e6da..93d582b9f922 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3663,8 +3663,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = nextbi; } if (bitmap_end) - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0, 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + false, false); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3709,8 +3710,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } } if (bitmap_end) - 
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0, 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + false, false); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -4059,10 +4061,10 @@ returnbi: bio_endio(wbi); wbi = wbi2; } - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + conf->mddev->bitmap_ops->endwrite(conf->mddev, + sh->sector, RAID5_STRIPE_SECTORS(conf), + !test_bit(STRIPE_DEGRADED, &sh->state), + false); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, From fe6a19d40ceb44281905485f56dda715e3214e0e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:35 +0800 Subject: [PATCH 055/120] md/md-bitmap: merge md_bitmap_start_sync() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. Also fix lots of code style. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-26-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 36 ++++++++++++++++++++---------------- drivers/md/md-bitmap.h | 3 ++- drivers/md/raid1.c | 10 +++++----- drivers/md/raid10.c | 22 ++++++++++++---------- drivers/md/raid5.c | 10 ++++++---- 5 files changed, 45 insertions(+), 36 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 4d174ab25339..57a4bae50e9c 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1579,24 +1579,26 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset, } } -static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) +static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, + sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; - int rv; + bool rv; + if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ *blocks = 1024; - return 1; /* always resync if no bitmap */ + return true; /* always resync if no bitmap */ } spin_lock_irq(&bitmap->counts.lock); + + rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); - rv = 0; if (bmc) { /* locked */ - if (RESYNC(*bmc)) - rv = 1; - else if (NEEDED(*bmc)) { - rv = 1; + if (RESYNC(*bmc)) { + rv = true; + } else if (NEEDED(*bmc)) { + rv = true; if (!degraded) { /* don't set/clear bits if degraded */ *bmc |= RESYNC_MASK; *bmc &= ~NEEDED_MASK; @@ -1604,11 +1606,12 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t } } spin_unlock_irq(&bitmap->counts.lock); + return rv; } -int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) +static bool bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) { /* bitmap_start_sync must always report on multiples of whole * pages, otherwise resync (which is very PAGE_SIZE based) will @@ -1617,19 +1620,19 @@ int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *block * At least PAGE_SIZE>>9 blocks are covered. * Return the 'or' of the result. 
*/ - int rv = 0; + bool rv = false; sector_t blocks1; *blocks = 0; while (*blocks < (PAGE_SIZE>>9)) { - rv |= __bitmap_start_sync(bitmap, offset, + rv |= __bitmap_start_sync(mddev->bitmap, offset, &blocks1, degraded); offset += blocks1; *blocks += blocks1; } + return rv; } -EXPORT_SYMBOL(md_bitmap_start_sync); void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) { @@ -1723,7 +1726,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev, WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); for (sector = old_hi; sector < new_hi; ) { - md_bitmap_start_sync(bitmap, sector, &blocks, 0); + bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); @@ -2005,7 +2008,7 @@ static int bitmap_load(struct mddev *mddev) */ while (sector < mddev->resync_max_sectors) { sector_t blocks; - md_bitmap_start_sync(bitmap, sector, &blocks, 0); + bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } md_bitmap_close_sync(bitmap); @@ -2734,6 +2737,7 @@ static struct bitmap_operations bitmap_ops = { .startwrite = bitmap_startwrite, .endwrite = bitmap_endwrite, + .start_sync = bitmap_start_sync, .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 056a80ee500f..67c8f22e8726 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -259,6 +259,8 @@ struct bitmap_operations { unsigned long sectors, bool behind); void (*endwrite)(struct mddev *mddev, sector_t offset, unsigned long sectors, bool success, bool behind); + bool (*start_sync)(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded); void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); @@ -268,7 +270,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void md_bitmap_close_sync(struct bitmap *bitmap); void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b335293ec472..8b83dd83e15f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2755,7 +2755,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int wonly = -1; int write_targets = 0, read_targets = 0; sector_t sync_blocks; - int still_degraded = 0; + bool still_degraded = false; int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ int idx = sector_to_idx(sector_nr); @@ -2797,7 +2797,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. 
* This call the bitmap_start_sync doesn't actually record anything */ - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2848,7 +2848,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) - still_degraded = 1; + still_degraded = true; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_opf = REQ_OP_WRITE; bio->bi_end_io = end_sync_write; @@ -2972,8 +2972,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, still_degraded) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ce28150e0464..d88878741ed4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3289,10 +3289,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio = NULL; for (i = 0 ; i < conf->geo.raid_disks; i++) { - int still_degraded; + bool still_degraded; struct r10bio *rb2; sector_t sect; - int must_sync; + bool must_sync; int any_working; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3309,7 +3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (!mrdev && !mreplace) continue; - still_degraded = 0; + still_degraded = false; /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); @@ -3322,8 +3322,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, 1); + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, + &sync_blocks, + true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3361,13 +3362,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, struct md_rdev *rdev = conf->mirrors[j].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - still_degraded = 1; + still_degraded = false; break; } } - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, still_degraded); + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, + &sync_blocks, still_degraded); any_working = 0; for (j=0; jcopies;j++) { @@ -3544,8 +3545,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, mddev->degraded) && + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, + &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 93d582b9f922..313904dd6555 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6485,7 +6485,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n struct r5conf *conf = mddev->private; struct stripe_head 
*sh;
 	sector_t sync_blocks;
-	int still_degraded = 0;
+	bool still_degraded = false;
 	int i;

 	if (sector_nr >= max_sector) {
@@ -6530,7 +6530,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 	}
 	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    !conf->fullsync &&
-	    !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+	    !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+					   true) &&
 	    sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
 		/* we can skip this block, and probably more */
 		do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
@@ -6558,10 +6559,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 		struct md_rdev *rdev = conf->disks[i].rdev;

 		if (rdev == NULL || test_bit(Faulty, &rdev->flags))
-			still_degraded = 1;
+			still_degraded = true;
 	}

-	md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
+	mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+				      still_degraded);

 	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
 	set_bit(STRIPE_HANDLE, &sh->state);

From 9be669bd1b031f40b35034223517cf886e7a7fcc Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 26 Aug 2024 15:44:36 +0800
Subject: [PATCH 056/120] md/md-bitmap: remove the parameter 'aborted' for
 md_bitmap_end_sync()

For internal callers, 'aborted' is always set to false, while for
external callers it is always set to true. Hence there is no need to
always pass in true for the exported API.

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240826074452.1490072-27-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md-bitmap.c | 15 +++++++++++----
 drivers/md/md-bitmap.h |  3 ++-
 drivers/md/raid1.c     |  4 ++--
 drivers/md/raid10.c    |  4 ++--
 drivers/md/raid5.c     |  2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 57a4bae50e9c..7e810fc2852e 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1634,7 +1634,8 @@ static bool bitmap_start_sync(struct mddev *mddev, sector_t offset,
 	return rv;
 }

-void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted)
+static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
+			      sector_t *blocks, bool aborted)
 {
 	bitmap_counter_t *bmc;
 	unsigned long flags;
@@ -1663,6 +1664,12 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks
 unlock:
 	spin_unlock_irqrestore(&bitmap->counts.lock, flags);
 }
+
+void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
+			sector_t *blocks)
+{
+	__bitmap_end_sync(bitmap, offset, blocks, true);
+}
 EXPORT_SYMBOL(md_bitmap_end_sync);

 void md_bitmap_close_sync(struct bitmap *bitmap)
@@ -1676,7 +1683,7 @@ void md_bitmap_close_sync(struct bitmap *bitmap)
 	if (!bitmap)
 		return;
 	while (sector < bitmap->mddev->resync_max_sectors) {
-		md_bitmap_end_sync(bitmap, sector, &blocks, 0);
+		__bitmap_end_sync(bitmap, sector, &blocks, false);
 		sector += blocks;
 	}
 }
@@ -1704,7 +1711,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
 	sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
-		md_bitmap_end_sync(bitmap, s, &blocks, 0);
+		__bitmap_end_sync(bitmap, s, &blocks, false);
 		s += blocks;
 	}
 	bitmap->last_end_sync = jiffies;
@@ -1720,7 +1727,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev,
 	sector_t sector, blocks = 0;

 	for (sector = old_lo; sector < new_lo; ) {
-		md_bitmap_end_sync(bitmap, sector,
&blocks, 0); + __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 67c8f22e8726..6691524bdc80 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -270,7 +270,8 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); +void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, + sector_t *blocks); void md_bitmap_close_sync(struct bitmap *bitmap); void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void md_bitmap_sync_with_cluster(struct mddev *mddev, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8b83dd83e15f..5ee929c89cb1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2026,7 +2026,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); + md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2773,7 +2773,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, */ if (mddev->curr_resync < max_sector) /* aborted */ md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + &sync_blocks); else /* completed sync */ conf->fullsync = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d88878741ed4..33372d30fa99 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3195,12 +3195,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); md_bitmap_end_sync(mddev->bitmap, sect, - &sync_blocks, 1); + &sync_blocks); } } else { /* completed sync */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 313904dd6555..3e9fed1e1153 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6498,7 +6498,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n if (mddev->curr_resync < max_sector) /* aborted */ md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + &sync_blocks); else /* completed sync */ conf->fullsync = 0; md_bitmap_close_sync(mddev->bitmap); From 1415f402e1a1e6054377ed15f249589204cfb8b7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:37 +0800 Subject: [PATCH 057/120] md/md-bitmap: merge md_bitmap_end_sync() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. 
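As a hedged sketch of the calling convention (the helper name is
invented): each call reports through *blocks how much of the range it
covered, so callers walk a region step by step, exactly as
abort_sync_write() does in the raid1 hunk below:

	static void toy_end_sync_range(struct mddev *mddev, sector_t s,
				       long sectors)
	{
		sector_t blocks;

		while (sectors > 0) {
			mddev->bitmap_ops->end_sync(mddev, s, &blocks);
			s += blocks;
			sectors -= blocks;
		}
	}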
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-28-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 8 ++++---- drivers/md/md-bitmap.h | 3 +-- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 10 ++++++---- drivers/md/raid5.c | 4 ++-- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 7e810fc2852e..8aef907da158 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1665,12 +1665,11 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, spin_unlock_irqrestore(&bitmap->counts.lock, flags); } -void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, - sector_t *blocks) +static void bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) { - __bitmap_end_sync(bitmap, offset, blocks, true); + __bitmap_end_sync(mddev->bitmap, offset, blocks, true); } -EXPORT_SYMBOL(md_bitmap_end_sync); void md_bitmap_close_sync(struct bitmap *bitmap) { @@ -2745,6 +2744,7 @@ static struct bitmap_operations bitmap_ops = { .startwrite = bitmap_startwrite, .endwrite = bitmap_endwrite, .start_sync = bitmap_start_sync, + .end_sync = bitmap_end_sync, .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 6691524bdc80..d6f2d5979da4 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -261,6 +261,7 @@ struct bitmap_operations { unsigned long sectors, bool success, bool behind); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); + void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); @@ -270,8 +271,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, - sector_t *blocks); void md_bitmap_close_sync(struct bitmap *bitmap); void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void md_bitmap_sync_with_cluster(struct mddev *mddev, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5ee929c89cb1..b50ba8074281 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2026,7 +2026,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. 
*/ do { - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks); + mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2772,8 +2772,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks); + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 33372d30fa99..15299a7774a0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3194,13 +3194,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks); + mddev->bitmap_ops->end_sync(mddev, + mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - md_bitmap_end_sync(mddev->bitmap, sect, - &sync_blocks); + + mddev->bitmap_ops->end_sync(mddev, sect, + &sync_blocks); } } else { /* completed sync */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3e9fed1e1153..89ae149bf28e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6497,8 +6497,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks); + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; md_bitmap_close_sync(mddev->bitmap); From 077b18abde12f27da9776563413bbad4918ecb52 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:38 +0800 Subject: [PATCH 058/120] md/md-bitmap: merge md_bitmap_close_sync() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-29-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 9 ++++++--- drivers/md/md-bitmap.h | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 8aef907da158..e1dceff2d9a5 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1671,7 +1671,7 @@ static void bitmap_end_sync(struct mddev *mddev, sector_t offset, __bitmap_end_sync(mddev->bitmap, offset, blocks, true); } -void md_bitmap_close_sync(struct bitmap *bitmap) +static void bitmap_close_sync(struct mddev *mddev) { /* Sync has finished, and any bitmap chunks that weren't synced * properly have been aborted. 
It remains to us to clear the @@ -1679,14 +1679,16 @@ void md_bitmap_close_sync(struct bitmap *bitmap) */ sector_t sector = 0; sector_t blocks; + struct bitmap *bitmap = mddev->bitmap; + if (!bitmap) return; + while (sector < bitmap->mddev->resync_max_sectors) { __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } } -EXPORT_SYMBOL(md_bitmap_close_sync); void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) { @@ -2017,7 +2019,7 @@ static int bitmap_load(struct mddev *mddev) bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } - md_bitmap_close_sync(bitmap); + bitmap_close_sync(mddev); if (mddev->degraded == 0 || bitmap->events_cleared == mddev->events) @@ -2745,6 +2747,7 @@ static struct bitmap_operations bitmap_ops = { .endwrite = bitmap_endwrite, .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, + .close_sync = bitmap_close_sync, .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index d6f2d5979da4..5d919b530317 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -262,6 +262,7 @@ struct bitmap_operations { bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); + void (*close_sync)(struct mddev *mddev); void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); @@ -271,7 +272,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_close_sync(struct bitmap *bitmap); void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t old_lo, sector_t old_hi, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b50ba8074281..a6757c9a2894 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2777,7 +2777,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, else /* completed sync */ conf->fullsync = 0; - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 15299a7774a0..5b1c86c368b1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3222,7 +3222,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 89ae149bf28e..d2b8d2517abf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6501,7 +6501,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n &sync_blocks); else /* completed sync */ conf->fullsync = 0; - md_bitmap_close_sync(mddev->bitmap); + mddev->bitmap_ops->close_sync(mddev); return 0; } From 15db1eca63bd954997058edc1563223d7b525003 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:39 +0800 Subject: [PATCH 059/120] md/md-bitmap: merge md_bitmap_cond_end_sync() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Also change the parameter from bitmap to mddev, to avoid access bitmap outside md-bitmap.c as much as possible. 
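As a hedged sketch of how callers are expected to compute 'force' (the
helper name is invented; the expression matches the raid1 and raid10
hunks below):

	static bool toy_force_end_sync(struct mddev *mddev,
				       sector_t sector_nr,
				       sector_t cluster_sync_high)
	{
		/*
		 * Only a clustered resync that is about to cross
		 * cluster_sync_high must trim the completed window
		 * right away; other callers pass false and let the
		 * bitmap decide when to do the work.
		 */
		return mddev_is_clustered(mddev) &&
		       sector_nr + 2 * RESYNC_SECTORS > cluster_sync_high;
	}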
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-30-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 ++++-- drivers/md/md-bitmap.h | 2 +- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index e1dceff2d9a5..2d9d9689f721 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1690,10 +1690,12 @@ static void bitmap_close_sync(struct mddev *mddev) } } -void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) +static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) { sector_t s = 0; sector_t blocks; + struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; @@ -1718,7 +1720,6 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) bitmap->last_end_sync = jiffies; sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } -EXPORT_SYMBOL(md_bitmap_cond_end_sync); void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t old_lo, sector_t old_hi, @@ -2747,6 +2748,7 @@ static struct bitmap_operations bitmap_ops = { .endwrite = bitmap_endwrite, .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, + .cond_end_sync = bitmap_cond_end_sync, .close_sync = bitmap_close_sync, .update_sb = bitmap_update_sb, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 5d919b530317..027de097f96a 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -262,6 +262,7 @@ struct bitmap_operations { bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); + void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force); void (*close_sync)(struct mddev *mddev); void (*update_sb)(struct bitmap *bitmap); @@ -272,7 +273,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t old_lo, sector_t old_hi, sector_t new_lo, sector_t new_hi); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a6757c9a2894..119c9477a453 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2815,9 +2815,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * sector_nr + two times RESYNC_SECTORS */ - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, - mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5b1c86c368b1..5a7b19f48c45 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3543,7 +3543,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. 
*/ - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d2b8d2517abf..87b8d19ab601 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6540,7 +6540,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * RAID5_STRIPE_SECTORS(conf); } - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); From 4338b94271dd8cee8ce3b662b12528cd009325a3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:40 +0800 Subject: [PATCH 060/120] md/md-bitmap: merge md_bitmap_sync_with_cluster() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-31-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 9 +++++---- drivers/md/md-bitmap.h | 8 ++++---- drivers/md/md-cluster.c | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 2d9d9689f721..9fe97f14a719 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1721,9 +1721,9 @@ static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector, sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } -void md_bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi) +static void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi) { struct bitmap *bitmap = mddev->bitmap; sector_t sector, blocks = 0; @@ -1740,7 +1740,6 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev, } WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); } -EXPORT_SYMBOL(md_bitmap_sync_with_cluster); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { @@ -2753,6 +2752,8 @@ static struct bitmap_operations bitmap_ops = { .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, + + .sync_with_cluster = bitmap_sync_with_cluster, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 027de097f96a..0953ac73735c 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -267,16 +267,16 @@ struct bitmap_operations { void (*update_sb)(struct bitmap *bitmap); int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); + + void (*sync_with_cluster)(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); }; /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi); - void md_bitmap_unplug(struct bitmap *bitmap); void md_bitmap_unplug_async(struct bitmap *bitmap); void md_bitmap_daemon_work(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index a5f1135cc1fa..55feabe14ad3 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -497,8 +497,8 @@ static void process_suspend_info(struct mddev *mddev, * we don't want to 
trigger lots of WARN. */ if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) - md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, - cinfo->sync_hi, lo, hi); + mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, lo, hi); cinfo->sync_low = lo; cinfo->sync_hi = hi; From 48eb95810a9241afd871de917d70712e2ddfda31 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:41 +0800 Subject: [PATCH 061/120] md/md-bitmap: merge md_bitmap_unplug_async() into md_bitmap_unplug() Add a parameter 'bool sync' to distinguish them, and md_bitmap_unplug_async() won't be exported anymore, hence bitmap_operations only need one op to cover them. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-32-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 28 ++++++++++++++++++++-------- drivers/md/md-bitmap.h | 3 +-- drivers/md/md.c | 2 +- drivers/md/raid1-10.c | 7 ++----- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 4 ++-- drivers/md/raid5.c | 2 +- 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 9fe97f14a719..ecfc3a02a976 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1026,7 +1026,7 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ -void md_bitmap_unplug(struct bitmap *bitmap) +static void __bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; @@ -1058,7 +1058,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) md_bitmap_file_kick(bitmap); } -EXPORT_SYMBOL(md_bitmap_unplug); struct bitmap_unplug_work { struct work_struct work; @@ -1071,11 +1070,11 @@ static void md_bitmap_unplug_fn(struct work_struct *work) struct bitmap_unplug_work *unplug_work = container_of(work, struct bitmap_unplug_work, work); - md_bitmap_unplug(unplug_work->bitmap); + __bitmap_unplug(unplug_work->bitmap); complete(unplug_work->done); } -void md_bitmap_unplug_async(struct bitmap *bitmap) +static void bitmap_unplug_async(struct bitmap *bitmap) { DECLARE_COMPLETION_ONSTACK(done); struct bitmap_unplug_work unplug_work; @@ -1087,7 +1086,20 @@ void md_bitmap_unplug_async(struct bitmap *bitmap) queue_work(md_bitmap_wq, &unplug_work.work); wait_for_completion(&done); } -EXPORT_SYMBOL(md_bitmap_unplug_async); + +void md_bitmap_unplug(struct mddev *mddev, bool sync) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return; + + if (sync) + __bitmap_unplug(bitmap); + else + bitmap_unplug_async(bitmap); +} +EXPORT_SYMBOL_GPL(md_bitmap_unplug); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); @@ -2108,9 +2120,9 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, for (i = 0; i < bitmap->storage.file_pages; i++) if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); - md_bitmap_unplug(bitmap); + __bitmap_unplug(bitmap); } - md_bitmap_unplug(mddev->bitmap); + __bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; md_bitmap_free(bitmap); @@ -2351,7 +2363,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, spin_unlock_irq(&bitmap->counts.lock); if (!init) { - md_bitmap_unplug(bitmap); + __bitmap_unplug(bitmap); 
bitmap->mddev->pers->quiesce(bitmap->mddev, 0); } ret = 0; diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 0953ac73735c..ba8ba7e49ef9 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -277,8 +277,7 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_unplug(struct bitmap *bitmap); -void md_bitmap_unplug_async(struct bitmap *bitmap); +void md_bitmap_unplug(struct mddev *mddev, bool sync); void md_bitmap_daemon_work(struct mddev *mddev); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, diff --git a/drivers/md/md.c b/drivers/md/md.c index f72fa8e872f8..b3bc0c76aeb8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4715,7 +4715,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } - md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ + md_bitmap_unplug(mddev, true); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 2ea1710a3b70..45b30f08f3a5 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -166,12 +166,9 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * while current io submission must wait for bitmap io to be done. In order to * avoid such deadlock, submit bitmap io asynchronously. */ -static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) +static inline void raid1_prepare_flush_writes(struct mddev *mddev) { - if (current->bio_list) - md_bitmap_unplug_async(bitmap); - else - md_bitmap_unplug(bitmap); + md_bitmap_unplug(mddev, current->bio_list == NULL); } /* diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 119c9477a453..f9338756088a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -896,7 +896,7 @@ static void wake_up_barrier(struct r1conf *conf) static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - raid1_prepare_flush_writes(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev); wake_up_barrier(conf); while (bio) { /* submit pending writes */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5a7b19f48c45..c79f374668dd 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -885,7 +885,7 @@ static void flush_pending_writes(struct r10conf *conf) __set_current_state(TASK_RUNNING); blk_start_plug(&plug); - raid1_prepare_flush_writes(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1101,7 +1101,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. 
*/ bio = bio_list_get(&plug->pending); - raid1_prepare_flush_writes(mddev->bitmap); + raid1_prepare_flush_writes(mddev); wake_up_barrier(conf); while (bio) { /* submit pending writes */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 87b8d19ab601..e98061c01b44 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6768,7 +6768,7 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - md_bitmap_unplug(mddev->bitmap); + md_bitmap_unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); From 3c9883e77a36ca76b8d92afa99599263ca587ae7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:42 +0800 Subject: [PATCH 062/120] md/md-bitmap: merge bitmap_unplug() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-33-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 4 ++-- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 2 +- drivers/md/raid1-10.c | 2 +- drivers/md/raid5.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index ecfc3a02a976..4e6e31e364f6 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1087,7 +1087,7 @@ static void bitmap_unplug_async(struct bitmap *bitmap) wait_for_completion(&done); } -void md_bitmap_unplug(struct mddev *mddev, bool sync) +static void bitmap_unplug(struct mddev *mddev, bool sync) { struct bitmap *bitmap = mddev->bitmap; @@ -1099,7 +1099,6 @@ void md_bitmap_unplug(struct mddev *mddev, bool sync) else bitmap_unplug_async(bitmap); } -EXPORT_SYMBOL_GPL(md_bitmap_unplug); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); @@ -2754,6 +2753,7 @@ static struct bitmap_operations bitmap_ops = { .flush = bitmap_flush, .write_all = bitmap_write_all, .dirty_bits = bitmap_dirty_bits, + .unplug = bitmap_unplug, .startwrite = bitmap_startwrite, .endwrite = bitmap_endwrite, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index ba8ba7e49ef9..dbe9b27091f4 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -254,6 +254,7 @@ struct bitmap_operations { void (*write_all)(struct mddev *mddev); void (*dirty_bits)(struct mddev *mddev, unsigned long s, unsigned long e); + void (*unplug)(struct mddev *mddev, bool sync); int (*startwrite)(struct mddev *mddev, sector_t offset, unsigned long sectors, bool behind); @@ -277,7 +278,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_unplug(struct mddev *mddev, bool sync); void md_bitmap_daemon_work(struct mddev *mddev); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, diff --git a/drivers/md/md.c b/drivers/md/md.c index b3bc0c76aeb8..b45d9b645606 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4715,7 +4715,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } - md_bitmap_unplug(mddev, true); /* flush the bits to disk */ + mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 
45b30f08f3a5..e8207513eb1b 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -168,7 +168,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, */ static inline void raid1_prepare_flush_writes(struct mddev *mddev) { - md_bitmap_unplug(mddev, current->bio_list == NULL); + mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL); } /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e98061c01b44..91b610d11c6a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6768,7 +6768,7 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - md_bitmap_unplug(mddev, true); + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); From 18db2a9c60aefc61e796f6a384a952999d3b8885 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:43 +0800 Subject: [PATCH 063/120] md/md-bitmap: merge md_bitmap_daemon_work() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-34-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 9 +++++---- drivers/md/md-bitmap.h | 2 +- drivers/md/md.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 4e6e31e364f6..efe928849c94 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1298,7 +1298,7 @@ out: * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ -void md_bitmap_daemon_work(struct mddev *mddev) +static void bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; unsigned long j; @@ -1815,11 +1815,11 @@ static void bitmap_flush(struct mddev *mddev) */ sleep = mddev->bitmap_info.daemon_sleep * 2; bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - md_bitmap_daemon_work(mddev); + bitmap_daemon_work(mddev); if (mddev->bitmap_info.external) md_super_wait(mddev); bitmap_update_sb(bitmap); @@ -2754,6 +2754,7 @@ static struct bitmap_operations bitmap_ops = { .write_all = bitmap_write_all, .dirty_bits = bitmap_dirty_bits, .unplug = bitmap_unplug, + .daemon_work = bitmap_daemon_work, .startwrite = bitmap_startwrite, .endwrite = bitmap_endwrite, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index dbe9b27091f4..065b36c0c43a 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -255,6 +255,7 @@ struct bitmap_operations { void (*dirty_bits)(struct mddev *mddev, unsigned long s, unsigned long e); void (*unplug)(struct mddev *mddev, bool sync); + void (*daemon_work)(struct mddev *mddev); int (*startwrite)(struct mddev *mddev, sector_t offset, unsigned long sectors, bool behind); @@ -278,7 +279,6 @@ struct bitmap_operations { void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_daemon_work(struct mddev *mddev); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init); diff --git a/drivers/md/md.c b/drivers/md/md.c index b45d9b645606..00932bd9b0c8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9638,7 +9638,7 @@ static void 
unregister_sync_thread(struct mddev *mddev) void md_check_recovery(struct mddev *mddev) { if (mddev->bitmap) - md_bitmap_daemon_work(mddev); + mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { From e1791dae6cbd65e5102dca40b8adadef1d89c1b9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:44 +0800 Subject: [PATCH 064/120] md/md-bitmap: pass in mddev directly for md_bitmap_resize() And move the condition "if (mddev->bitmap)" into md_bitmap_resize() as well, on the one hand make code cleaner, on the other hand try not to access bitmap directly. Since we are here, also change the parameter 'init' from int to bool. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-35-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/dm-raid.c | 2 +- drivers/md/md-bitmap.c | 21 ++++++++++++++++++--- drivers/md/md-bitmap.h | 4 ++-- drivers/md/md-cluster.c | 4 ++-- drivers/md/raid1.c | 12 +++++++----- drivers/md/raid10.c | 17 +++++++++-------- drivers/md/raid5.c | 11 ++++++----- 7 files changed, 45 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c3e201fde4c5..cc071fcd7a04 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -4068,7 +4068,7 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); + r = md_bitmap_resize(mddev, mddev->dev_sectors, chunksize, false); if (r) DMERR("Failed to resize bitmap"); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index efe928849c94..c2292546d6ce 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -32,6 +32,9 @@ #include "md.h" #include "md-bitmap.h" +static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, bool init); + static inline char *bmname(struct bitmap *bitmap) { return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; @@ -1975,7 +1978,8 @@ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) goto error; bitmap->daemon_lastrun = jiffies; - err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, + true); if (err) goto error; @@ -2163,8 +2167,8 @@ static int bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats return 0; } -int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init) +static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, bool init) { /* If chunk_size is 0, choose an appropriate chunk size. * Then possibly allocate new storage space. 
@@ -2369,6 +2373,17 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, err: return ret; } + +int md_bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, + bool init) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return 0; + + return __bitmap_resize(bitmap, blocks, chunksize, init); +} EXPORT_SYMBOL_GPL(md_bitmap_resize); static ssize_t diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 065b36c0c43a..4610e50548eb 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -280,8 +280,8 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init); +int md_bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, + bool init); struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 55feabe14ad3..a7e5ead71c2f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -628,8 +628,8 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) break; case BITMAP_RESIZE: if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) - ret = md_bitmap_resize(mddev->bitmap, - le64_to_cpu(msg->high), 0, 0); + ret = md_bitmap_resize(mddev, le64_to_cpu(msg->high), 0, + false); break; default: ret = -1; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f9338756088a..cf05bafeae77 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3297,14 +3297,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); + int ret; + if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); - if (ret) - return ret; - } + + ret = md_bitmap_resize(mddev, newsize, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c79f374668dd..4512d285b76a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4196,6 +4196,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; + int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4208,11 +4209,11 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) if (mddev->external_size && mddev->array_sectors > size) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); - if (ret) - return ret; - } + + ret = md_bitmap_resize(mddev, size, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { @@ -4478,7 +4479,7 @@ static int raid10_start_reshape(struct mddev *mddev) newsize = raid10_size(mddev, 0, conf->geo.raid_disks); if (!mddev_is_clustered(mddev)) { - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + ret = md_bitmap_resize(mddev, newsize, 0, false); if (ret) goto abort; else @@ -4500,13 +4501,13 @@ static int raid10_start_reshape(struct mddev *mddev) MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + ret = md_bitmap_resize(mddev, newsize, 0, 
false); if (ret) goto abort; ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); + md_bitmap_resize(mddev, oldsize, 0, false); goto abort; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 91b610d11c6a..47c89f7b1dfe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8313,6 +8313,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; + int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8321,11 +8322,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - if (mddev->bitmap) { - int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); - if (ret) - return ret; - } + + ret = md_bitmap_resize(mddev, sectors, 0, false); + if (ret) + return ret; + md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { From 77c09640eea56dbfed069ac67b1cd79397d41be8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:45 +0800 Subject: [PATCH 065/120] md/md-bitmap: merge md_bitmap_resize() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-36-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/dm-raid.c | 3 ++- drivers/md/md-bitmap.c | 6 +++--- drivers/md/md-bitmap.h | 5 +++-- drivers/md/md-cluster.c | 5 +++-- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 10 +++++----- drivers/md/raid5.c | 2 +- 7 files changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index cc071fcd7a04..63682d27fc8d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -4068,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = md_bitmap_resize(mddev, mddev->dev_sectors, chunksize, false); + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize, false); if (r) DMERR("Failed to resize bitmap"); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c2292546d6ce..0506b54f6322 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2374,8 +2374,8 @@ err: return ret; } -int md_bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, - bool init) +static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, + bool init) { struct bitmap *bitmap = mddev->bitmap; @@ -2384,7 +2384,6 @@ int md_bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, return __bitmap_resize(bitmap, blocks, chunksize, init); } -EXPORT_SYMBOL_GPL(md_bitmap_resize); static ssize_t location_show(struct mddev *mddev, char *page) @@ -2763,6 +2762,7 @@ const struct attribute_group md_bitmap_group = { static struct bitmap_operations bitmap_ops = { .create = bitmap_create, + .resize = bitmap_resize, .load = bitmap_load, .destroy = bitmap_destroy, .flush = bitmap_flush, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 4610e50548eb..d05fc2f1451e 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -248,6 +248,9 @@ struct md_bitmap_stats { struct bitmap_operations { int (*create)(struct mddev 
*mddev, int slot); + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, + bool init); + int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); void (*flush)(struct mddev *mddev); @@ -280,8 +283,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -int md_bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, - bool init); struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index a7e5ead71c2f..21cf0f38cbf8 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -628,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) break; case BITMAP_RESIZE: if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) - ret = md_bitmap_resize(mddev, le64_to_cpu(msg->high), 0, - false); + ret = mddev->bitmap_ops->resize(mddev, + le64_to_cpu(msg->high), + 0, false); break; default: ret = -1; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cf05bafeae77..3e678af8f28a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3303,7 +3303,7 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = md_bitmap_resize(mddev, newsize, 0, false); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) return ret; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4512d285b76a..f3bf1116794a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4210,7 +4210,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; - ret = md_bitmap_resize(mddev, size, 0, false); + ret = mddev->bitmap_ops->resize(mddev, size, 0, false); if (ret) return ret; @@ -4479,7 +4479,7 @@ static int raid10_start_reshape(struct mddev *mddev) newsize = raid10_size(mddev, 0, conf->geo.raid_disks); if (!mddev_is_clustered(mddev)) { - ret = md_bitmap_resize(mddev, newsize, 0, false); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) goto abort; else @@ -4494,20 +4494,20 @@ static int raid10_start_reshape(struct mddev *mddev) /* * some node is already performing reshape, and no need to - * call md_bitmap_resize again since it should be called when + * call bitmap_ops->resize again since it should be called when * receiving BITMAP_RESIZE msg */ if ((sb && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = md_bitmap_resize(mddev, newsize, 0, false); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) goto abort; ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - md_bitmap_resize(mddev, oldsize, 0, false); + mddev->bitmap_ops->resize(mddev, oldsize, 0, false); goto abort; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 47c89f7b1dfe..c84a7e0263cd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8323,7 +8323,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = md_bitmap_resize(mddev, sectors, 0, false); + ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); if (ret) return ret; From 57d602414d2e327160f2e03c39eff3f2f895839c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:46 +0800 Subject: [PATCH 066/120] md/md-bitmap: merge get_bitmap_from_slot() into 
bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-37-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 +++--- drivers/md/md-bitmap.h | 2 +- drivers/md/md-cluster.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0506b54f6322..a75f4216fd40 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2065,7 +2065,7 @@ out: } /* caller need to free returned bitmap with md_bitmap_free() */ -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) +static struct bitmap *bitmap_get_from_slot(struct mddev *mddev, int slot) { int rv = 0; struct bitmap *bitmap; @@ -2084,7 +2084,6 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) return bitmap; } -EXPORT_SYMBOL(get_bitmap_from_slot); /* Loads the bitmap associated with slot and copies the resync information * to our bitmap @@ -2097,7 +2096,7 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, struct bitmap_counts *counts; struct bitmap *bitmap; - bitmap = get_bitmap_from_slot(mddev, slot); + bitmap = bitmap_get_from_slot(mddev, slot); if (IS_ERR(bitmap)) { pr_err("%s can't get bitmap from slot %d\n", __func__, slot); return -1; @@ -2782,6 +2781,7 @@ static struct bitmap_operations bitmap_ops = { .get_stats = bitmap_get_stats, .sync_with_cluster = bitmap_sync_with_cluster, + .get_from_slot = bitmap_get_from_slot, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index d05fc2f1451e..6bcb88b71371 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -276,6 +276,7 @@ struct bitmap_operations { void (*sync_with_cluster)(struct mddev *mddev, sector_t old_lo, sector_t old_hi, sector_t new_lo, sector_t new_hi); + struct bitmap *(*get_from_slot)(struct mddev *mddev, int slot); }; /* the bitmap API */ @@ -283,7 +284,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 21cf0f38cbf8..81c630dcf9ea 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1169,7 +1169,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz if (i == md_cluster_ops->slot_number(mddev)) continue; - bitmap = get_bitmap_from_slot(mddev, i); + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); bitmap = NULL; @@ -1236,7 +1236,7 @@ static int cluster_check_sync_size(struct mddev *mddev) if (i == current_slot) continue; - bitmap = get_bitmap_from_slot(mddev, i); + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); return -1; From 3dd9141a154785f10770fb3e8349f6b133b08b8a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:47 +0800 Subject: [PATCH 067/120] md/md-bitmap: merge md_bitmap_copy_from_slot() into struct bitmap_operation. So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. 
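The resulting call shape is easiest to see in isolation. Below is a minimal compilable model of the ops-table dispatch, assuming simplified stand-in types; sector_t, struct mddev, and toy_copy_from_slot here are placeholders for illustration, not the kernel definitions:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long sector_t;    /* stand-in, not the kernel type */

struct mddev;                           /* opaque, as callers now treat it */

struct bitmap_operations {
        int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo,
                              sector_t *hi, bool clear_bits);
};

struct mddev {
        const struct bitmap_operations *bitmap_ops;
};

/* Toy implementation standing in for the real bitmap_copy_from_slot(). */
static int toy_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo,
                              sector_t *hi, bool clear_bits)
{
        (void)mddev; (void)slot; (void)clear_bits;
        *lo = 0;
        *hi = 8;
        return 0;
}

static const struct bitmap_operations toy_ops = {
        .copy_from_slot = toy_copy_from_slot,
};

int main(void)
{
        struct mddev md = { .bitmap_ops = &toy_ops };
        sector_t lo, hi;

        /* The caller names only the op slot, not the implementation. */
        if (md.bitmap_ops->copy_from_slot(&md, 0, &lo, &hi, false) == 0)
                printf("slot 0 resync range: %llu-%llu\n", lo, hi);
        return 0;
}

The indirection is the point of the series: callers such as md-cluster reach the bitmap only through the op slot, so an alternative bitmap implementation can supply its own copy_from_slot without touching them.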
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-38-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 +++--- drivers/md/md-bitmap.h | 4 ++-- drivers/md/md-cluster.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a75f4216fd40..25126cdee4e1 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2088,8 +2088,8 @@ static struct bitmap *bitmap_get_from_slot(struct mddev *mddev, int slot) /* Loads the bitmap associated with slot and copies the resync information * to our bitmap */ -int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *low, sector_t *high, bool clear_bits) +static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, + sector_t *high, bool clear_bits) { int rv = 0, i, j; sector_t block, lo = 0, hi = 0; @@ -2131,7 +2131,6 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, return rv; } -EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages) { @@ -2782,6 +2781,7 @@ static struct bitmap_operations bitmap_ops = { .sync_with_cluster = bitmap_sync_with_cluster, .get_from_slot = bitmap_get_from_slot, + .copy_from_slot = bitmap_copy_from_slot, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 6bcb88b71371..94d9a6ae244a 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -277,6 +277,8 @@ struct bitmap_operations { sector_t old_lo, sector_t old_hi, sector_t new_lo, sector_t new_hi); struct bitmap *(*get_from_slot)(struct mddev *mddev, int slot); + int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, + sector_t *hi, bool clear_bits); }; /* the bitmap API */ @@ -284,8 +286,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *lo, sector_t *hi, bool clear_bits); void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages); void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 81c630dcf9ea..515f46983ace 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -317,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread) str, ret); goto clear_bit; } - ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); + ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true); if (ret) { pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); goto clear_bit; @@ -857,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) } /* Read the disk bitmap sb and check if it needs recovery */ - ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false); + ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false); if (ret) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); lockres_free(bm_lockres); @@ -1600,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev) for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { if (sn == (cinfo->slot_number - 1)) continue; - err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); + err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false); if (err) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; From ef1c400fafe27f06ba1e8050fc2f35662fe8e106 Mon Sep 17 00:00:00 2001 From: 
Yu Kuai Date: Mon, 26 Aug 2024 15:44:48 +0800 Subject: [PATCH 068/120] md/md-bitmap: merge md_bitmap_set_pages() into struct bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-39-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 4 ++-- drivers/md/md-bitmap.h | 2 +- drivers/md/md-cluster.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 25126cdee4e1..70818c56004f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2132,11 +2132,10 @@ static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, return rv; } -void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages) +static void bitmap_set_pages(struct bitmap *bitmap, unsigned long pages) { bitmap->counts.pages = pages; } -EXPORT_SYMBOL_GPL(md_bitmap_set_pages); static int bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { @@ -2782,6 +2781,7 @@ static struct bitmap_operations bitmap_ops = { .sync_with_cluster = bitmap_sync_with_cluster, .get_from_slot = bitmap_get_from_slot, .copy_from_slot = bitmap_copy_from_slot, + .set_pages = bitmap_set_pages, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 94d9a6ae244a..7fee98e3d517 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -279,6 +279,7 @@ struct bitmap_operations { struct bitmap *(*get_from_slot)(struct mddev *mddev, int slot); int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); + void (*set_pages)(struct bitmap *bitmap, unsigned long pages); }; /* the bitmap API */ @@ -286,7 +287,6 @@ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ -void md_bitmap_set_pages(struct bitmap *bitmap, unsigned long pages); void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 515f46983ace..3296925b8455 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1192,7 +1192,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - md_bitmap_set_pages(bitmap, my_pages); + mddev->bitmap_ops->set_pages(bitmap, my_pages); lockres_free(bm_lockres); if (my_pages != stats.pages) From c65c20dc504b08204d6c3effffa9409a526822aa Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:49 +0800 Subject: [PATCH 069/120] md/md-bitmap: merge md_bitmap_free() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations.
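In the md-cluster hunks below, every exit path now releases the slot bitmap through the free op rather than calling into the implementation. A compact model of that unwind pattern, again with hypothetical toy_* helpers and simplified types standing in for the real code:

#include <stdio.h>
#include <stdlib.h>

struct bitmap { unsigned long pages; };

struct bitmap_operations {
        struct bitmap *(*get_from_slot)(int slot);
        void (*free)(struct bitmap *bitmap);
};

/* Stand-in for the real slot lookup. */
static struct bitmap *toy_get_from_slot(int slot)
{
        (void)slot;
        return calloc(1, sizeof(struct bitmap));
}

static void toy_free(struct bitmap *bitmap)
{
        free(bitmap);
}

static const struct bitmap_operations ops = {
        .get_from_slot = toy_get_from_slot,
        .free          = toy_free,
};

/* Model of the resize_bitmaps()/cluster_check_sync_size() shape: however
 * the function exits after the lookup, the bitmap goes back through
 * ops->free, never through a direct call into the implementation. */
static int check_slot(int slot, int simulate_error)
{
        struct bitmap *bitmap = ops.get_from_slot(slot);

        if (!bitmap)
                return -1;
        if (simulate_error)
                goto out;

        ops.free(bitmap);
        return 0;
out:
        ops.free(bitmap);
        return -1;
}

int main(void)
{
        printf("ok path: %d, error path: %d\n",
               check_slot(0, 0), check_slot(1, 1));
        return 0;
}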
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-40-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 7 ++----- drivers/md/md-bitmap.h | 3 +-- drivers/md/md-cluster.c | 12 ++++++------ 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 70818c56004f..62e4f9d5a559 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1828,10 +1828,7 @@ static void bitmap_flush(struct mddev *mddev) bitmap_update_sb(bitmap); } -/* - * free memory that was allocated - */ -void md_bitmap_free(struct bitmap *bitmap) +static void md_bitmap_free(struct bitmap *bitmap) { unsigned long k, pages; struct bitmap_page *bp; @@ -1865,7 +1862,6 @@ void md_bitmap_free(struct bitmap *bitmap) kfree(bp); kfree(bitmap); } -EXPORT_SYMBOL(md_bitmap_free); void md_bitmap_wait_behind_writes(struct mddev *mddev) { @@ -2782,6 +2778,7 @@ static struct bitmap_operations bitmap_ops = { .get_from_slot = bitmap_get_from_slot, .copy_from_slot = bitmap_copy_from_slot, .set_pages = bitmap_set_pages, + .free = md_bitmap_free, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 7fee98e3d517..7f6cc6f616b3 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -280,14 +280,13 @@ struct bitmap_operations { int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void (*set_pages)(struct bitmap *bitmap, unsigned long pages); + void (*free)(struct bitmap *bitmap); }; /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); /* these are exported */ - -void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); static inline bool md_bitmap_enabled(struct bitmap *bitmap) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 3296925b8455..7647ce4f76fa 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1201,12 +1201,12 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz * can't resize bitmap */ goto out; - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); } return 0; out: - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); update_bitmap_size(mddev, oldsize); return -1; } @@ -1250,7 +1250,7 @@ static int cluster_check_sync_size(struct mddev *mddev) bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) { pr_err("md-cluster: Cannot initialize %s\n", str); - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); return -1; } bm_lockres->flags |= DLM_LKF_NOQUEUE; @@ -1261,17 +1261,17 @@ static int cluster_check_sync_size(struct mddev *mddev) rv = mddev->bitmap_ops->get_stats(bitmap, &stats); if (rv) { - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); return rv; } if (sync_size == 0) { sync_size = stats.sync_size; } else if (sync_size != stats.sync_size) { - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); return -1; } - md_bitmap_free(bitmap); + mddev->bitmap_ops->free(bitmap); } return (my_sync_size == sync_size) ? 0 : -1; From 49f5f5e309e6127957babed7834f5a0e1022f936 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:50 +0800 Subject: [PATCH 070/120] md/md-bitmap: merge md_bitmap_wait_behind_writes() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. 
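Semantically, the new wait_behind_writes op blocks until the count of in-flight write-behind IOs drops to zero, as the wait_event() on behind_writes in the hunk below shows. Here is a rough userspace model using C11 atomics; note this sketch polls, whereas the kernel sleeps on the bitmap's behind_wait waitqueue:

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for bitmap->behind_writes. */
static atomic_int behind_writes;

static void behind_write_start(void)
{
        atomic_fetch_add(&behind_writes, 1);
}

static void behind_write_done(void)
{
        atomic_fetch_sub(&behind_writes, 1);
}

/* Model of bitmap_wait_behind_writes(): return only once every
 * write-behind IO has completed. */
static void wait_behind_writes(void)
{
        while (atomic_load(&behind_writes) != 0)
                ;       /* the kernel uses wait_event() instead of polling */
}

int main(void)
{
        behind_write_start();
        behind_write_done();
        wait_behind_writes();
        printf("no behind writes pending\n");
        return 0;
}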
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-41-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 +++--- drivers/md/md-bitmap.h | 4 +--- drivers/md/md.c | 2 +- drivers/md/raid1.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 62e4f9d5a559..b5dbfa9728dc 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1863,7 +1863,7 @@ static void md_bitmap_free(struct bitmap *bitmap) kfree(bitmap); } -void md_bitmap_wait_behind_writes(struct mddev *mddev) +static void bitmap_wait_behind_writes(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -1876,7 +1876,6 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev) atomic_read(&bitmap->behind_writes) == 0); } } -EXPORT_SYMBOL_GPL(md_bitmap_wait_behind_writes); static void bitmap_destroy(struct mddev *mddev) { @@ -1885,7 +1884,7 @@ static void bitmap_destroy(struct mddev *mddev) if (!bitmap) /* there was no bitmap */ return; - md_bitmap_wait_behind_writes(mddev); + bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy) mddev_destroy_serial_pool(mddev, NULL); @@ -2763,6 +2762,7 @@ static struct bitmap_operations bitmap_ops = { .dirty_bits = bitmap_dirty_bits, .unplug = bitmap_unplug, .daemon_work = bitmap_daemon_work, + .wait_behind_writes = bitmap_wait_behind_writes, .startwrite = bitmap_startwrite, .endwrite = bitmap_endwrite, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 7f6cc6f616b3..bc9cb4ca676e 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -259,6 +259,7 @@ struct bitmap_operations { unsigned long e); void (*unplug)(struct mddev *mddev, bool sync); void (*daemon_work)(struct mddev *mddev); + void (*wait_behind_writes)(struct mddev *mddev); int (*startwrite)(struct mddev *mddev, sector_t offset, unsigned long sectors, bool behind); @@ -286,9 +287,6 @@ struct bitmap_operations { /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); -/* these are exported */ -void md_bitmap_wait_behind_writes(struct mddev *mddev); - static inline bool md_bitmap_enabled(struct bitmap *bitmap) { return bitmap && bitmap->storage.filemap && diff --git a/drivers/md/md.c b/drivers/md/md.c index 00932bd9b0c8..6435688b7165 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6498,7 +6498,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - md_bitmap_wait_behind_writes(mddev); + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3e678af8f28a..b15725a7a581 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1371,7 +1371,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * over-take any writes that are 'behind' */ mddev_add_trace_msg(mddev, "raid1 wait behind writes"); - md_bitmap_wait_behind_writes(mddev); + mddev->bitmap_ops->wait_behind_writes(mddev); } if (max_sectors < bio_sectors(bio)) { From dab2ce5534ef7a7b8db70d1abd4225ee8a7798c7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:51 +0800 Subject: [PATCH 071/120] md/md-bitmap: merge md_bitmap_enabled() into bitmap_operations So that the implementation won't be exposed, and it'll be possible to invent a new bitmap by replacing bitmap_operations. 
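This patch also shows the two-level split used across the series: a double-underscore helper that assumes a valid bitmap, plus a public op that takes the mddev and handles the no-bitmap case itself. A minimal standalone model of that split, with simplified stand-in types rather than the kernel structures:

#include <stdbool.h>
#include <stdio.h>

struct bitmap { bool have_filemap; bool stale; };
struct mddev  { struct bitmap *bitmap; };

/* Private helper: the caller guarantees bitmap != NULL. */
static bool __bitmap_enabled(const struct bitmap *bitmap)
{
        return bitmap->have_filemap && !bitmap->stale;
}

/* Public op: takes the mddev and tolerates a missing bitmap. */
static bool bitmap_enabled(const struct mddev *mddev)
{
        const struct bitmap *bitmap = mddev->bitmap;

        return bitmap ? __bitmap_enabled(bitmap) : false;
}

int main(void)
{
        struct bitmap bm = { .have_filemap = true, .stale = false };
        struct mddev md = { .bitmap = &bm };

        printf("enabled: %d\n", bitmap_enabled(&md));
        md.bitmap = NULL;
        printf("enabled without bitmap: %d\n", bitmap_enabled(&md));
        return 0;
}

Internal callers that already hold a valid bitmap, such as __bitmap_unplug() above, keep using the cheap private helper, while external callers like raid1-10.c go through the NULL-safe op.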
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-42-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 19 ++++++++++++++++++- drivers/md/md-bitmap.h | 8 +------- drivers/md/raid1-10.c | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b5dbfa9728dc..098bffac2167 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -40,6 +40,22 @@ static inline char *bmname(struct bitmap *bitmap) return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } +static bool __bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + +static bool bitmap_enabled(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return false; + + return __bitmap_enabled(bitmap); +} + /* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * @@ -1035,7 +1051,7 @@ static void __bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!md_bitmap_enabled(bitmap)) + if (!__bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be @@ -2753,6 +2769,7 @@ const struct attribute_group md_bitmap_group = { }; static struct bitmap_operations bitmap_ops = { + .enabled = bitmap_enabled, .create = bitmap_create, .resize = bitmap_resize, .load = bitmap_load, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index bc9cb4ca676e..c720729687e2 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -247,6 +247,7 @@ struct md_bitmap_stats { }; struct bitmap_operations { + bool (*enabled)(struct mddev *mddev); int (*create)(struct mddev *mddev, int slot); int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, bool init); @@ -287,12 +288,5 @@ struct bitmap_operations { /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); -static inline bool md_bitmap_enabled(struct bitmap *bitmap) -{ - return bitmap && bitmap->storage.filemap && - !test_bit(BITMAP_STALE, &bitmap->flags); -} - #endif - #endif diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index e8207513eb1b..4378d3250bd7 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. */ - if (!md_bitmap_enabled(mddev->bitmap)) { + if (!mddev->bitmap_ops->enabled(mddev)) { raid1_submit_write(bio); return true; } From 59fdd43304f4cbb7ee8e281ca337afd803c309f6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:52 +0800 Subject: [PATCH 072/120] md/md-bitmap: make in memory structure internal Now that struct bitmap_page and bitmap are not used externally anymore, move them from md-bitmap.h to md-bitmap.c (except that dm-raid is still using the macro 'COUNTER_MAX'). Also fix some checkpatch warnings.
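The block comment being moved below documents the 16-bit per-chunk counter layout: a resync-needed bit, a resync-active bit, and a 14-bit pending-write count. A tiny standalone decoder makes the bit assignments concrete; the mask definitions mirror the ones kept in md-bitmap.h, and the sample value is arbitrary:

#include <stdio.h>

typedef unsigned short bitmap_counter_t;

#define COUNTER_BITS 16

#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)

#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)

int main(void)
{
        /* resync needed, resync not active, 5 pending writes */
        bitmap_counter_t c = NEEDED_MASK | 5;

        printf("needed=%d active=%d pending=%u\n",
               !!NEEDED(c), !!RESYNC(c), (unsigned int)COUNTER(c));
        return 0;
}

Running this prints "needed=1 active=0 pending=5", matching the field diagram in the comment; after this patch the accessor macros live next to the counters in md-bitmap.c, so nothing outside the file can poke at the encoding.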
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-43-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 247 ++++++++++++++++++++++++++++++++++++---- drivers/md/md-bitmap.h | 189 +----------------------------- drivers/md/md-cluster.c | 4 +- drivers/md/md.h | 2 +- drivers/md/raid1.c | 5 +- 5 files changed, 235 insertions(+), 212 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 098bffac2167..864418c8c028 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -32,6 +32,186 @@ #include "md.h" #include "md-bitmap.h" +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? 
*/ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SHIFT 9 + +/* + * bitmap structures: + */ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * If any counter in this page is '1' or '2' - and so could be + * cleared then that page is marked as 'pending' + */ + unsigned int pending:1; + /* + * count of dirty bits on the page + */ + unsigned int count:30; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + + struct bitmap_counts { + spinlock_t lock; + struct bitmap_page *bp; + /* total number of pages in the bitmap */ + unsigned long pages; + /* number of pages not yet allocated */ + unsigned long missing_pages; + /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunkshift; + /* total number of data chunks for the array */ + unsigned long chunks; + } counts; + + struct mddev *mddev; /* the md device that the bitmap is for */ + + __u64 events_cleared; + int need_sync; + + struct bitmap_storage { + /* backing disk file */ + struct file *file; + /* cached copy of the bitmap file superblock */ + struct page *sb_page; + unsigned long sb_index; + /* list of cache pages for the file */ + struct page **filemap; + /* attributes associated filemap pages */ + unsigned long *filemap_attr; + /* number of pages in the file */ + unsigned long file_pages; + /* total bytes in the bitmap */ + unsigned long bytes; + } storage; + + unsigned long flags; + + int allclean; + + atomic_t behind_writes; + /* highest actual value at runtime */ + unsigned long behind_writes_used; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + /* + * when we lasted called end_sync to update bitmap with resync + * progress. 
+ */ + unsigned long last_end_sync; + + /* pending writes to the bitmap file */ + atomic_t pending_writes; + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; + + struct kernfs_node *sysfs_can_clear; + /* slot offset for clustered env */ + int cluster_slot; +}; + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init); @@ -491,9 +671,10 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap) /* update the event counter and sync the superblock to disk */ -static void bitmap_update_sb(struct bitmap *bitmap) +static void bitmap_update_sb(void *data) { bitmap_super_t *sb; + struct bitmap *bitmap = data; if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; @@ -1844,10 +2025,11 @@ static void bitmap_flush(struct mddev *mddev) bitmap_update_sb(bitmap); } -static void md_bitmap_free(struct bitmap *bitmap) +static void md_bitmap_free(void *data) { unsigned long k, pages; struct bitmap_page *bp; + struct bitmap *bitmap = data; if (!bitmap) /* there was no bitmap */ return; @@ -2076,7 +2258,7 @@ out: } /* caller need to free returned bitmap with md_bitmap_free() */ -static struct bitmap *bitmap_get_from_slot(struct mddev *mddev, int slot) +static void *bitmap_get_from_slot(struct mddev *mddev, int slot) { int rv = 0; struct bitmap *bitmap; @@ -2143,15 +2325,18 @@ static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, return rv; } -static void bitmap_set_pages(struct bitmap *bitmap, unsigned long pages) +static void bitmap_set_pages(void *data, unsigned long pages) { + struct bitmap *bitmap = data; + bitmap->counts.pages = pages; } -static int bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) +static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats) { struct bitmap_storage *storage; struct bitmap_counts *counts; + struct bitmap *bitmap = data; bitmap_super_t *sb; if (!bitmap) @@ -2510,6 +2695,7 @@ space_show(struct mddev *mddev, char *page) static ssize_t space_store(struct mddev *mddev, const char *buf, size_t len) { + struct bitmap *bitmap; unsigned long sectors; int rv; @@ -2520,8 +2706,8 @@ space_store(struct mddev *mddev, const char *buf, size_t len) if (sectors == 0) return -EINVAL; - if (mddev->bitmap && - sectors < (mddev->bitmap->storage.bytes + 511) >> 9) + bitmap = mddev->bitmap; + if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9) return -EFBIG; /* Bitmap is too big for this small space */ /* could make sure it isn't too big, but that isn't really @@ -2698,10 +2884,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; + struct bitmap *bitmap; + spin_lock(&mddev->lock); - if (mddev->bitmap) - len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? - "false" : "true")); + bitmap = mddev->bitmap; + if (bitmap) + len = sprintf(page, "%s\n", (bitmap->need_sync ? 
"false" : + "true")); else len = sprintf(page, "\n"); spin_unlock(&mddev->lock); @@ -2710,17 +2899,24 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page) static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) { - if (mddev->bitmap == NULL) + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) return -ENOENT; - if (strncmp(buf, "false", 5) == 0) - mddev->bitmap->need_sync = 1; - else if (strncmp(buf, "true", 4) == 0) { + + if (strncmp(buf, "false", 5) == 0) { + bitmap->need_sync = 1; + return len; + } + + if (strncmp(buf, "true", 4) == 0) { if (mddev->degraded) return -EBUSY; - mddev->bitmap->need_sync = 0; - } else - return -EINVAL; - return len; + bitmap->need_sync = 0; + return len; + } + + return -EINVAL; } static struct md_sysfs_entry bitmap_can_clear = @@ -2730,21 +2926,26 @@ static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { ssize_t ret; + struct bitmap *bitmap; + spin_lock(&mddev->lock); - if (mddev->bitmap == NULL) + bitmap = mddev->bitmap; + if (!bitmap) ret = sprintf(page, "0\n"); else - ret = sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); + ret = sprintf(page, "%lu\n", bitmap->behind_writes_used); spin_unlock(&mddev->lock); + return ret; } static ssize_t behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) { - if (mddev->bitmap) - mddev->bitmap->behind_writes_used = 0; + struct bitmap *bitmap = mddev->bitmap; + + if (bitmap) + bitmap->behind_writes_used = 0; return len; } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index c720729687e2..662e6fc141a7 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -7,81 +7,7 @@ #ifndef BITMAP_H #define BITMAP_H 1 -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". - * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared as well, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. 
- * - * If the counter is 0, the on-disk bit is clear and the stripe is clean - * Anything that dirties the stripe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) +#define BITMAP_MAGIC 0x6d746962 typedef __u16 bitmap_counter_t; #define COUNTER_BITS 16 @@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t; #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) #define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SHIFT 9 - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { @@ -152,88 +58,6 @@ typedef struct bitmap_super_s { * devices. For raid10 it is the size of the array. 
*/ -#ifdef __KERNEL__ - -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * If any counter in this page is '1' or '2' - and so could be - * cleared then that page is marked as 'pending' - */ - unsigned int pending:1; - /* - * count of dirty bits on the page - */ - unsigned int count:30; -}; - -/* the main bitmap structure - one per mddev */ -struct bitmap { - - struct bitmap_counts { - spinlock_t lock; - struct bitmap_page *bp; - unsigned long pages; /* total number of pages - * in the bitmap */ - unsigned long missing_pages; /* number of pages - * not yet allocated */ - unsigned long chunkshift; /* chunksize = 2^chunkshift - * (for bitops) */ - unsigned long chunks; /* Total number of data - * chunks for the array */ - } counts; - - struct mddev *mddev; /* the md device that the bitmap is for */ - - __u64 events_cleared; - int need_sync; - - struct bitmap_storage { - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap - * file superblock */ - unsigned long sb_index; - struct page **filemap; /* list of cache pages for - * the file */ - unsigned long *filemap_attr; /* attributes associated - * w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file*/ - unsigned long bytes; /* total bytes in the bitmap */ - } storage; - - unsigned long flags; - - int allclean; - - atomic_t behind_writes; - unsigned long behind_writes_used; /* highest actual value at runtime */ - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - wait_queue_head_t behind_wait; - - struct kernfs_node *sysfs_can_clear; - int cluster_slot; /* Slot offset for clustered env */ -}; - struct md_bitmap_stats { u64 events_cleared; int behind_writes; @@ -272,21 +96,20 @@ struct bitmap_operations { void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force); void (*close_sync)(struct mddev *mddev); - void (*update_sb)(struct bitmap *bitmap); - int (*get_stats)(struct bitmap *bitmap, struct md_bitmap_stats *stats); + void (*update_sb)(void *data); + int (*get_stats)(void *data, struct md_bitmap_stats *stats); void (*sync_with_cluster)(struct mddev *mddev, sector_t old_lo, sector_t old_hi, sector_t new_lo, sector_t new_hi); - struct bitmap *(*get_from_slot)(struct mddev *mddev, int slot); + void *(*get_from_slot)(struct mddev *mddev, int slot); int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); - void (*set_pages)(struct bitmap *bitmap, unsigned long pages); - void (*free)(struct bitmap *bitmap); + void (*set_pages)(void *data, unsigned long pages); + void (*free)(void *data); }; /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); #endif -#endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 7647ce4f76fa..6595f89becdb 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1144,7 +1144,7 @@ static int 
update_bitmap_size(struct mddev *mddev, sector_t size)
 
 static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
 {
-	struct bitmap *bitmap = mddev->bitmap;
+	void *bitmap = mddev->bitmap;
 	struct md_bitmap_stats stats;
 	unsigned long my_pages;
 	int i, rv;
@@ -1218,9 +1218,9 @@ static int cluster_check_sync_size(struct mddev *mddev)
 {
 	int current_slot = md_cluster_ops->slot_number(mddev);
 	int node_num = mddev->bitmap_info.nodes;
-	struct bitmap *bitmap = mddev->bitmap;
 	struct dlm_lock_resource *bm_lockres;
 	struct md_bitmap_stats stats;
+	void *bitmap = mddev->bitmap;
 	unsigned long sync_size = 0;
 	unsigned long my_sync_size;
 	char str[64];
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e56193f71ab4..1c6a5f41adca 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -535,7 +535,7 @@ struct mddev {
 	struct percpu_ref writes_pending;
 	int sync_checkers;	/* # of threads checking writes_pending */
 
-	struct bitmap *bitmap; /* the bitmap for the device */
+	void *bitmap; /* the bitmap for the device */
 	struct bitmap_operations *bitmap_ops;
 	struct {
 		struct file *file; /* the bitmap file */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b15725a7a581..f55c8e67d059 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1412,7 +1412,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
 	int i, disks;
-	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	int first_clone;
@@ -1565,7 +1564,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * at a time and thus needs a new bio that can fit the whole payload
 	 * this bio in page sized chunks.
 	 */
-	if (write_behind && bitmap)
+	if (write_behind && mddev->bitmap)
 		max_sectors = min_t(int, max_sectors,
 				    BIO_MAX_VECS * (PAGE_SIZE >> 9));
 	if (max_sectors < bio_sectors(bio)) {
@@ -1601,7 +1600,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		 * Not if there are too many, or cannot
 		 * allocate memory, or a reader on WriteMostly
 		 * is waiting for behind writes to flush */
-		err = mddev->bitmap_ops->get_stats(bitmap, &stats);
+		err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
 		if (!err && write_behind && !stats.behind_wait &&
 		    stats.behind_writes < max_write_behind)
 			alloc_behind_master_bio(r1_bio, bio);

From b75197e86e6d3de4e611869ef30a27cf414a5f77 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Tue, 27 Aug 2024 19:06:16 +0800
Subject: [PATCH 073/120] md: Remove flush handling

For flush requests, md has special flush handling to merge concurrent
flush requests into a single one. However, the whole mechanism is based
on a disk-level spin_lock, 'mddev->lock', and fsync can be called quite
often in some workloads; as a consequence, taking this spin lock in the
IO fast path can cause performance degradation.

Fortunately, the block layer already has flush handling to merge
concurrent flush requests, and it only acquires a hctx-level spin lock
(see details in blk-flush.c).

This patch removes the flush handling in md and converts md to use the
general block layer flush handling in the underlying disks.

Flush test for 4 nvme raid10: start 128 threads that each call fsync
100000 times, on arm64, and see how long it takes.
Test script (includes and the THREADS/FSYNC_COUNT definitions added so
it compiles as-is; the values follow the test description above):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/time.h>

#define THREADS 128
#define FSYNC_COUNT 100000

void* thread_func(void* arg)
{
	int fd = *(int*)arg;

	for (int i = 0; i < FSYNC_COUNT; i++)
		fsync(fd);

	return NULL;
}

int main()
{
	int fd = open("/dev/md0", O_RDWR);

	if (fd < 0) {
		perror("open");
		exit(1);
	}

	pthread_t threads[THREADS];
	struct timeval start, end;

	gettimeofday(&start, NULL);

	for (int i = 0; i < THREADS; i++)
		pthread_create(&threads[i], NULL, thread_func, &fd);

	for (int i = 0; i < THREADS; i++)
		pthread_join(threads[i], NULL);

	gettimeofday(&end, NULL);
	close(fd);

	long long elapsed = (end.tv_sec - start.tv_sec) * 1000000LL +
			    (end.tv_usec - start.tv_usec);
	printf("Elapsed time: %lld microseconds\n", elapsed);

	return 0;
}

Test result: about 10 times faster:

Before this patch: 50943374 microseconds
After this patch:   5096347 microseconds

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240827110616.3860190-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu
---
 drivers/md/md.c | 146 +++++++-----------------------------------
 drivers/md/md.h |  10 ----
 2 files changed, 19 insertions(+), 137 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 23cc77d51676..ce5d8be13821 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -546,137 +546,30 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
 	return 0;
 }
 
-/*
- * Generic flush handling for md
- */
-
-static void md_end_flush(struct bio *bio)
-{
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
-
-	bio_put(bio);
-
-	rdev_dec_pending(rdev, mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct md_rdev *rdev;
-
-	mddev->start_flush = ktime_get_boottime();
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (rdev->raid_disk >= 0 &&
-		    !test_bit(Faulty, &rdev->flags)) {
-			struct bio *bi;
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-			bi = bio_alloc_bioset(rdev->bdev, 0,
-					      REQ_OP_WRITE | REQ_PREFLUSH,
-					      GFP_NOIO, &mddev->bio_set);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
-			atomic_inc(&mddev->flush_pending);
-			submit_bio(bi);
-			rcu_read_lock();
-		}
-	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
-
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	spin_lock_irq(&mddev->lock);
-	mddev->prev_flush_start = mddev->start_flush;
-	mddev->flush_bio = NULL;
-	spin_unlock_irq(&mddev->lock);
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0) {
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	} else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-
-		/*
-		 * make_requst() will never return error here, it only
-		 * returns error in raid5_make_request() by dm-raid.
- * Since dm always splits data and flush operation into - * two separate io, io size of flush submitted by dm - * always is 0, make_request() will not be called here. - */ - if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) - bio_io_error(bio); - } - - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); -} - -/* - * Manages consolidation of flushes and submitting any flushes needed for - * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is - * being finished in another context. Returns false if the flushing is - * complete but still needs the I/O portion of the bio to be processed. - */ bool md_flush_request(struct mddev *mddev, struct bio *bio) { - ktime_t req_start = ktime_get_boottime(); - spin_lock_irq(&mddev->lock); - /* flush requests wait until ongoing flush completes, - * hence coalescing all the pending requests. + struct md_rdev *rdev; + struct bio *new; + + /* + * md_flush_reqeust() should be called under md_handle_request() and + * 'active_io' is already grabbed. Hence it's safe to get rdev directly + * without rcu protection. */ - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio || - ktime_before(req_start, mddev->prev_flush_start), - mddev->lock); - /* new request after previous flush is completed */ - if (ktime_after(req_start, mddev->prev_flush_start)) { - WARN_ON(mddev->flush_bio); - /* - * Grab a reference to make sure mddev_suspend() will wait for - * this flush to be done. - * - * md_flush_reqeust() is called under md_handle_request() and - * 'active_io' is already grabbed, hence percpu_ref_is_zero() - * won't pass, percpu_ref_tryget_live() can't be used because - * percpu_ref_kill() can be called by mddev_suspend() - * concurrently. - */ - WARN_ON(percpu_ref_is_zero(&mddev->active_io)); - percpu_ref_get(&mddev->active_io); - mddev->flush_bio = bio; - spin_unlock_irq(&mddev->lock); - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); - return true; + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + new = bio_alloc_bioset(rdev->bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, + &mddev->bio_set); + bio_chain(new, bio); + submit_bio(new); } - /* flush was performed for some other bio while we waited. */ - spin_unlock_irq(&mddev->lock); - if (bio->bi_iter.bi_size == 0) { - /* pure flush without data - all done */ + if (bio_sectors(bio) == 0) { bio_endio(bio); return true; } @@ -763,7 +656,6 @@ int mddev_init(struct mddev *mddev) atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); - atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; diff --git a/drivers/md/md.h b/drivers/md/md.h index a0d6827dced9..bc684d503ab6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -571,16 +571,6 @@ struct mddev { */ struct bio_set io_clone_set; - /* Generic flush handling. - * The last to finish preflush schedules a worker to submit - * the rest of the request (without the REQ_PREFLUSH flag). - */ - struct bio *flush_bio; - atomic_t flush_pending; - ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed - * flush was started. 
-	 */
-	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	mempool_t *serial_info_pool;
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);

From f6f84be089c9d6f5e3e1228c389e51c7ae7bad1a Mon Sep 17 00:00:00 2001
From: Md Haris Iqbal
Date: Fri, 9 Aug 2024 15:53:46 +0200
Subject: [PATCH 074/120] block/rnbd-srv: Add sanity check and remove redundant
 assignment

The bio->bi_iter.bi_size is updated when bio_add_page() is called, so
we do not need to assign msg->bi_size to it again; that is redundant
and can also be harmful. Instead we can use it for a sanity check,
which compares the locally calculated bi_size with the one sent in msg.

Signed-off-by: Md Haris Iqbal
Signed-off-by: Jack Wang
Signed-off-by: Grzegorz Prajsner
Link: https://lore.kernel.org/r/20240809135346.978320-1-haris.iqbal@ionos.com
Signed-off-by: Jens Axboe
---
 drivers/block/rnbd/rnbd-srv.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index f6e3a3c4b76c..08ce6d96d04c 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -149,15 +149,22 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
 			rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
 	if (bio_add_page(bio, virt_to_page(data), datalen,
 			offset_in_page(data)) != datalen) {
-		rnbd_srv_err(sess_dev, "Failed to map data to bio\n");
+		rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
 		err = -EINVAL;
 		goto bio_put;
 	}
 
+	bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
+	if (bio_has_data(bio) &&
+	    bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) {
+		rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n",
+				bio->bi_iter.bi_size, msg->bi_size);
+		err = -EINVAL;
+		goto bio_put;
+	}
 	bio->bi_end_io = rnbd_dev_bi_end_io;
 	bio->bi_private = priv;
 	bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
-	bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size);
 	prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR ||
 	       usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio);
 	bio_set_prio(bio, prio);

From b35243a447b9fe6457fa8e1352152b818436ba5a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 26 Aug 2024 19:37:54 +0200
Subject: [PATCH 075/120] block: rework bio splitting

The current setup with bio_may_exceed_limits and __bio_split_to_limits
is a bit of a mess. Change it so that __bio_split_to_limits does all
the work and is just a variant of bio_split_to_limits that returns
nr_segs. This is done by inlining it and instead having the various
bio_split_* helpers directly submit the potentially split bios.

To support btrfs, the rw version has a lower-level helper split out
that just returns the offset to split at. This turns out to nicely
clean up the btrfs flow as well.
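As an illustration, a minimal sketch of the new calling convention (the
wrapper below is hypothetical and mirrors the btrfs caller; only
bio_split_rw_at() is from this patch):

	/*
	 * Hypothetical helper: clamp a mapping length to what the bio
	 * can carry under 'lim' without being split.  bio_split_rw_at()
	 * only computes the split point: it returns a negative errno if
	 * the bio can't be split, 0 if no split is needed, or the
	 * positive sector offset to split at.
	 */
	static u64 example_capped_length(struct bio *bio,
					 const struct queue_limits *lim,
					 u64 map_length)
	{
		unsigned int nr_segs;
		int split_sectors;

		split_sectors = bio_split_rw_at(bio, lim, &nr_segs, map_length);
		if (split_sectors > 0)
			/* only this many sectors fit the limits */
			return (u64)split_sectors << SECTOR_SHIFT;
		/* 0: the whole range fits (errors left to the caller) */
		return map_length;
	}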
Signed-off-by: Christoph Hellwig Acked-by: David Sterba Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 146 +++++++++++++++++--------------------------- block/blk-mq.c | 11 ++-- block/blk.h | 69 ++++++++++++++------- fs/btrfs/bio.c | 30 +++++---- include/linux/bio.h | 4 +- 5 files changed, 128 insertions(+), 132 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index de5281bcadc5..c7222c4685e0 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -105,9 +105,33 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } -static struct bio *bio_split_discard(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +static struct bio *bio_submit_split(struct bio *bio, int split_sectors) +{ + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } + + if (split_sectors) { + struct bio *split; + + split = bio_split(bio, split_sectors, GFP_NOIO, + &bio->bi_bdev->bd_disk->bio_split); + split->bi_opf |= REQ_NOMERGE; + blkcg_bio_issue_init(split); + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + return split; + } + + return bio; +} + +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -121,10 +145,10 @@ static struct bio *bio_split_discard(struct bio *bio, min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) - return NULL; + return bio; if (bio_sectors(bio) <= max_discard_sectors) - return NULL; + return bio; split_sectors = max_discard_sectors; @@ -139,19 +163,18 @@ static struct bio *bio_split_discard(struct bio *bio, if (split_sectors > tmp) split_sectors -= tmp; - return bio_split(bio, split_sectors, GFP_NOIO, bs); + return bio_submit_split(bio, split_sectors); } -static struct bio *bio_split_write_zeroes(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs) { *nsegs = 0; if (!lim->max_write_zeroes_sectors) - return NULL; + return bio; if (bio_sectors(bio) <= lim->max_write_zeroes_sectors) - return NULL; - return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); + return bio; + return bio_submit_split(bio, lim->max_write_zeroes_sectors); } static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, @@ -274,27 +297,19 @@ static bool bvec_split_segs(const struct queue_limits *lim, } /** - * bio_split_rw - split a bio in two bios + * bio_split_rw_at - check if and where to split a read/write bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors - * @bs: [in] bio set to allocate the clone from * @max_bytes: [in] maximum number of bytes per bio * - * Clone @bio, update the bi_iter of the clone to represent the first sectors - * of @bio and update @bio->bi_iter to represent the remaining sectors. 
The - * following is guaranteed for the cloned bio: - * - That it has at most @max_bytes worth of data - * - That it has at most queue_max_segments(@q) segments. - * - * Except for discard requests the cloned bio will point at the bi_io_vec of - * the original bio. It is the responsibility of the caller to ensure that the - * original bio is not freed before the cloned bio. The caller is also - * responsible for ensuring that @bs is only destroyed after processing of the - * split bio has finished. + * Find out if @bio needs to be split to fit the queue limits in @lim and a + * maximum size of @max_bytes. Returns a negative error number if @bio can't be + * split, 0 if the bio doesn't have to be split, or a positive sector offset if + * @bio needs to be split. */ -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes) +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; @@ -324,22 +339,17 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, } *segs = nsegs; - return NULL; + return 0; split: - if (bio->bi_opf & REQ_ATOMIC) { - bio->bi_status = BLK_STS_INVAL; - bio_endio(bio); - return ERR_PTR(-EINVAL); - } + if (bio->bi_opf & REQ_ATOMIC) + return -EINVAL; + /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer. */ - if (bio->bi_opf & REQ_NOWAIT) { - bio->bi_status = BLK_STS_AGAIN; - bio_endio(bio); - return ERR_PTR(-EAGAIN); - } + if (bio->bi_opf & REQ_NOWAIT) + return -EAGAIN; *segs = nsegs; @@ -356,58 +366,16 @@ split: * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); - return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); + return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw); +EXPORT_SYMBOL_GPL(bio_split_rw_at); -/** - * __bio_split_to_limits - split a bio to fit the queue limits - * @bio: bio to be split - * @lim: queue limits to split based on - * @nr_segs: returns the number of segments in the returned bio - * - * Check if @bio needs splitting based on the queue limits, and if so split off - * a bio fitting the limits from the beginning of @bio and return it. @bio is - * shortened to the remainder and re-submitted. - * - * The split bio is allocated from @q->bio_split, which is provided by the - * block layer. 
- */ -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs) +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs) { - struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; - struct bio *split; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - split = bio_split_discard(bio, lim, nr_segs, bs); - break; - case REQ_OP_WRITE_ZEROES: - split = bio_split_write_zeroes(bio, lim, nr_segs, bs); - break; - default: - split = bio_split_rw(bio, lim, nr_segs, bs, - get_max_io_size(bio, lim) << SECTOR_SHIFT); - if (IS_ERR(split)) - return NULL; - break; - } - - if (split) { - /* there isn't chance to merge the split bio */ - split->bi_opf |= REQ_NOMERGE; - - blkcg_bio_issue_init(split); - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; - } - return bio; + return bio_submit_split(bio, + bio_split_rw_at(bio, lim, nr_segs, + get_max_io_size(bio, lim) << SECTOR_SHIFT)); } /** @@ -426,9 +394,7 @@ struct bio *bio_split_to_limits(struct bio *bio) const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; - if (bio_may_exceed_limits(bio, lim)) - return __bio_split_to_limits(bio, lim, &nr_segs); - return bio; + return __bio_split_to_limits(bio, lim, &nr_segs); } EXPORT_SYMBOL(bio_split_to_limits); diff --git a/block/blk-mq.c b/block/blk-mq.c index e3c3c0c21b55..36abbaefe387 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2939,7 +2939,7 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - unsigned int nr_segs = 1; + unsigned int nr_segs; struct request *rq; blk_status_t ret; @@ -2981,11 +2981,10 @@ void blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto queue_exit; - } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + if (!bio_integrity_prep(bio)) goto queue_exit; diff --git a/block/blk.h b/block/blk.h index e180863f918b..0d8cd64c1260 100644 --- a/block/blk.h +++ b/block/blk.h @@ -331,33 +331,58 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -static inline bool bio_may_exceed_limits(struct bio *bio, - const struct queue_limits *lim) -{ - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - return true; /* non-trivial splitting decisions */ - default: - break; - } +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs); +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs); - /* - * All drivers must accept single-segments bios that are <= PAGE_SIZE. - * This is a quick and dirty check that relies on the fact that - * bi_io_vec[0] is always valid if a bio has data. The check might - * lead to occasional false negatives when bios are cloned, but compared - * to the performance impact of cloned bios themselves the loop below - * doesn't matter anyway. 
- */ +/* + * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. + * + * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is + * always valid if a bio has data. The check might lead to occasional false + * positives when bios are cloned, but compared to the performance impact of + * cloned bios themselves the loop below doesn't matter anyway. + */ +static inline bool bio_may_need_split(struct bio *bio, + const struct queue_limits *lim) +{ return lim->chunk_sectors || bio->bi_vcnt != 1 || bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs); +/** + * __bio_split_to_limits - split a bio to fit the queue limits + * @bio: bio to be split + * @lim: queue limits to split based on + * @nr_segs: returns the number of segments in the returned bio + * + * Check if @bio needs splitting based on the queue limits, and if so split off + * a bio fitting the limits from the beginning of @bio and return it. @bio is + * shortened to the remainder and re-submitted. + * + * The split bio is allocated from @q->bio_split, which is provided by the + * block layer. + */ +static inline struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, unsigned int *nr_segs) +{ + switch (bio_op(bio)) { + default: + if (bio_may_need_split(bio, lim)) + return bio_split_rw(bio, lim, nr_segs); + *nr_segs = 1; + return bio; + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + return bio_split_discard(bio, lim, nr_segs); + case REQ_OP_WRITE_ZEROES: + return bio_split_write_zeroes(bio, lim, nr_segs); + } +} + int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f04d93109960..5ae769184907 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -73,20 +73,13 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, - u64 map_length, bool use_append) + u64 map_length) { struct btrfs_bio *bbio; struct bio *bio; - if (use_append) { - unsigned int nr_segs; - - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, - &btrfs_clone_bioset, map_length); - } else { - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, - GFP_NOFS, &btrfs_clone_bioset); - } + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -664,6 +657,19 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } +static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) +{ + unsigned int nr_segs; + int sector_offset; + + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); + if (sector_offset) + return sector_offset << SECTOR_SHIFT; + return map_length; +} + static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; @@ -691,10 +697,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = min(map_length, length); if (use_append) - map_length = min(map_length, fs_info->max_zone_append_size); + map_length = 
btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + bbio = btrfs_split_bio(fs_info, bbio, map_length); bio = &bbio->bio; } diff --git a/include/linux/bio.h b/include/linux/bio.h index a46e2047bea4..faceadb040f9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,8 +324,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes); +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary From 379b122a3ec8033aa43cb70e8ecb6fb7f98aa68f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Aug 2024 19:37:55 +0200 Subject: [PATCH 076/120] block: constify the lim argument to queue_limits_max_zone_append_sectors queue_limits_max_zone_append_sectors doesn't change the lim argument, so mark it as const. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e85ec73a07d5..ec3ea5d1f99d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1187,7 +1187,8 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) +static inline unsigned int +queue_limits_max_zone_append_sectors(const struct queue_limits *l) { unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); From 1e8a7f6af926e266cc1d7ac49b56bd064057d625 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Aug 2024 19:37:56 +0200 Subject: [PATCH 077/120] block: properly handle REQ_OP_ZONE_APPEND in __bio_split_to_limits Currently REQ_OP_ZONE_APPEND is handled by the bio_split_rw case in __bio_split_to_limits. This is harmful because REQ_OP_ZONE_APPEND bios do not adhere to the soft max_limits value but instead use their own capped version of max_hw_sectors, leading to incorrect splits that later blow up in bio_split. We still need the bio_split_rw logic to count nr_segs for blk-mq code, so add a new wrapper that passes in the right limit, and turns any bio that would need a split into an error as an additional debugging aid. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 20 ++++++++++++++++++++ block/blk.h | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index c7222c4685e0..56769c4bcd79 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -378,6 +378,26 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, get_max_io_size(bio, lim) << SECTOR_SHIFT)); } +/* + * REQ_OP_ZONE_APPEND bios must never be split by the block layer. 
+ *
+ * But we want the nr_segs calculation provided by bio_split_rw_at, and having
+ * a good sanity check that the submitter built the bio correctly is nice to
+ * have as well.
+ */
+struct bio *bio_split_zone_append(struct bio *bio,
+		const struct queue_limits *lim, unsigned *nr_segs)
+{
+	unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
+	int split_sectors;
+
+	split_sectors = bio_split_rw_at(bio, lim, nr_segs,
+			max_sectors << SECTOR_SHIFT);
+	if (WARN_ON_ONCE(split_sectors > 0))
+		split_sectors = -EINVAL;
+	return bio_submit_split(bio, split_sectors);
+}
+
 /**
  * bio_split_to_limits - split a bio to fit the queue limits
  * @bio: bio to be split
diff --git a/block/blk.h b/block/blk.h
index 0d8cd64c1260..61c2afa67daa 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -337,6 +337,8 @@ struct bio *bio_split_write_zeroes(struct bio *bio,
 		const struct queue_limits *lim, unsigned *nsegs);
 struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 		unsigned *nr_segs);
+struct bio *bio_split_zone_append(struct bio *bio,
+		const struct queue_limits *lim, unsigned *nr_segs);
 
 /*
  * All drivers must accept single-segments bios that are smaller than PAGE_SIZE.
@@ -375,6 +377,8 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
 			return bio_split_rw(bio, lim, nr_segs);
 		*nr_segs = 1;
 		return bio;
+	case REQ_OP_ZONE_APPEND:
+		return bio_split_zone_append(bio, lim, nr_segs);
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 		return bio_split_discard(bio, lim, nr_segs);

From 1251580983f267e2e6b6505609a835119b68c513 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 26 Aug 2024 19:37:57 +0200
Subject: [PATCH 078/120] block: don't use bio_split_rw on misc operations

bio_split_rw is designed to split read and write bios with a payload.
Currently it is called by __bio_split_to_limits for all operations not
explicitly listed, which works because bio_may_need_split explicitly
checks for bi_vcnt == 1: a bio without a payload never takes the
single-segment bypass, and the bio_for_each_bvec loop never executes
its body when bi_size is 0. But all this is hard to understand, fragile
and wastes pointless cycles.

Switch __bio_split_to_limits to only call bio_split_rw for READ and
WRITE commands and don't attempt any kind of split for operations that
do not require splitting.
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk.h b/block/blk.h index 61c2afa67daa..32f4e9f630a3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -372,7 +372,8 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio, const struct queue_limits *lim, unsigned int *nr_segs) { switch (bio_op(bio)) { - default: + case REQ_OP_READ: + case REQ_OP_WRITE: if (bio_may_need_split(bio, lim)) return bio_split_rw(bio, lim, nr_segs); *nr_segs = 1; @@ -384,6 +385,10 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio, return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: return bio_split_write_zeroes(bio, lim, nr_segs); + default: + /* other operations can't be split */ + *nr_segs = 0; + return bio; } } From e6a03207b925aebe10d6aacd8e040ccdaa731bff Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 27 Aug 2024 17:35:34 +0200 Subject: [PATCH 079/120] md/raid5: use wait_on_bit() for R5_Overlap Convert uses of wait_for_overlap wait queue with R5_Overlap bit to wait_on_bit() / wake_up_bit(). Signed-off-by: Artur Paszkiewicz Link: https://lore.kernel.org/r/20240827153536.6743-2-artur.paszkiewicz@intel.com Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 6 +--- drivers/md/raid5.c | 60 +++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 23f2cbcf1a6c..b4f7b79fd187 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2798,7 +2798,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, { struct r5l_log *log = READ_ONCE(conf->log); int i; - int do_wakeup = 0; sector_t tree_index; void __rcu **pslot; uintptr_t refcount; @@ -2815,7 +2814,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, for (i = sh->disks; i--; ) { clear_bit(R5_InJournal, &sh->dev[i].flags); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&sh->dev[i].flags, R5_Overlap); } /* @@ -2828,9 +2827,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); - if (do_wakeup) - wake_up(&conf->wait_for_overlap); - spin_lock_irq(&log->stripe_in_journal_lock); list_del_init(&sh->r5c); spin_unlock_irq(&log->stripe_in_journal_lock); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c84a7e0263cd..de7d9959b3dc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2337,7 +2337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&sh->raid_conf->wait_for_overlap); + wake_up_bit(&dev->flags, R5_Overlap); } } local_unlock(&conf->percpu->lock); @@ -3473,7 +3473,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, * With PPL only writes to consecutive data chunks within a * stripe are allowed because for a single stripe_head we can * only have one PPL entry at a time, which describes one data - * range. Not really an overlap, but wait_for_overlap can be + * range. Not really an overlap, but R5_Overlap can be * used to handle this. 
*/ sector_t sector; @@ -3652,7 +3652,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, log_stripe_write_finished(sh); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[i].flags, R5_Overlap); while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { @@ -3697,7 +3697,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].toread = NULL; spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[i].flags, R5_Overlap); if (bi) s->to_read--; while (bi && bi->bi_iter.bi_sector < @@ -3736,7 +3736,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, BUG_ON(sh->batch_head); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. @@ -4877,7 +4877,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, { struct stripe_head *sh, *next; int i; - int do_wakeup = 0; list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { @@ -4913,7 +4912,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, spin_unlock_irq(&sh->stripe_lock); for (i = 0; i < sh->disks; i++) { if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&sh->dev[i].flags, R5_Overlap); sh->dev[i].flags = head_sh->dev[i].flags & (~((1 << R5_WriteError) | (1 << R5_Overlap))); } @@ -4927,12 +4926,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, spin_unlock_irq(&head_sh->stripe_lock); for (i = 0; i < head_sh->disks; i++) if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) - do_wakeup = 1; + wake_up_bit(&head_sh->dev[i].flags, R5_Overlap); if (head_sh->state & handle_flags) set_bit(STRIPE_HANDLE, &head_sh->state); - - if (do_wakeup) - wake_up(&head_sh->raid_conf->wait_for_overlap); } static void handle_stripe(struct stripe_head *sh) @@ -5198,7 +5194,7 @@ static void handle_stripe(struct stripe_head *sh) md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) - wake_up(&conf->wait_for_overlap); + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); } /* If the failed drives are just a ReadError, then we might need @@ -5755,12 +5751,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) int d; again: sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); if (test_bit(STRIPE_SYNCING, &sh->state)) { raid5_release_stripe(sh); - schedule(); + wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap, + TASK_UNINTERRUPTIBLE); goto again; } clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); @@ -5772,12 +5767,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) set_bit(R5_Overlap, &sh->dev[d].flags); spin_unlock_irq(&sh->stripe_lock); raid5_release_stripe(sh); - schedule(); + wait_on_bit(&sh->dev[d].flags, R5_Overlap, + TASK_UNINTERRUPTIBLE); goto again; } } set_bit(STRIPE_DISCARD, &sh->state); - finish_wait(&conf->wait_for_overlap, &w); sh->overwrite_disks = 0; for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == 
sh->qd_idx) @@ -5854,7 +5849,6 @@ static int add_all_stripe_bios(struct r5conf *conf, struct bio *bi, int forwrite, int previous) { int dd_idx; - int ret = 1; spin_lock_irq(&sh->stripe_lock); @@ -5870,14 +5864,19 @@ static int add_all_stripe_bios(struct r5conf *conf, if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { set_bit(R5_Overlap, &dev->flags); - ret = 0; - continue; + spin_unlock_irq(&sh->stripe_lock); + raid5_release_stripe(sh); + /* release batch_last before wait to avoid risk of deadlock */ + if (ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + md_wakeup_thread(conf->mddev->thread); + wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE); + return 0; } } - if (!ret) - goto out; - for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { struct r5dev *dev = &sh->dev[dd_idx]; @@ -5893,9 +5892,8 @@ static int add_all_stripe_bios(struct r5conf *conf, RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); } -out: spin_unlock_irq(&sh->stripe_lock); - return ret; + return 1; } enum reshape_loc { @@ -5991,17 +5989,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, goto out_release; } - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { - /* - * Stripe is busy expanding or add failed due to - * overlap. Flush everything and wait a while. - */ + if (test_bit(STRIPE_EXPANDING, &sh->state)) { md_wakeup_thread(mddev->thread); ret = STRIPE_SCHEDULE_AND_RETRY; goto out_release; } + if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { + ret = STRIPE_RETRY; + goto out; + } + if (stripe_can_batch(sh)) { stripe_add_to_batch_list(conf, sh, ctx->batch_last); if (ctx->batch_last) From 0e4aac7366666e1377ce7669b7f63b94c1d616e6 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 27 Aug 2024 17:35:35 +0200 Subject: [PATCH 080/120] md/raid5: only add to wq if reshape is in progress Now that actual overlaps are not handled on the wait_for_overlap wq anymore, the remaining cases when we wait on this wq are limited to reshape. If reshape is not in progress, don't add to the wq in raid5_make_request() because add_wait_queue() / remove_wait_queue() operations take a spinlock and cause noticeable contention when multiple threads are submitting requests to the mddev. Signed-off-by: Artur Paszkiewicz Link: https://lore.kernel.org/r/20240827153536.6743-3-artur.paszkiewicz@intel.com Signed-off-by: Song Liu --- drivers/md/raid5.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index de7d9959b3dc..e1ddfb6d8b37 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6070,6 +6070,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { DEFINE_WAIT_FUNC(wait, woken_wake_function); + bool on_wq; struct r5conf *conf = mddev->private; sector_t logical_sector; struct stripe_request_ctx ctx = {}; @@ -6143,11 +6144,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * sequential IO pattern. We don't bother with the optimization when * reshaping as the performance benefit is not worth the complexity. 
*/ - if (likely(conf->reshape_progress == MaxSector)) + if (likely(conf->reshape_progress == MaxSector)) { logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); + on_wq = false; + } else { + add_wait_queue(&conf->wait_for_overlap, &wait); + on_wq = true; + } s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); - add_wait_queue(&conf->wait_for_overlap, &wait); while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); @@ -6158,6 +6163,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; if (res == STRIPE_SCHEDULE_AND_RETRY) { + WARN_ON_ONCE(!on_wq); /* * Must release the reference to batch_last before * scheduling and waiting for work to be done, @@ -6182,7 +6188,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = ctx.first_sector + (s << RAID5_STRIPE_SHIFT(conf)); } - remove_wait_queue(&conf->wait_for_overlap, &wait); + if (unlikely(on_wq)) + remove_wait_queue(&conf->wait_for_overlap, &wait); if (ctx.batch_last) raid5_release_stripe(ctx.batch_last); From 6f039cc42f21985ef0c31eb315a84f2364bcb396 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 27 Aug 2024 17:35:36 +0200 Subject: [PATCH 081/120] md/raid5: rename wait_for_overlap to wait_for_reshape The only remaining uses of wait_for_overlap are related to reshape so rename it accordingly. Signed-off-by: Artur Paszkiewicz Link: https://lore.kernel.org/r/20240827153536.6743-4-artur.paszkiewicz@intel.com Signed-off-by: Song Liu --- drivers/md/raid5.c | 26 +++++++++++++------------- drivers/md/raid5.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e1ddfb6d8b37..dc2ea636d173 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5257,7 +5257,7 @@ static void handle_stripe(struct stripe_head *sh) } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); } @@ -6148,7 +6148,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); on_wq = false; } else { - add_wait_queue(&conf->wait_for_overlap, &wait); + add_wait_queue(&conf->wait_for_reshape, &wait); on_wq = true; } s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); @@ -6189,7 +6189,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) (s << RAID5_STRIPE_SHIFT(conf)); } if (unlikely(on_wq)) - remove_wait_queue(&conf->wait_for_overlap, &wait); + remove_wait_queue(&conf->wait_for_reshape, &wait); if (ctx.batch_last) raid5_release_stripe(ctx.batch_last); @@ -6342,7 +6342,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk : (safepos < writepos && readpos > writepos)) || time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... 
*/ - wait_event(conf->wait_for_overlap, + wait_event(conf->wait_for_reshape, atomic_read(&conf->reshape_stripes)==0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (atomic_read(&conf->reshape_stripes) != 0) @@ -6368,7 +6368,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); sysfs_notify_dirent_safe(mddev->sysfs_completed); } @@ -6451,7 +6451,7 @@ finish: (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, + wait_event(conf->wait_for_reshape, atomic_read(&conf->reshape_stripes) == 0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (atomic_read(&conf->reshape_stripes) != 0) @@ -6477,7 +6477,7 @@ finish: spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: @@ -6512,7 +6512,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } /* Allow raid5_quiesce to complete */ - wait_event(conf->wait_for_overlap, conf->quiesce != 2); + wait_event(conf->wait_for_reshape, conf->quiesce != 2); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); @@ -7498,7 +7498,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); + init_waitqueue_head(&conf->wait_for_reshape); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->loprio_list); INIT_LIST_HEAD(&conf->hold_list); @@ -8557,7 +8557,7 @@ static void end_reshape(struct r5conf *conf) !test_bit(In_sync, &rdev->flags)) rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); mddev_update_io_opt(conf->mddev, conf->raid_disks - conf->max_degraded); @@ -8621,13 +8621,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce) conf->quiesce = 1; unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); } else { /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); unlock_all_device_hash_locks_irq(conf); } log_quiesce(conf, quiesce); @@ -8946,7 +8946,7 @@ static void raid5_prepare_suspend(struct mddev *mddev) { struct r5conf *conf = mddev->private; - wake_up(&conf->wait_for_overlap); + wake_up(&conf->wait_for_reshape); } static struct md_personality raid6_personality = diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9b5a7dc3f2a0..896ecfc4afa6 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -668,7 +668,7 @@ struct r5conf { struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; + wait_queue_head_t wait_for_reshape; unsigned long cache_state; struct shrinker *shrinker; int pool_size; /* number of disks in stripeheads in pool */ From 
7c2fd76048e95dd267055b5f5e0a48e6e7c81fd9 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan
Date: Thu, 29 Aug 2024 13:32:17 +0000
Subject: [PATCH 082/120] nvme: fix metadata handling in nvme-passthrough

On an NVMe namespace that does not support metadata, it is possible to
send an IO command with metadata through io-passthru. This allows
issues like [1] to trigger in the completion code path.
nvme_map_user_request() doesn't check if the namespace supports
metadata before sending it forward. It also allows admin commands with
metadata to be processed as it ignores metadata when bdev == NULL and
may report success.

Reject an IO command with metadata when the NVMe namespace doesn't
support it, and reject an admin command if it has metadata.

[1] https://lore.kernel.org/all/mb61pcylvnym8.fsf@amazon.com/

Suggested-by: Christoph Hellwig
Signed-off-by: Puranjay Mohan
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Reviewed-by: Anuj Gupta
Signed-off-by: Keith Busch
---
 drivers/nvme/host/ioctl.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 850f81e08e7d..1d769c842fbf 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2017-2021 Christoph Hellwig.
  */
 #include
+#include
 #include  /* for force_successful_syscall_return */
 #include
 #include
@@ -119,9 +120,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 	struct request_queue *q = req->q;
 	struct nvme_ns *ns = q->queuedata;
 	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
+	bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
+	bool has_metadata = meta_buffer && meta_len;
 	struct bio *bio = NULL;
 	int ret;
 
+	if (has_metadata && !supports_metadata)
+		return -EINVAL;
+
 	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
 		struct iov_iter iter;
 
@@ -143,15 +149,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		goto out;
 
 	bio = req->bio;
-	if (bdev) {
+	if (bdev)
 		bio_set_dev(bio, bdev);
-		if (meta_buffer && meta_len) {
-			ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
-						     meta_seed);
-			if (ret)
-				goto out_unmap;
-			req->cmd_flags |= REQ_INTEGRITY;
-		}
+
+	if (has_metadata) {
+		ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
+					     meta_seed);
+		if (ret)
+			goto out_unmap;
+		req->cmd_flags |= REQ_INTEGRITY;
 	}
 
 	return ret;

From c9ea57c91f03bcad415e1a20113bdb2077bcf990 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 30 Aug 2024 11:41:45 +0800
Subject: [PATCH 083/120] nbd: fix race between timeout and normal completion

If a request timeout is handled by nbd_requeue_cmd(), normal completion
has to be stopped to avoid completing the requeued request; otherwise a
use-after-free can be triggered.

Fix the race by clearing NBD_CMD_INFLIGHT in nbd_requeue_cmd(), and
make sure that cmd->lock is held both for clearing the flag and for the
requeue.
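For illustration, a rough sketch of the racy interleaving before this
fix (recv-path naming approximate):

	timeout handling			normal completion (recv path)

	nbd_xmit_timeout()
	  mutex_lock(&cmd->lock)
	  ...
	  mutex_unlock(&cmd->lock)
						take cmd->lock, see
						NBD_CMD_INFLIGHT still set,
	  nbd_requeue_cmd(cmd)			complete the request
	    blk_mq_requeue_request(req)

The request can end up both completed and requeued; once one path frees
or reuses it while the other still touches it, a use-after-free
triggers. Clearing NBD_CMD_INFLIGHT under cmd->lock before the requeue
closes this window.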
Cc: Josef Bacik Cc: Yu Kuai Fixes: 2895f1831e91 ("nbd: don't clear 'NBD_CMD_INFLIGHT' flag if request is not completed") Signed-off-by: Ming Lei Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240830034145.1827742-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 5b1811b1ba5f..4d06472bf112 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -181,6 +181,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd) { struct request *req = blk_mq_rq_from_pdu(cmd); + lockdep_assert_held(&cmd->lock); + + /* + * Clear INFLIGHT flag so that this cmd won't be completed in + * normal completion path + * + * INFLIGHT flag will be set when the cmd is queued to nbd next + * time. + */ + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) blk_mq_requeue_request(req, true); } @@ -491,8 +502,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); } - mutex_unlock(&cmd->lock); nbd_requeue_cmd(cmd); + mutex_unlock(&cmd->lock); nbd_config_put(nbd); return BLK_EH_DONE; } From 18ad4df091dd5d067d2faa8fce1180b79f7041a7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Sep 2024 21:03:26 +0800 Subject: [PATCH 084/120] block, bfq: fix possible UAF for bfqq->bic with merge chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) initial state, three tasks: Process 1 Process 2 Process 3 (BIC1) (BIC2) (BIC3) | Λ | Λ | Λ | | | | | | V | V | V | bfqq1 bfqq2 bfqq3 process ref: 1 1 1 2) bfqq1 merged to bfqq2: Process 1 Process 2 Process 3 (BIC1) (BIC2) (BIC3) | | | Λ \--------------\| | | V V | bfqq1--------->bfqq2 bfqq3 process ref: 0 2 1 3) bfqq2 merged to bfqq3: Process 1 Process 2 Process 3 (BIC1) (BIC2) (BIC3) here -> Λ | | \--------------\ \-------------\| V V bfqq1--------->bfqq2---------->bfqq3 process ref: 0 1 3 In this case, IO from Process 1 will get bfqq2 from BIC1 first, then get bfqq3 through the merge chain, and finally handle IO by bfqq3. However, the current code will think bfqq2 is owned by BIC1, as in the initial state, and set bfqq2->bic to BIC1. bfq_insert_request -> by Process 1 bfqq = bfq_init_rq(rq) bfqq = bfq_get_bfqq_handle_split bfqq = bic_to_bfqq -> get bfqq2 from BIC1 bfqq->ref++ rq->elv.priv[0] = bic rq->elv.priv[1] = bfqq if (bfqq_process_refs(bfqq) == 1) bfqq->bic = bic -> record BIC1 to bfqq2 __bfq_insert_request new_bfqq = bfq_setup_cooperator -> get bfqq3 from bfqq2->new_bfqq bfqq_request_freed(bfqq) new_bfqq->ref++ rq->elv.priv[1] = new_bfqq -> handle IO by bfqq3 Fix the problem by first checking whether bfqq is from the merge chain.
And this might fix a following problem reported by our syzkaller(unreproducible): ================================================================== BUG: KASAN: slab-use-after-free in bfq_do_early_stable_merge block/bfq-iosched.c:5692 [inline] BUG: KASAN: slab-use-after-free in bfq_do_or_sched_stable_merge block/bfq-iosched.c:5805 [inline] BUG: KASAN: slab-use-after-free in bfq_get_queue+0x25b0/0x2610 block/bfq-iosched.c:5889 Write of size 1 at addr ffff888123839eb8 by task kworker/0:1H/18595 CPU: 0 PID: 18595 Comm: kworker/0:1H Tainted: G L 6.6.0-07439-gba2303cacfda #6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Workqueue: kblockd blk_mq_requeue_work Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0x10d/0x610 mm/kasan/report.c:475 kasan_report+0x8e/0xc0 mm/kasan/report.c:588 bfq_do_early_stable_merge block/bfq-iosched.c:5692 [inline] bfq_do_or_sched_stable_merge block/bfq-iosched.c:5805 [inline] bfq_get_queue+0x25b0/0x2610 block/bfq-iosched.c:5889 bfq_get_bfqq_handle_split+0x169/0x5d0 block/bfq-iosched.c:6757 bfq_init_rq block/bfq-iosched.c:6876 [inline] bfq_insert_request block/bfq-iosched.c:6254 [inline] bfq_insert_requests+0x1112/0x5cf0 block/bfq-iosched.c:6304 blk_mq_insert_request+0x290/0x8d0 block/blk-mq.c:2593 blk_mq_requeue_work+0x6bc/0xa70 block/blk-mq.c:1502 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x432/0x13f0 kernel/workqueue.c:2700 worker_thread+0x6f2/0x1160 kernel/workqueue.c:2781 kthread+0x33c/0x440 kernel/kthread.c:388 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:305 Allocated by task 20776: kasan_save_stack+0x20/0x40 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 __kasan_slab_alloc+0x87/0x90 mm/kasan/common.c:328 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3458 [inline] kmem_cache_alloc_node+0x1a4/0x6f0 mm/slub.c:3503 ioc_create_icq block/blk-ioc.c:370 [inline] ioc_find_get_icq+0x180/0xaa0 block/blk-ioc.c:436 bfq_prepare_request+0x39/0xf0 block/bfq-iosched.c:6812 blk_mq_rq_ctx_init.isra.7+0x6ac/0xa00 block/blk-mq.c:403 __blk_mq_alloc_requests+0xcc0/0x1070 block/blk-mq.c:517 blk_mq_get_new_requests block/blk-mq.c:2940 [inline] blk_mq_submit_bio+0x624/0x27c0 block/blk-mq.c:3042 __submit_bio+0x331/0x6f0 block/blk-core.c:624 __submit_bio_noacct_mq block/blk-core.c:703 [inline] submit_bio_noacct_nocheck+0x816/0xb40 block/blk-core.c:732 submit_bio_noacct+0x7a6/0x1b50 block/blk-core.c:826 xlog_write_iclog+0x7d5/0xa00 fs/xfs/xfs_log.c:1958 xlog_state_release_iclog+0x3b8/0x720 fs/xfs/xfs_log.c:619 xlog_cil_push_work+0x19c5/0x2270 fs/xfs/xfs_log_cil.c:1330 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x432/0x13f0 kernel/workqueue.c:2700 worker_thread+0x6f2/0x1160 kernel/workqueue.c:2781 kthread+0x33c/0x440 kernel/kthread.c:388 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:305 Freed by task 946: kasan_save_stack+0x20/0x40 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x50 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] __kasan_slab_free+0x12c/0x1c0 mm/kasan/common.c:244 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook 
mm/slub.c:1815 [inline] slab_free_freelist_hook mm/slub.c:1841 [inline] slab_free mm/slub.c:3786 [inline] kmem_cache_free+0x118/0x6f0 mm/slub.c:3808 rcu_do_batch+0x35c/0xe30 kernel/rcu/tree.c:2189 rcu_core+0x819/0xd90 kernel/rcu/tree.c:2462 __do_softirq+0x1b0/0x7a2 kernel/softirq.c:553 Last potentially related work creation: kasan_save_stack+0x20/0x40 mm/kasan/common.c:45 __kasan_record_aux_stack+0xaf/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2712 [inline] call_rcu+0xce/0x1020 kernel/rcu/tree.c:2826 ioc_destroy_icq+0x54c/0x830 block/blk-ioc.c:105 ioc_release_fn+0xf0/0x360 block/blk-ioc.c:124 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x432/0x13f0 kernel/workqueue.c:2700 worker_thread+0x6f2/0x1160 kernel/workqueue.c:2781 kthread+0x33c/0x440 kernel/kthread.c:388 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:305 Second to last potentially related work creation: kasan_save_stack+0x20/0x40 mm/kasan/common.c:45 __kasan_record_aux_stack+0xaf/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2712 [inline] call_rcu+0xce/0x1020 kernel/rcu/tree.c:2826 ioc_destroy_icq+0x54c/0x830 block/blk-ioc.c:105 ioc_release_fn+0xf0/0x360 block/blk-ioc.c:124 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x432/0x13f0 kernel/workqueue.c:2700 worker_thread+0x6f2/0x1160 kernel/workqueue.c:2781 kthread+0x33c/0x440 kernel/kthread.c:388 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:305 The buggy address belongs to the object at ffff888123839d68 which belongs to the cache bfq_io_cq of size 1360 The buggy address is located 336 bytes inside of freed 1360-byte region [ffff888123839d68, ffff88812383a2b8) The buggy address belongs to the physical page: page:ffffea00048e0e00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88812383f588 pfn:0x123838 head:ffffea00048e0e00 order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x17ffffc0000a40(workingset|slab|head|node=0|zone=2|lastcpupid=0x1fffff) page_type: 0xffffffff() raw: 0017ffffc0000a40 ffff88810588c200 ffffea00048ffa10 ffff888105889488 raw: ffff88812383f588 0000000000150006 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888123839d80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888123839e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888123839e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888123839f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888123839f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Fixes: 36eca8948323 ("block, bfq: add Early Queue Merge (EQM)") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240902130329.3787024-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 36a4998c4b37..83adac3e71db 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6934,7 +6934,8 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) * addition, if the queue has also just been split, we have to * resume its state. 
*/ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq && + bfqq_process_refs(bfqq) == 1) { bfqq->bic = bic; if (split) { /* From 0e456dba86c7f9a19792204a044835f1ca2c8dbb Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Sep 2024 21:03:27 +0800 Subject: [PATCH 085/120] block, bfq: choose the last bfqq from merge chain in bfq_setup_cooperator() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consider the following merge chain: Process 1 Process 2 Process 3 Process 4 (BIC1) (BIC2) (BIC3) (BIC4) Λ | | | \--------------\ \-------------\ \-------------\| V V V bfqq1--------->bfqq2---------->bfqq3----------->bfqq4 IO from Process 1 will get bfqq2 from BIC1 first, then bfq_setup_cooperator() will find that bfqq2 is already merged to bfqq3 and will handle this IO from bfqq3. However, the merge chain can be much deeper and bfqq3 can be merged to another bfqq as well. Fix this problem by iterating to the last bfqq in bfq_setup_cooperator(). Fixes: 36eca8948323 ("block, bfq: add Early Queue Merge (EQM)") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240902130329.3787024-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 83adac3e71db..ffaa0d56328a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2911,8 +2911,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* if a merge has already been setup, then proceed with that first */ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; + new_bfqq = bfqq->new_bfqq; + if (new_bfqq) { + while (new_bfqq->new_bfqq) + new_bfqq = new_bfqq->new_bfqq; + return new_bfqq; + } /* * Check delayed stable merge for rotational or non-queueing From 42c306ed723321af4003b2a41bb73728cab54f85 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Sep 2024 21:03:28 +0800 Subject: [PATCH 086/120] block, bfq: don't break merge chain in bfq_split_bfqq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consider the following scenario: Process 1 Process 2 Process 3 Process 4 (BIC1) (BIC2) (BIC3) (BIC4) Λ | | | \-------------\ \-------------\ \--------------\| V V V bfqq1--------->bfqq2---------->bfqq3----------->bfqq4 ref 0 1 2 4 If Process 1 issues a new IO and bfqq2 is found, bfq_init_rq() may then decide to split bfqq2 by bfq_split_bfqq(). However, the process reference of bfqq2 is 1 and bfq_split_bfqq() just clears the coop flag, which will break the merge chain. Expected result: caller will allocate a new bfqq for BIC1 Process 1 Process 2 Process 3 Process 4 (BIC1) (BIC2) (BIC3) (BIC4) | | | \-------------\ \--------------\| V V bfqq1--------->bfqq2---------->bfqq3----------->bfqq4 ref 0 0 1 3 The condition should only apply to the last bfqq4, when the previous bfqq2 and bfqq3 have already been split. Fix the problem by also checking whether bfqq is the last one in the merge chain.
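Taken together, these two fixes enforce two invariants that can be sketched in isolation: lookups must follow new_bfqq to the end of the chain, and only the final queue with a single process reference may be split. The types below are simplified stand-ins, not the bfq structures.

struct queue {
	struct queue *new_bfqq;		/* next queue in the merge chain */
	int process_refs;
};

/* follow the chain to its last member, as bfq_setup_cooperator() now does */
static struct queue *chain_last(struct queue *q)
{
	while (q->new_bfqq)
		q = q->new_bfqq;
	return q;
}

/* splitting is only safe for the chain's last queue with a single holder */
static int can_split(struct queue *q)
{
	return !q->new_bfqq && q->process_refs == 1;
}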
Fixes: 36eca8948323 ("block, bfq: add Early Queue Merge (EQM)") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240902130329.3787024-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ffaa0d56328a..ca766b7d5560 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6727,7 +6727,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - if (bfqq_process_refs(bfqq) == 1) { + if (bfqq_process_refs(bfqq) == 1 && !bfqq->new_bfqq) { bfqq->pid = current->pid; bfq_clear_bfqq_coop(bfqq); bfq_clear_bfqq_split_coop(bfqq); From f45916ae60eb60e7c9c3ac60cf07e66fe1a7faad Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Sep 2024 21:03:29 +0800 Subject: [PATCH 087/120] block, bfq: use bfq_reassign_last_bfqq() in bfq_bfqq_move() Instead of open coding it, there are no functional changes. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240902130329.3787024-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 7 +------ block/bfq-iosched.c | 4 ++-- block/bfq-iosched.h | 2 ++ 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b758693697c0..9fb9f3533150 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -679,12 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(old_parent); - if (entity->parent && - entity->parent->last_bfqq_created == bfqq) - entity->parent->last_bfqq_created = NULL; - else if (bfqd->last_bfqq_created == bfqq) - bfqd->last_bfqq_created = NULL; - + bfq_reassign_last_bfqq(bfqq, NULL); entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ca766b7d5560..d1bf2b8a3576 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3097,8 +3097,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } -static void -bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) +void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, + struct bfq_queue *new_bfqq) { if (cur_bfqq->entity.parent && cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 08ddf2cfae5b..e16d96e2367b 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1156,6 +1156,8 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); void bfq_add_bfqq_busy(struct bfq_queue *bfqq); void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); +void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, + struct bfq_queue *new_bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ From 761e5afb6ddbfd4c0523ec294b7d24f3652912c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Sep 2024 09:52:43 -0600 Subject: [PATCH 088/120] MAINTAINERS: move the BFQ io scheduler to orphan state Nobody is maintaining this code, and it just falls under the umbrella of block layer code. But at least mark it as such, in case anyone wants to care more deeply about it and assume the responsibility of doing so. 
Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- MAINTAINERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde38320..4a857a125d6e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3781,10 +3781,8 @@ F: Documentation/filesystems/befs.rst F: fs/befs/ BFQ I/O SCHEDULER -M: Paolo Valente -M: Jens Axboe L: linux-block@vger.kernel.org -S: Maintained +S: Orphan F: Documentation/block/bfq-iosched.rst F: block/bfq-* From 2be6190cd75cd2029ced5ccef057e15939f48c4a Mon Sep 17 00:00:00 2001 From: Alvaro Parker Date: Tue, 3 Sep 2024 13:22:14 -0400 Subject: [PATCH 089/120] block: fix comment to use set_current_state The explanatory comment used `set_task_state` instead of `set_current_state` which is the function actually used in the code. Signed-off-by: Alvaro Parker Link: https://lore.kernel.org/r/20240903172214.520086-1-alparkerdf@gmail.com Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index dd7310c94713..2cfb297d9a62 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -263,7 +263,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { - /* The memory barrier in set_task_state saves us here. */ + /* The memory barrier in set_current_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { From 697ba0b6ec4ae04afb67d3911799b5e2043b4455 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 3 Sep 2024 22:48:19 +0300 Subject: [PATCH 090/120] block: fix integer overflow in BLKSECDISCARD I independently rediscovered commit 22d24a544b0d49bbcbd61c8c0eaf77d3c9297155 block: fix overflow in blk_ioctl_discard() but for secure erase. 
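The arithmetic can be demonstrated in isolation. This userspace sketch uses a made-up device size and GCC's __builtin_add_overflow() in place of the kernel's check_add_overflow(); the length value matches the reproducer quoted next.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t start = 512;
	uint64_t len = 18446744073709551104ULL;	/* 2^64 - 512 */
	uint64_t dev_bytes = 1ULL << 30;	/* hypothetical 1 GiB device */
	uint64_t end;

	/* unchecked: start + len wraps to 0 and passes the size check */
	end = start + len;
	printf("wrapped end = %llu, which passes end <= dev_bytes\n",
	       (unsigned long long)end);

	/* checked: reject on overflow, mirroring check_add_overflow() */
	if (__builtin_add_overflow(start, len, &end) || end > dev_bytes)
		printf("rejected with -EINVAL\n");
	return 0;
}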
Same problem: uint64_t r[2] = {512, 18446744073709551104ULL}; ioctl(fd, BLKSECDISCARD, r); will enter a near-infinite loop inside blkdev_issue_secure_erase(): a.out: attempt to access beyond end of device loop0: rw=5, sector=3399043073, nr_sectors = 1024 limit=2048 bio_check_eod: 3286214 callbacks suppressed Signed-off-by: Alexey Dobriyan Link: https://lore.kernel.org/r/9e64057f-650a-46d1-b9f7-34af391536ef@p183 Signed-off-by: Jens Axboe --- block/ioctl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index e8e4a4190f18..44257bdfeacb 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, return -EINVAL; filemap_invalidate_lock(bdev->bd_mapping); - err = truncate_bdev_range(bdev, mode, start, start + len - 1); + err = truncate_bdev_range(bdev, mode, start, end - 1); if (err) goto fail; @@ -163,7 +163,7 @@ fail: static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { - uint64_t start, len; + uint64_t start, len, end; uint64_t range[2]; int err; @@ -178,11 +178,12 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) + if (check_add_overflow(start, len, &end) || + end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(bdev->bd_mapping); - err = truncate_bdev_range(bdev, mode, start, start + len - 1); + err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); From 2d2b3bc145b9d5b5c6f07d22291723ddb024ca76 Mon Sep 17 00:00:00 2001 From: Mateusz Kusiak Date: Tue, 3 Sep 2024 16:29:49 +0200 Subject: [PATCH 091/120] md: Report failed arrays as broken in mdstat Depending on whether the array has a personality, it is reported as either active or inactive. This patch adds a third status, "broken", for arrays with a personality that have become inoperative. The reason is that end users tend to assume "active" indicates the array is operational. Add the "broken" state for inoperative arrays with a personality and refactor the code. Signed-off-by: Mateusz Kusiak Link: https://lore.kernel.org/r/20240903142949.53628-1-mateusz.kusiak@intel.com Signed-off-by: Song Liu --- drivers/md/md.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 414146111425..b669971d4782 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8327,14 +8327,19 @@ static int md_seq_show(struct seq_file *seq, void *v) spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { - seq_printf(seq, "%s : %sactive", mdname(mddev), - mddev->pers ?
"" : "in"); + seq_printf(seq, "%s : ", mdname(mddev)); if (mddev->pers) { + if (test_bit(MD_BROKEN, &mddev->flags)) + seq_printf(seq, "broken"); + else + seq_printf(seq, "active"); if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); + } else { + seq_printf(seq, "inactive"); } sectors = 0; From f55d3b82ac2fe8e12fe784702a7a39ab36b7d4e1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 6 Sep 2024 18:21:53 +0800 Subject: [PATCH 092/120] MAINTAINERS: Move the BFQ io scheduler to Odd Fixes state BFQ has been lacking active maintenance for approximately two years, and it was recently transitioned to the Orphan state. However, there are still many users, I have decided to step forward and assume the role of maintainer to ensure continued support and development. While I may not be the one with the most extensive knowledge of BFQ's internals, I have been actively involved in its development since 2021. Moreover, our team continues to rigorously test BFQ in downstream kernels, ensuring it's stability and performance. Despite my confidence to maintain BFQ, I believe it is prudent to classify its state as "Odd Fixes" to accurately reflect my relatively new position as the maintainer. By assuming this responsibility, I am committed to providing the necessary support and addressing any issues that may arise with BFQ. As time progresses, we will reassess the situation and determine the appropriate state. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240906102153.612997-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4a857a125d6e..6da36e26ee8a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3781,8 +3781,9 @@ F: Documentation/filesystems/befs.rst F: fs/befs/ BFQ I/O SCHEDULER +M: Yu Kuai L: linux-block@vger.kernel.org -S: Orphan +S: Odd Fixes F: Documentation/block/bfq-iosched.rst F: block/bfq-* From e49dacc71ec2621ce4c422cd5605d4d06f7807b0 Mon Sep 17 00:00:00 2001 From: Wouter Verhelst Date: Mon, 12 Aug 2024 15:20:37 +0200 Subject: [PATCH 093/120] nbd: implement the WRITE_ZEROES command The NBD protocol defines a message for zeroing out a region of an export Add support to the kernel driver for that message. 
Signed-off-by: Wouter Verhelst Cc: Eric Blake Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240812133032.115134-3-w@uter.be Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 8 ++++++++ include/uapi/linux/nbd.h | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4d06472bf112..eee632b84994 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -363,6 +363,8 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, } if (nbd->config->flags & NBD_FLAG_ROTATIONAL) lim.features |= BLK_FEAT_ROTATIONAL; + if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES) + lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT; lim.logical_block_size = blksize; lim.physical_block_size = blksize; @@ -432,6 +434,8 @@ static u32 req_to_nbd_cmd_type(struct request *req) return NBD_CMD_WRITE; case REQ_OP_READ: return NBD_CMD_READ; + case REQ_OP_WRITE_ZEROES: + return NBD_CMD_WRITE_ZEROES; default: return U32_MAX; } @@ -648,6 +652,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, if (req->cmd_flags & REQ_FUA) nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES)) + nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE; /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the @@ -1717,6 +1723,8 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); + if (flags & NBD_FLAG_SEND_WRITE_ZEROES) + seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); return 0; } diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index d75215f2c675..f1d468acfb25 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -42,8 +42,9 @@ enum { NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3, - NBD_CMD_TRIM = 4 + NBD_CMD_TRIM = 4, /* userspace defines additional extension commands */ + NBD_CMD_WRITE_ZEROES = 6, }; /* values for flags field, these are server interaction specific. */ @@ -53,11 +54,13 @@ enum { #define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ #define NBD_FLAG_ROTATIONAL (1 << 4) /* device is rotational */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6) /* supports WRITE_ZEROES */ /* there is a gap here to match userspace */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ /* values for cmd flags in the upper 16 bits of request type */ #define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */ +#define NBD_CMD_FLAG_NO_HOLE (1 << 17) /* Do not punch a hole for WRITE_ZEROES */ /* These are client behavior specific flags. 
*/ #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on From 41372f5c9a866365e212809b3543ae8cb5b2542b Mon Sep 17 00:00:00 2001 From: Wouter Verhelst Date: Mon, 12 Aug 2024 15:20:40 +0200 Subject: [PATCH 094/120] nbd: nbd_bg_flags_show: add NBD_FLAG_ROTATIONAL Also handle NBD_FLAG_ROTATIONAL in our debug helper function. Signed-off-by: Wouter Verhelst Cc: Eric Blake Link: https://lore.kernel.org/r/20240812133032.115134-6-w@uter.be Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index eee632b84994..18b94d7c795a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1725,6 +1725,8 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); if (flags & NBD_FLAG_SEND_WRITE_ZEROES) seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); + if (flags & NBD_FLAG_ROTATIONAL) + seq_puts(s, "NBD_FLAG_ROTATIONAL\n"); return 0; } From 296dbc72d29085d5fc34430d0760423071e9e81d Mon Sep 17 00:00:00 2001 From: Wouter Verhelst Date: Mon, 12 Aug 2024 15:20:42 +0200 Subject: [PATCH 095/120] nbd: correct the maximum value for discard sectors The version of the NBD protocol implemented by the kernel driver currently has a 32 bit field for length values. As the NBD protocol uses bytes as a unit of length, length values larger than 2^32 bytes cannot be expressed. Update the max_hw_discard_sectors field to match that. Signed-off-by: Wouter Verhelst Fixes: 268283244c0f ("nbd: use the atomic queue limits API in nbd_set_size") Reviewed-by: Damien Le Moal Cc: Eric Blake Link: https://lore.kernel.org/r/20240812133032.115134-8-w@uter.be Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 18b94d7c795a..b852050d8a96 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -350,7 +350,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim = queue_limits_start_update(nbd->disk->queue); if (nbd->config->flags & NBD_FLAG_SEND_TRIM) - lim.max_hw_discard_sectors = UINT_MAX; + lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT; else lim.max_hw_discard_sectors = 0; if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { From 9518e5bfaae19447d657983d0628062ab6712610 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 6 Sep 2024 16:14:43 +0200 Subject: [PATCH 096/120] zram: Replace bit spinlocks with a spinlock_t. The bit spinlock disables preemption. The spinlock_t lock becomes a sleeping lock on PREEMPT_RT and it can not be acquired in this context. In this locked section, zs_free() acquires a zs_pool::lock, and there is access to zram::wb_limit_lock. Add a spinlock_t for locking. Keep the set/clear of the ZRAM_LOCK bit after the lock has been acquired/dropped. The size of struct zram_table_entry increases by 4 bytes due to the lock and an additional 4 bytes of padding with CONFIG_ZRAM_TRACK_ENTRY_ACTIME enabled.
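The constraint can be mimicked in userspace, purely as an analogue: a bit spinlock busy-waits (with preemption disabled in the kernel), so nothing that sleeps may run inside it, while a regular lock may block, which is what PREEMPT_RT requires when the critical section takes other sleeping locks. The names here are invented.

#include <pthread.h>
#include <stdatomic.h>

struct entry {
	atomic_flag bitlock;		/* old scheme: spin on a single bit */
	pthread_mutex_t lock;		/* new scheme: blocking-capable lock */
};

static void entry_lock_old(struct entry *e)
{
	/* busy-wait; no sleeping allowed inside the critical section */
	while (atomic_flag_test_and_set_explicit(&e->bitlock,
						 memory_order_acquire))
		;
}

static void entry_unlock_old(struct entry *e)
{
	atomic_flag_clear_explicit(&e->bitlock, memory_order_release);
}

static void entry_lock_new(struct entry *e)
{
	pthread_mutex_lock(&e->lock);	/* may block; nested sleeping locks are fine */
}

static void entry_unlock_new(struct entry *e)
{
	pthread_mutex_unlock(&e->lock);
}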
Signed-off-by: Mike Galbraith Reviewed-by: Sergey Senozhatsky Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20240906141520.730009-2-bigeasy@linutronix.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 18 ++++++++++++++---- drivers/block/zram/zram_drv.h | 1 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index efcb8d9d274c..c1f0f2da1ec3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -59,17 +59,24 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, static int zram_slot_trylock(struct zram *zram, u32 index) { - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); + int ret; + + ret = spin_trylock(&zram->table[index].lock); + if (ret) + __set_bit(ZRAM_LOCK, &zram->table[index].flags); + return ret; } static void zram_slot_lock(struct zram *zram, u32 index) { - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); + spin_lock(&zram->table[index].lock); + __set_bit(ZRAM_LOCK, &zram->table[index].flags); } static void zram_slot_unlock(struct zram *zram, u32 index) { - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + __clear_bit(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); } static inline bool init_done(struct zram *zram) @@ -1211,7 +1218,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) static bool zram_meta_alloc(struct zram *zram, u64 disksize) { - size_t num_pages; + size_t num_pages, index; num_pages = disksize >> PAGE_SHIFT; zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table))); @@ -1226,6 +1233,9 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); return true; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 35e322144629..dcc3c106ce71 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -69,6 +69,7 @@ struct zram_table_entry { unsigned long element; }; unsigned long flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; #endif From 6086aeb49e3d9e25165769b2a0a13ff67f98a1a2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 6 Sep 2024 16:14:44 +0200 Subject: [PATCH 097/120] zram: Remove ZRAM_LOCK The ZRAM_LOCK bit was used for locking; after the addition of the spinlock_t, the bit is still set and cleared but there is no reader of it. Remove the ZRAM_LOCK bit.
Reviewed-by: Sergey Senozhatsky Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20240906141520.730009-3-bigeasy@linutronix.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 11 ++--------- drivers/block/zram/zram_drv.h | 4 +--- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c1f0f2da1ec3..60946a9a7bc4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -59,23 +59,16 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, static int zram_slot_trylock(struct zram *zram, u32 index) { - int ret; - - ret = spin_trylock(&zram->table[index].lock); - if (ret) - __set_bit(ZRAM_LOCK, &zram->table[index].flags); - return ret; + return spin_trylock(&zram->table[index].lock); } static void zram_slot_lock(struct zram *zram, u32 index) { spin_lock(&zram->table[index].lock); - __set_bit(ZRAM_LOCK, &zram->table[index].flags); } static void zram_slot_unlock(struct zram *zram, u32 index) { - __clear_bit(ZRAM_LOCK, &zram->table[index].flags); spin_unlock(&zram->table[index].lock); } @@ -1293,7 +1286,7 @@ out: zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); WARN_ON_ONCE(zram->table[index].flags & - ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); + ~(1UL << ZRAM_UNDER_WB)); } /* diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index dcc3c106ce71..262fa960a078 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -45,9 +45,7 @@ /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { - /* zram slot is locked */ - ZRAM_LOCK = ZRAM_FLAG_SHIFT, - ZRAM_SAME, /* Page consists the same element */ + ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ From 68d20eb60efbdc80662efedeb088353e9c4aa17f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 6 Sep 2024 16:14:45 +0200 Subject: [PATCH 098/120] zram: Shrink zram_table_entry::flags. The zram_table_entry::flags member is of type long and uses 8 bytes on a 64bit architecture. With a PAGE_SIZE of 256KiB we have PAGE_SHIFT of 18 which in turn leads to __NR_ZRAM_PAGEFLAGS = 27. This still fits in an ordinary integer. By reducing the size of `flags' to four bytes, the size of the struct goes back to 16 bytes. The padding between the lock and ac_time (if enabled) is also gone. Make zram_table_entry::flags an unsigned int and update the build test to reflect the change. 
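The sizing argument can be checked mechanically. The sketch below assumes ZRAM_FLAG_SHIFT is PAGE_SHIFT + 1 and an illustrative count of eight flags, which reproduces the 27 cited above; it mirrors the BUILD_BUG_ON with a C11 static assert.

#include <assert.h>

#define PAGE_SHIFT 18				/* 256 KiB pages, the worst case cited */
#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1)
#define NR_FLAGS 8				/* illustrative flag count */
#define NR_ZRAM_PAGEFLAGS (ZRAM_FLAG_SHIFT + NR_FLAGS)	/* = 27 */

/* an unsigned int provides 32 bits, so 27 flag bits still fit */
static_assert(NR_ZRAM_PAGEFLAGS <= sizeof(unsigned int) * 8,
	      "zram page flags must fit in flags");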
Reviewed-by: Sergey Senozhatsky Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20240906141520.730009-4-bigeasy@linutronix.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 3 ++- drivers/block/zram/zram_drv.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 60946a9a7bc4..84cd62efac0f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2404,9 +2404,10 @@ static void destroy_devices(void) static int __init zram_init(void) { + struct zram_table_entry zram_te; int ret; - BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 262fa960a078..531cefc66668 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -66,7 +66,7 @@ struct zram_table_entry { unsigned long handle; unsigned long element; }; - unsigned long flags; + unsigned int flags; spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; From d981ed8419303ed12351eea8541ad6cb76455fe3 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 5 Sep 2024 07:54:53 +0800 Subject: [PATCH 099/120] md: Add new_level sysfs interface Reshape currently works in two modes: with a backup file or without one. Without a backup file, reshape needs to change the data offset and does not need the systemd service mdadm-grow-continue, so it can finish the reshape job within a single process environment: that process knows the new level from the mdadm --grow command and can switch to the new level after the reshape finishes. With a backup file, the systemd service mdadm-grow-continue is needed to monitor reshape progress, so there are two processes involved. One is the mdadm --grow command, which kicks off the reshape and wakes up the mdadm-grow-continue service. The second process is the service itself, which doesn't learn the new level from the first process. In kernel space mddev->new_level is used to record the new level when doing reshape. This patch adds a new interface to help mdadm update new_level and sync it to metadata. Then mdadm-grow-continue can read the right new_level. Commit log revised by Song Liu. Please refer to the link for more details.
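Usage from the monitoring process would look roughly like the following; the sysfs path and the numeric level are illustrative assumptions, not taken from mdadm.

#include <stdio.h>

int main(void)
{
	/* hypothetical: tell md0 that the ongoing reshape targets level 6 */
	FILE *f = fopen("/sys/block/md0/md/new_level", "w");

	if (!f)
		return 1;
	fprintf(f, "6\n");	/* parsed by kstrtouint() in new_level_store() */
	fclose(f);
	return 0;
}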
Signed-off-by: Xiao Ni Link: https://lore.kernel.org/r/20240904235453.99120-1-xni@redhat.com Signed-off-by: Song Liu --- drivers/md/md.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index b669971d4782..179ee4afe937 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4052,6 +4052,34 @@ out_unlock: static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); +static ssize_t +new_level_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->new_level); +} + +static ssize_t +new_level_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int n; + int err; + + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; + err = mddev_lock(mddev); + if (err) + return err; + + mddev->new_level = n; + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + return len; +} +static struct md_sysfs_entry md_new_level = +__ATTR(new_level, 0664, new_level_show, new_level_store); + static ssize_t layout_show(struct mddev *mddev, char *page) { @@ -5583,6 +5611,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_new_level.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, From 389e72c5d1efcfd3962d29624c1b78cfa002911e Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Fri, 6 Sep 2024 14:58:15 +0800 Subject: [PATCH 100/120] nvme: Convert comma to semicolon To ensure code clarity and prevent potential errors, it's advisable to employ the ';' as a statement separator, except when ',' are intentionally used for specific purposes. Signed-off-by: Shen Lichuan Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 75ab62cb4aa4..1236e3aa00ed 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4605,7 +4605,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, set->flags = BLK_MQ_F_SHOULD_MERGE; if (ctrl->ops->flags & NVME_F_BLOCKING) set->flags |= BLK_MQ_F_BLOCKING; - set->cmd_size = cmd_size, + set->cmd_size = cmd_size; set->driver_data = ctrl; set->nr_hw_queues = ctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; From a02e98bebc15f1d973c2a62005be9456a657e2b6 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Sat, 7 Sep 2024 11:40:46 +0800 Subject: [PATCH 101/120] mtip32xx: Remove redundant null pointer checks in mtip_hw_debugfs_init() Since the debugfs_create_dir() never returns a null pointer, checking the return value for a null pointer is redundant. Since debugfs_create_file() can deal with a ERR_PTR() style pointer, drop the check. Since mtip_hw_debugfs_init does not pay attention to the return value, its return type can be changed to void. 
Signed-off-by: Li Zetao Link: https://lore.kernel.org/r/20240907034046.3595268-1-lizetao1@huawei.com Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c6ef0546ffc9..11901f2812ad 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2269,25 +2269,12 @@ static const struct file_operations mtip_flags_fops = { .llseek = no_llseek, }; -static int mtip_hw_debugfs_init(struct driver_data *dd) +static void mtip_hw_debugfs_init(struct driver_data *dd) { - if (!dfs_parent) - return -1; - dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); - if (IS_ERR_OR_NULL(dd->dfs_node)) { - dev_warn(&dd->pdev->dev, - "Error creating node %s under debugfs\n", - dd->disk->disk_name); - dd->dfs_node = NULL; - return -1; - } - debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops); debugfs_create_file("registers", 0444, dd->dfs_node, dd, &mtip_regs_fops); - - return 0; } static void mtip_hw_debugfs_exit(struct driver_data *dd) @@ -4043,10 +4030,6 @@ static int __init mtip_init(void) mtip_major = error; dfs_parent = debugfs_create_dir("rssd", NULL); - if (IS_ERR_OR_NULL(dfs_parent)) { - pr_warn("Error creating debugfs parent\n"); - dfs_parent = NULL; - } /* Register our PCI operations. */ error = pci_register_driver(&mtip_pci_driver); From acc8c0a9887558966295e3c0881e633154c239a9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 6 Sep 2024 12:45:40 -0700 Subject: [PATCH 102/120] blk-mq: add missing unplug trace event The single-queue optimized list flush doesn't have an unplug trace event to pair with the plug event. Add one. In the unlikely event an error occurs and the code falls back to the less optimized plug flush path, it's possible a second unplug trace event will be logged, but it will show the remaining count of requests that weren't previously handled. Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240906194540.3719642-1-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 36abbaefe387..3f1f7d0b3ff3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2753,6 +2753,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request *rq; + unsigned int depth; /* * We may have been called recursively midway through handling @@ -2763,6 +2764,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (plug->rq_count == 0) return; + depth = plug->rq_count; plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { @@ -2770,6 +2772,7 @@ rq = rq_list_peek(&plug->mq_list); q = rq->q; + trace_block_unplug(q, depth, true); /* * Peek first request and see if we have a ->queue_rqs() hook.
From 2d5a333e09c388189238291577e443221baacba0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Sep 2024 20:21:09 +0000 Subject: [PATCH 103/120] nvme-tcp: fix link failure for TCP auth The nvme fabric driver calls the nvme_tls_key_lookup() function from nvmf_parse_key() when the keyring is enabled, but this is broken in a configuration with CONFIG_NVME_FABRICS=y and CONFIG_NVME_TCP=m because this leads to the function definition being in a loadable module: x86_64-linux-ld: vmlinux.o: in function `nvmf_parse_key': fabrics.c:(.text+0xb1bdec): undefined reference to `nvme_tls_key_lookup' Move the 'select' up to CONFIG_NVME_FABRICS itself to force this part to be built-in as well if needed. Fixes: 5bc46b49c828 ("nvme-tcp: check for invalidated or revoked key") Signed-off-by: Arnd Bergmann Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 883aaab2d83e..486afe598184 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -41,6 +41,7 @@ config NVME_HWMON config NVME_FABRICS select NVME_CORE + select NVME_KEYRING if NVME_TCP_TLS tristate config NVME_RDMA @@ -94,7 +95,6 @@ config NVME_TCP config NVME_TCP_TLS bool "NVMe over Fabrics TCP TLS encryption support" depends on NVME_TCP - select NVME_KEYRING select NET_HANDSHAKE select KEYS help From a5e61b50c9f44c5edb6e134ede6fee8806ffafa9 Mon Sep 17 00:00:00 2001 From: Mikhail Lobanov Date: Mon, 9 Sep 2024 09:37:36 -0400 Subject: [PATCH 104/120] drbd: Add NULL check for net_conf to prevent dereference in state validation If the net_conf pointer is NULL and the code attempts to access its fields without a check, it will lead to a null pointer dereference. Add a NULL check before dereferencing the pointer. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 44ed167da748 ("drbd: rcu_read_lock() and rcu_dereference() for tconn->net_conf") Cc: stable@vger.kernel.org Signed-off-by: Mikhail Lobanov Link: https://lore.kernel.org/r/20240909133740.84297-1-m.lobanov@rosalinux.ru Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index e858e7e0383f..c2b6c4d9729d 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -876,7 +876,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) ns.disk == D_OUTDATED) rv = SS_CONNECTED_OUTDATES; - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && (nc->verify_alg[0] == 0)) rv = SS_NO_VERIFY_ALG; From 3bf73e6283ef0bae4e27dad62309e50e3bf7ee88 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Sep 2024 21:51:48 +0800 Subject: [PATCH 105/120] blk-throttle: remove last_low_overflow_time last_low_overflow_time is not used anymore after commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW"). 
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240903135149.271857-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 +------- block/blk-throttle.h | 2 -- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index dc6140fa3de0..eb859c44c9f3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1611,17 +1611,13 @@ bool __blk_throtl_bio(struct bio *bio) sq = &tg->service_queue; while (true) { - if (tg->last_low_overflow_time[rw] == 0) - tg->last_low_overflow_time[rw] = jiffies; /* throtl is FIFO - if bios are already queued, should queue */ if (sq->nr_queued[rw]) break; /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) { - tg->last_low_overflow_time[rw] = jiffies; + if (!tg_may_dispatch(tg, bio, NULL)) break; - } /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); @@ -1661,8 +1657,6 @@ bool __blk_throtl_bio(struct bio *bio) tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); - tg->last_low_overflow_time[rw] = jiffies; - td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 4d9ef5abdf21..1a36d1278eea 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -106,8 +106,6 @@ struct throtl_grp { /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; - unsigned long last_low_overflow_time[2]; - uint64_t last_bytes_disp[2]; unsigned int last_io_disp[2]; From 29390bb5661d49d10424ad8e915230de1f7074c9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Sep 2024 21:51:49 +0800 Subject: [PATCH 106/120] blk-throttle: support prioritized processing of metadata Currently, blk-throttle handles all IO in FIFO order, hence if data IO is throttled and meta IO is then dispatched, the meta IO will have to wait for the data IO, causing priority inversion problems. This patch supports handling metadata first and then paying the debt while throttling data.
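The carryover idea can be sketched with simplified types; the struct and field names below are stand-ins for throtl_grp's accounting, not the real code. IO that must not be blocked is dispatched immediately and its cost is recorded as negative carryover, which later data IO has to pay off.

#include <stdbool.h>
#include <stdint.h>

struct tg {				/* stand-in for struct throtl_grp */
	int64_t carryover_bytes;	/* may go negative: that is the debt */
	int64_t budget_bytes;		/* bytes allowed in the current slice */
};

static bool within_limit(const struct tg *tg, int64_t bio_bytes, bool queued)
{
	if (queued)			/* throtl is FIFO: queued bios go first */
		return false;
	return bio_bytes <= tg->budget_bytes + tg->carryover_bytes;
}

static void dispatch_in_debt(struct tg *tg, int64_t bio_bytes)
{
	/* dispatch now; the debt lengthens later wait-time calculations */
	tg->carryover_bytes -= bio_bytes;
}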
Test script: use cgroup v1 to throttle root cgroup, then create new dir and file while write back is throttled test() { mkdir /mnt/test/xxx touch /mnt/test/xxx/1 sync /mnt/test/xxx sync /mnt/test/xxx } mkfs.ext4 -F /dev/nvme0n1 -E lazy_itable_init=0,lazy_journal_init=0 mount /dev/nvme0n1 /mnt/test echo "259:0 $((1024*1024))" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device dd if=/dev/zero of=/mnt/test/foo1 bs=16M count=1 conv=fdatasync status=none & sleep 4 time test echo "259:0 0" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device sleep 1 umount /dev/nvme0n1 Test result: time cost for creating new dir and file before this patch: 14s after this patch: 0.1s Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240903135149.271857-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 65 +++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index eb859c44c9f3..9c5bbd261724 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1595,6 +1595,22 @@ void blk_throtl_cancel_bios(struct gendisk *disk) spin_unlock_irq(&q->queue_lock); } +static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + /* throtl is FIFO - if bios are already queued, should queue */ + if (tg->service_queue.nr_queued[rw]) + return false; + + return tg_may_dispatch(tg, bio, NULL); +} + +static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) + tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); + tg->carryover_ios[rw]--; +} + bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -1611,29 +1627,34 @@ bool __blk_throtl_bio(struct bio *bio) sq = &tg->service_queue; while (true) { - /* throtl is FIFO - if bios are already queued, should queue */ - if (sq->nr_queued[rw]) + if (tg_within_limit(tg, bio, rw)) { + /* within limits, let's charge and dispatch directly */ + throtl_charge_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being + * queued otherwise it might happen that a bio is not + * queued for a long time and slice keeps on extending + * and trim is not called for a long time. Now if limits + * are reduced suddenly we take into account all the IO + * dispatched so far at new low rate and * newly queued + * IO gets a really long dispatch time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(tg, rw); + } else if (bio_issue_as_root_blkg(bio)) { + /* + * IOs which may cause priority inversions are + * dispatched directly, even if they're over limit. + * Debts are handled by carryover_bytes/ios while + * calculating wait time. + */ + tg_dispatch_in_debt(tg, bio, rw); + } else { + /* if above limits, break to queue */ break; - - /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) - break; - - /* within limits, let's charge and dispatch directly */ - throtl_charge_bio(tg, bio); - - /* - * We need to trim slice even when bios are not being queued - * otherwise it might happen that a bio is not queued for - * a long time and slice keeps on extending and trim is not - * called for a long time. Now if limits are reduced suddenly - * we take into account all the IO dispatched so far at new - * low rate and * newly queued IO gets a really long dispatch - * time. - * - * So keep on trimming slice even if bio is not queued. 
- */ - throtl_trim_slice(tg, rw); + } /* * @bio passed through this layer without being throttled. From 1ba0403ac6447f2d63914fb760c44a3b19c44eaf Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 9 Sep 2024 21:41:48 +0800 Subject: [PATCH 107/120] block, bfq: fix UAF for accessing waker_bfqq after splitting After commit 42c306ed7233 ("block, bfq: don't break merge chain in bfq_split_bfqq()"), if the current process is the last holder of bfqq, the bfqq can be freed after bfq_split_bfqq(). Hence recording the bfqq and then accessing bfqq->waker_bfqq may trigger a UAF. What's more, the waker_bfqq may be in the merge chain of bfqq, hence just recording waker_bfqq is still not safe. Fix the problem by adding a helper bfq_waker_bfqq() to check if bfqq->waker_bfqq is in the merge chain, and whether the current process is the only holder. Fixes: 42c306ed7233 ("block, bfq: don't break merge chain in bfq_split_bfqq()") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240909134154.954924-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d1bf2b8a3576..d5d39974c674 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6825,6 +6825,31 @@ static void bfq_prepare_request(struct request *rq) rq->elv.priv[0] = rq->elv.priv[1] = NULL; } +static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq) +{ + struct bfq_queue *new_bfqq = bfqq->new_bfqq; + struct bfq_queue *waker_bfqq = bfqq->waker_bfqq; + + if (!waker_bfqq) + return NULL; + + while (new_bfqq) { + if (new_bfqq == waker_bfqq) { + /* + * If waker_bfqq is in the merge chain, and current + * is the only procress. + */ + if (bfqq_process_refs(waker_bfqq) == 1) + return NULL; + break; + } + + new_bfqq = new_bfqq->new_bfqq; + } + + return waker_bfqq; +} + /* * If needed, init rq, allocate bfq data structures associated with * rq, and increment reference counters in the destination bfq_queue * for this request @@ -6886,7 +6911,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && !bic->bfqq_data[a_idx].stably_merged) { - struct bfq_queue *old_bfqq = bfqq; + struct bfq_queue *waker_bfqq = bfq_waker_bfqq(bfqq); /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) @@ -6906,7 +6931,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq_already_existing = true; if (!bfqq_already_existing) { - bfqq->waker_bfqq = old_bfqq->waker_bfqq; + bfqq->waker_bfqq = waker_bfqq; bfqq->tentative_waker_bfqq = NULL; /* * If the waker queue disappears, then * new_bfqq->waker_bfqq must be reset. So insert new_bfqq into the * woken_list of the waker. See * bfq_check_waker for details.
*/ - if (bfqq->waker_bfqq) + if (waker_bfqq) hlist_add_head(&bfqq->woken_list_node, &bfqq->waker_bfqq->woken_list); } From 73aeab373557fa6ee4ae0b742c6211ccd9859280 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 9 Sep 2024 21:41:49 +0800 Subject: [PATCH 108/120] block, bfq: fix process reference leakage for bfqq in merge chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Original state: Process 1 Process 2 Process 3 Process 4 (BIC1) (BIC2) (BIC3) (BIC4) Λ | | | \--------------\ \-------------\ \-------------\| V V V bfqq1--------->bfqq2---------->bfqq3----------->bfqq4 ref 0 1 2 4 After commit 0e456dba86c7 ("block, bfq: choose the last bfqq from merge chain in bfq_setup_cooperator()"), if P1 issues a new IO: Without the patch: Process 1 Process 2 Process 3 Process 4 (BIC1) (BIC2) (BIC3) (BIC4) Λ | | | \------------------------------\ \-------------\| V V bfqq1--------->bfqq2---------->bfqq3----------->bfqq4 ref 0 0 2 4 bfqq3 will be used to handle IO from P1; this is not expected, as the IO should be redirected to bfqq4. With the patch: ------------------------------------------- | | Process 1 Process 2 Process 3 | Process 4 (BIC1) (BIC2) (BIC3) | (BIC4) | | | | \-------------\ \-------------\| V V bfqq1--------->bfqq2---------->bfqq3----------->bfqq4 ref 0 0 2 4 IO is redirected to bfqq4; however, the process reference of bfqq3 is still 2, while only P2 is using it. Fix the problem by calling bfq_merge_bfqqs() for each bfqq in the merge chain. Also change bfq_merge_bfqqs() to return new_bfqq to simplify the code. Fixes: 0e456dba86c7 ("block, bfq: choose the last bfqq from merge chain in bfq_setup_cooperator()") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240909134154.954924-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d5d39974c674..f4192d5411d2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3129,10 +3129,12 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void -bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +static struct bfq_queue *bfq_merge_bfqqs(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_queue *bfqq) { + struct bfq_queue *new_bfqq = bfqq->new_bfqq; + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", (unsigned long)new_bfqq->pid); /* Save weight raising and idle window of the merged queues */ @@ -3226,6 +3228,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_reassign_last_bfqq(bfqq, new_bfqq); bfq_release_process_ref(bfqd, bfqq); + + return new_bfqq; } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -3261,14 +3265,8 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * fulfilled, i.e., bic can be redirected to new_bfqq * and bfqq can be put. */ - bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, - new_bfqq); - /* - * If we get here, bio will be queued into new_queue, - * so use new_bfqq to decide whether bio and rq can be - * merged. - */ - bfqq = new_bfqq; + while (bfqq != new_bfqq) + bfqq = bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq); /* * Change also bqfd->bio_bfqq, as @@ -5705,9 +5703,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, * state before killing it.
*/ bfqq->bic = bic; - bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); - - return new_bfqq; + return bfq_merge_bfqqs(bfqd, bic, bfqq); } /* @@ -6162,6 +6158,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bool waiting, idle_timer_disabled = false; if (new_bfqq) { + struct bfq_queue *old_bfqq = bfqq; /* * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. @@ -6178,18 +6175,18 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * new_bfqq. */ if (bic_to_bfqq(RQ_BIC(rq), true, - bfq_actuator_index(bfqd, rq->bio)) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); + bfq_actuator_index(bfqd, rq->bio)) == bfqq) { + while (bfqq != new_bfqq) + bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq); + } - bfq_clear_bfqq_just_created(bfqq); + bfq_clear_bfqq_just_created(old_bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq */ - bfq_put_queue(bfqq); + bfq_put_queue(old_bfqq); rq->elv.priv[1] = new_bfqq; - bfqq = new_bfqq; } bfq_update_io_thinktime(bfqd, bfqq); From bc3b1e9e7c50e1de0f573eea3871db61dd4787de Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 9 Sep 2024 21:41:50 +0800 Subject: [PATCH 109/120] block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator() Because bfq_put_cooperator() is always followed by bfq_release_process_ref(). Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240909134154.954924-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 1 - block/bfq-iosched.c | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9fb9f3533150..e831aedb4643 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -736,7 +736,6 @@ static void bfq_sync_bfqq_move(struct bfq_data *bfqd, */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); - bfq_release_process_ref(bfqd, sync_bfqq); } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f4192d5411d2..17b0bf6b56bb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5434,6 +5434,8 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) bfq_put_queue(__bfqq); __bfqq = next; } + + bfq_release_process_ref(bfqq->bfqd, bfqq); } static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -5446,8 +5448,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); bfq_put_cooperator(bfqq); - - bfq_release_process_ref(bfqd, bfqq); } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, @@ -6734,8 +6734,6 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); - - bfq_release_process_ref(bfqq->bfqd, bfqq); return NULL; } From 553a606c25f8ff5c518c7fcf488dd4dd5fbb4795 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 9 Sep 2024 21:41:51 +0800 Subject: [PATCH 110/120] block, bfq: remove bfq_log_bfqg() It's not used, hence can be removed. 
Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240909134154.954924-5-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index e16d96e2367b..687a3a7ba784 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -1185,11 +1185,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
 			"%s " fmt, pid_str, ##args);		\
 } while (0)

-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {			\
-	blk_add_cgroup_trace_msg((bfqd)->queue,				\
-			&bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args);	\
-} while (0)
-
 #else /* CONFIG_BFQ_GROUP_IOSCHED */

 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do {	\
@@ -1199,7 +1194,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
 	bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH);	\
 	blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args);	\
 } while (0)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {} while (0)

 #endif	/* CONFIG_BFQ_GROUP_IOSCHED */

From e61e002a67da9ec36571af743c94a968cf1ce116 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 9 Sep 2024 21:41:52 +0800
Subject: [PATCH 111/120] block, bfq: remove local variable 'split' in
 bfq_init_rq()

The local variable 'split' is only used to decide whether to call
bfq_bfqq_resume_state() later. Since 'bfqd->lock' is held, and the bfqq
status will not change between setting 'split' and calling
bfq_bfqq_resume_state(), move bfq_bfqq_resume_state() forward so that
'split' can be removed. There are no functional changes.

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240909134154.954924-6-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 17b0bf6b56bb..54f6eae2763d 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6877,7 +6877,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 	const int is_sync = rq_is_sync(rq);
 	struct bfq_queue *bfqq;
 	bool new_queue = false;
-	bool bfqq_already_existing = false, split = false;
+	bool bfqq_already_existing = false;
 	unsigned int a_idx = bfq_actuator_index(bfqd, bio);

 	if (unlikely(!rq->elv.icq))
@@ -6914,16 +6914,19 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 					true;

 			bfqq = bfq_split_bfqq(bic, bfqq);
-			split = true;
-
 			if (!bfqq) {
 				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
 								 true, is_sync,
 								 NULL);
 				if (unlikely(bfqq == &bfqd->oom_bfqq))
 					bfqq_already_existing = true;
-			} else
+				else
+					bfq_bfqq_resume_state(bfqq, bfqd, bic,
+							      false);
+			} else {
 				bfqq_already_existing = true;
+				bfq_bfqq_resume_state(bfqq, bfqd, bic, true);
+			}

 			if (!bfqq_already_existing) {
 				bfqq->waker_bfqq = waker_bfqq;
@@ -6959,18 +6962,8 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 	 * resume its state.
 	 */
 	if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq &&
-	    bfqq_process_refs(bfqq) == 1) {
+	    bfqq_process_refs(bfqq) == 1)
 		bfqq->bic = bic;
-		if (split) {
-			/*
-			 * The queue has just been split from a shared
-			 * queue: restore the idle window and the
-			 * possible weight raising period.
-			 */
-			bfq_bfqq_resume_state(bfqq, bfqd, bic,
-					      bfqq_already_existing);
-		}
-	}

 	/*
 	 * Consider bfqq as possibly belonging to a burst of newly

From 3c61429c297582e0da7231fb29fc5ec1d2c7d1b2 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 9 Sep 2024 21:41:53 +0800
Subject: [PATCH 112/120] block, bfq: remove local variable
 'bfqq_already_existing' in bfq_init_rq()

Now that 'bfqq_already_existing' is only used in one branch, it can be
removed. There are no functional changes.

Signed-off-by: Yu Kuai
Link: https://lore.kernel.org/r/20240909134154.954924-7-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 54f6eae2763d..7936e8bc166a 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6877,7 +6877,6 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 	const int is_sync = rq_is_sync(rq);
 	struct bfq_queue *bfqq;
 	bool new_queue = false;
-	bool bfqq_already_existing = false;
 	unsigned int a_idx = bfq_actuator_index(bfqd, bio);

 	if (unlikely(!rq->elv.icq))
@@ -6918,31 +6917,27 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
 								 true, is_sync,
 								 NULL);
-				if (unlikely(bfqq == &bfqd->oom_bfqq))
-					bfqq_already_existing = true;
-				else
+				if (likely(bfqq != &bfqd->oom_bfqq)) {
 					bfq_bfqq_resume_state(bfqq, bfqd, bic,
 							      false);
+					bfqq->waker_bfqq = waker_bfqq;
+					bfqq->tentative_waker_bfqq = NULL;
+
+					/*
+					 * If the waker queue disappears, then
+					 * new_bfqq->waker_bfqq must be
+					 * reset. So insert new_bfqq into the
+					 * woken_list of the waker. See
+					 * bfq_check_waker for details.
+					 */
+					if (waker_bfqq)
+						hlist_add_head(
+							&bfqq->woken_list_node,
+							&bfqq->waker_bfqq->woken_list);
+				}
 			} else {
-				bfqq_already_existing = true;
 				bfq_bfqq_resume_state(bfqq, bfqd, bic, true);
 			}
-
-			if (!bfqq_already_existing) {
-				bfqq->waker_bfqq = waker_bfqq;
-				bfqq->tentative_waker_bfqq = NULL;
-
-				/*
-				 * If the waker queue disappears, then
-				 * new_bfqq->waker_bfqq must be
-				 * reset. So insert new_bfqq into the
-				 * woken_list of the waker. See
-				 * bfq_check_waker for details.
-				 */
-				if (waker_bfqq)
-					hlist_add_head(&bfqq->woken_list_node,
-						       &bfqq->waker_bfqq->woken_list);
-			}
 		}
 	}

From a7609d2aec67ec16220036a5b1b14610883cdbd3 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 9 Sep 2024 21:41:54 +0800
Subject: [PATCH 113/120] block, bfq: factor out a helper to split bfqq in
 bfq_init_rq()

Make the code cleaner; there are no functional changes.
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240909134154.954924-8-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 109 +++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7936e8bc166a..0747d9d0e48c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6737,11 +6737,10 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) return NULL; } -static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct bio *bio, - bool split, bool is_sync, - bool *new_queue) +static struct bfq_queue * +__bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bio *bio, bool split, bool is_sync, + bool *new_queue) { unsigned int act_idx = bfq_actuator_index(bfqd, bio); struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); @@ -6845,6 +6844,59 @@ static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq) return waker_bfqq; } +static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bio *bio, + unsigned int idx, + bool is_sync) +{ + struct bfq_queue *waker_bfqq; + struct bfq_queue *bfqq; + bool new_queue = false; + + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); + if (unlikely(new_queue)) + return bfqq; + + /* If the queue was seeky for too long, break it apart. */ + if (!bfq_bfqq_coop(bfqq) || !bfq_bfqq_split_coop(bfqq) || + bic->bfqq_data[idx].stably_merged) + return bfqq; + + waker_bfqq = bfq_waker_bfqq(bfqq); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->bfqq_data[idx].saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); + if (bfqq) { + bfq_bfqq_resume_state(bfqq, bfqd, bic, true); + return bfqq; + } + + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); + if (unlikely(bfqq == &bfqd->oom_bfqq)) + return bfqq; + + bfq_bfqq_resume_state(bfqq, bfqd, bic, false); + bfqq->waker_bfqq = waker_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then new_bfqq->waker_bfqq must be + * reset. So insert new_bfqq into the + * woken_list of the waker. See + * bfq_check_waker for details. + */ + if (waker_bfqq) + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + + return bfqq; +} + /* * If needed, init rq, allocate bfq data structures associated with * rq, and increment reference counters in the destination bfq_queue @@ -6876,7 +6928,6 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; - bool new_queue = false; unsigned int a_idx = bfq_actuator_index(bfqd, bio); if (unlikely(!rq->elv.icq)) @@ -6893,53 +6944,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) return RQ_BFQQ(rq); bic = icq_to_bic(rq->elv.icq); - bfq_check_ioprio_change(bic, bio); - bfq_bic_update_cgroup(bic, bio); - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); - - if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. 
 */
-		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
-		    !bic->bfqq_data[a_idx].stably_merged) {
-			struct bfq_queue *waker_bfqq = bfq_waker_bfqq(bfqq);
-
-			/* Update bic before losing reference to bfqq */
-			if (bfq_bfqq_in_large_burst(bfqq))
-				bic->bfqq_data[a_idx].saved_in_large_burst =
-					true;
-
-			bfqq = bfq_split_bfqq(bic, bfqq);
-			if (!bfqq) {
-				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
-								 true, is_sync,
-								 NULL);
-				if (likely(bfqq != &bfqd->oom_bfqq)) {
-					bfq_bfqq_resume_state(bfqq, bfqd, bic,
-							      false);
-					bfqq->waker_bfqq = waker_bfqq;
-					bfqq->tentative_waker_bfqq = NULL;
-
-					/*
-					 * If the waker queue disappears, then
-					 * new_bfqq->waker_bfqq must be
-					 * reset. So insert new_bfqq into the
-					 * woken_list of the waker. See
-					 * bfq_check_waker for details.
-					 */
-					if (waker_bfqq)
-						hlist_add_head(
-							&bfqq->woken_list_node,
-							&bfqq->waker_bfqq->woken_list);
-				}
-			} else {
-				bfq_bfqq_resume_state(bfqq, bfqd, bic, true);
-			}
-		}
-	}
+	bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, a_idx, is_sync);

 	bfqq_request_allocated(bfqq);
 	bfqq->ref++;

From 7de98954687fe152c5f38afd719b3fdf9f34020a Mon Sep 17 00:00:00 2001
From: Kundan Kumar
Date: Wed, 11 Sep 2024 12:19:32 +0530
Subject: [PATCH 114/120] block: Added folio-ized version of bio_add_hw_page()

Add a new bio_add_hw_folio() function as a wrapper around
bio_add_hw_page(). This is a prep patch.

Signed-off-by: Kundan Kumar
Tested-by: Luis Chamberlain
Reviewed-by: Matthew Wilcox (Oracle)
Link: https://lore.kernel.org/r/20240911064935.5630-2-kundan.kumar@samsung.com
Signed-off-by: Jens Axboe
---
 block/bio.c | 23 +++++++++++++++++++++++
 block/blk.h |  4 ++++
 2 files changed, 27 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index c4053d49679a..f9d759315f4d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1016,6 +1016,29 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 	return len;
 }

+/**
+ * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints
+ * @q: the target queue
+ * @bio: destination bio
+ * @folio: folio to add
+ * @len: vec entry length
+ * @offset: vec entry offset in the folio
+ * @max_sectors: maximum number of sectors that can be added
+ * @same_page: return if the segment has been merged inside the same folio
+ *
+ * Add a folio to a bio while respecting the hardware max_sectors, max_segment
+ * and gap limitations.
+ */
+int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
+		     struct folio *folio, size_t len, size_t offset,
+		     unsigned int max_sectors, bool *same_page)
+{
+	if (len > UINT_MAX || offset > UINT_MAX)
+		return 0;
+	return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset,
+			       max_sectors, same_page);
+}
+
 /**
  * bio_add_pc_page - attempt to add page to passthrough bio
  * @q: the target queue
diff --git a/block/blk.h b/block/blk.h
index 32f4e9f630a3..86affb583eb6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -574,6 +574,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page);

+int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
+		struct folio *folio, size_t len, size_t offset,
+		unsigned int max_sectors, bool *same_page);
+
 /*
  * Clean up a page appropriately, where the page may be pinned, may have a
  * ref taken on it or neither.
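
For illustration beyond the patch itself, a minimal sketch of how a caller
might use the new wrapper; the caller function and its error handling are
hypothetical, only bio_add_hw_folio() comes from the patch above:

	/*
	 * Hypothetical caller: add one whole folio to a bio while honoring
	 * the queue's hardware limits, mirroring how bio_add_hw_page() is
	 * used for single pages. bio_add_hw_folio() returns the number of
	 * bytes added, or 0 on failure.
	 */
	static int add_one_folio(struct request_queue *q, struct bio *bio,
				 struct folio *folio)
	{
		bool same_page = false;
		size_t len = folio_size(folio);

		if (bio_add_hw_folio(q, bio, folio, len, 0,
				     queue_max_hw_sectors(q), &same_page) != len)
			return -EINVAL;
		return 0;
	}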
From ed9832bc08db29874600eb066b74918fe6fc2060 Mon Sep 17 00:00:00 2001
From: Kundan Kumar
Date: Wed, 11 Sep 2024 12:19:33 +0530
Subject: [PATCH 115/120] block: introduce folio awareness and add a bigger
 size from folio

Add a bigger size from a folio to the bio and skip merge processing for
its pages. Fetch the offset of the page within its folio. Depending on
the folio size and folio_offset, fetch a larger length; this length may
span multiple contiguous pages if the folio is multi-order. Using this
length, calculate the number of pages that will be added to the bio and
increment the loop counter to skip those pages. This avoids the
overhead of merging pages that belong to the same large-order folio.

Also folio-ize the functions bio_iov_add_page() and
bio_iov_add_zone_append_page().

Signed-off-by: Kundan Kumar
Tested-by: Luis Chamberlain
Reviewed-by: Christoph Hellwig
Reviewed-by: Matthew Wilcox (Oracle)
Link: https://lore.kernel.org/r/20240911064935.5630-3-kundan.kumar@samsung.com
Signed-off-by: Jens Axboe
---
 block/bio.c | 79 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index f9d759315f4d..d8b52bc54549 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -931,7 +931,8 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
 	if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
 		return false;

-	*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
+	*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
+		     PAGE_MASK));
 	if (!*same_page) {
 		if (IS_ENABLED(CONFIG_KMSAN))
 			return false;
@@ -1227,8 +1228,8 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }

-static int bio_iov_add_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int offset)
+static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
+			     size_t offset)
 {
 	bool same_page = false;

@@ -1237,30 +1238,61 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
 	if (bio->bi_vcnt > 0 &&
 	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
-				page, len, offset, &same_page)) {
+				folio_page(folio, 0), len, offset,
+				&same_page)) {
 		bio->bi_iter.bi_size += len;
 		if (same_page)
-			bio_release_page(bio, page);
+			bio_release_page(bio, folio_page(folio, 0));
 		return 0;
 	}
-	__bio_add_page(bio, page, len, offset);
+	bio_add_folio_nofail(bio, folio, len, offset);
 	return 0;
 }

-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int offset)
+static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
+					 size_t len, size_t offset)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	bool same_page = false;

-	if (bio_add_hw_page(q, bio, page, len, offset,
+	if (bio_add_hw_folio(q, bio, folio, len, offset,
 			queue_max_zone_append_sectors(q), &same_page) != len)
 		return -EINVAL;
 	if (same_page)
-		bio_release_page(bio, page);
+		bio_release_page(bio, folio_page(folio, 0));
 	return 0;
 }

+static unsigned int get_contig_folio_len(unsigned int *num_pages,
+					 struct page **pages, unsigned int i,
+					 struct folio *folio, size_t left,
+					 size_t offset)
+{
+	size_t bytes = left;
+	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
+	unsigned int j;
+
+	/*
+	 * We might COW a single page in the middle of
+	 * a large folio, so we have to check that all
+	 * pages belong to the same folio.
+	 */
+	bytes -= contig_sz;
+	for (j = i + 1; j < i + *num_pages; j++) {
+		size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+		if (page_folio(pages[j]) != folio ||
+		    pages[j] != pages[j - 1] + 1) {
+			break;
+		}
+		contig_sz += next;
+		bytes -= next;
+	}
+	*num_pages = j - i;
+
+	return contig_sz;
+}
+
 #define PAGE_PTRS_PER_BVEC	(sizeof(struct bio_vec) / sizeof(struct page *))

 /**
@@ -1280,9 +1312,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
-	ssize_t size, left;
-	unsigned len, i = 0;
-	size_t offset;
+	ssize_t size;
+	unsigned int num_pages, i = 0;
+	size_t offset, folio_offset, left, len;
 	int ret = 0;

 	/*
@@ -1322,17 +1354,28 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		goto out;
 	}

-	for (left = size, i = 0; left > 0; left -= len, i++) {
+	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
 		struct page *page = pages[i];
+		struct folio *folio = page_folio(page);
+
+		folio_offset = ((size_t)folio_page_idx(folio, page) <<
+				PAGE_SHIFT) + offset;
+
+		len = min(folio_size(folio) - folio_offset, left);
+
+		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+		if (num_pages > 1)
+			len = get_contig_folio_len(&num_pages, pages, i,
+						   folio, left, offset);

-		len = min_t(size_t, PAGE_SIZE - offset, left);
 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-			ret = bio_iov_add_zone_append_page(bio, page, len,
-					offset);
+			ret = bio_iov_add_zone_append_folio(bio, folio, len,
+					folio_offset);
 			if (ret)
 				break;
 		} else
-			bio_iov_add_page(bio, page, len, offset);
+			bio_iov_add_folio(bio, folio, len, folio_offset);

 		offset = 0;
 	}

From d3bfbfb1248498656cd25c51e41c1e31219bd0dd Mon Sep 17 00:00:00 2001
From: Kundan Kumar
Date: Wed, 11 Sep 2024 12:19:34 +0530
Subject: [PATCH 116/120] mm: release number of pages of a folio

Add a new function unpin_user_folio() to drop npages pin references on
a folio in one call. The check for the BIO_PAGE_PINNED flag is removed
as it is already checked in bio_release_pages().
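
As a rough usage sketch (not part of this patch; the buffer address
'uaddr' and the page count are made up), a caller that pinned pages via
pin_user_pages_fast() can now drop all pins that landed in one folio
with a single call:

	struct page *pages[8];
	long nr = pin_user_pages_fast(uaddr, 8, FOLL_WRITE, pages);

	if (nr > 0) {
		struct folio *folio = page_folio(pages[0]);
		long same = 1;

		/* count how many pinned pages share the leading folio */
		while (same < nr && page_folio(pages[same]) == folio)
			same++;

		/* one call instead of 'same' unpin_user_page() calls */
		unpin_user_folio(folio, same);
		/* pages[same..nr-1], if any, still need unpinning */
	}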
Signed-off-by: Kundan Kumar
Tested-by: Luis Chamberlain
Reviewed-by: Matthew Wilcox (Oracle)
Link: https://lore.kernel.org/r/20240911064935.5630-4-kundan.kumar@samsung.com
Signed-off-by: Jens Axboe
---
 include/linux/mm.h |  1 +
 mm/gup.c           | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..2fd88cd5997d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1597,6 +1597,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
 				      bool make_dirty);
 void unpin_user_pages(struct page **pages, unsigned long npages);
+void unpin_user_folio(struct folio *folio, unsigned long npages);
 void unpin_folios(struct folio **folios, unsigned long nfolios);

 static inline bool is_cow_mapping(vm_flags_t flags)
diff --git a/mm/gup.c b/mm/gup.c
index 54d0dc3831fb..02c46ae33028 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -415,6 +415,19 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
 }
 EXPORT_SYMBOL(unpin_user_pages);

+/**
+ * unpin_user_folio() - release pages of a folio
+ * @folio: pointer to the folio to be released
+ * @npages: number of pages of the same folio
+ *
+ * Release npages of the folio
+ */
+void unpin_user_folio(struct folio *folio, unsigned long npages)
+{
+	gup_put_folio(folio, npages, FOLL_PIN);
+}
+EXPORT_SYMBOL(unpin_user_folio);
+
 /**
  * unpin_folios() - release an array of gup-pinned folios.
  * @folios:  array of folios to be marked dirty and released.

From eb1d46fcd5d672c9da84925ec38f1aca35d40940 Mon Sep 17 00:00:00 2001
From: Kundan Kumar
Date: Wed, 11 Sep 2024 12:19:35 +0530
Subject: [PATCH 117/120] block: unpin user pages belonging to a folio at once

Use the newly added mm function unpin_user_folio() to put refs by the
npages count.
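
For example, the nr_pages arithmetic used below can be sanity-checked
with illustrative numbers (assuming PAGE_SIZE == 4096): a folio segment
with fi.offset = 4000 and fi.length = 300 ends at byte 4299, so

	nr_pages = (4000 + 300 - 1) / 4096 - 4000 / 4096 + 1
		 = 1 - 0 + 1 = 2

and both per-page pins are dropped by a single
unpin_user_folio(fi.folio, 2) call.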
Signed-off-by: Kundan Kumar Tested-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240911064935.5630-5-kundan.kumar@samsung.com Signed-off-by: Jens Axboe --- block/bio.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/block/bio.c b/block/bio.c index d8b52bc54549..ac4d77c88932 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1190,7 +1190,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) struct folio_iter fi; bio_for_each_folio_all(fi, bio) { - struct page *page; size_t nr_pages; if (mark_dirty) { @@ -1198,12 +1197,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } - page = folio_page(fi.folio, fi.offset / PAGE_SIZE); nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; - do { - bio_release_page(bio, page++); - } while (--nr_pages != 0); + unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1241,8 +1237,8 @@ static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len, folio_page(folio, 0), len, offset, &same_page)) { bio->bi_iter.bi_size += len; - if (same_page) - bio_release_page(bio, folio_page(folio, 0)); + if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_folio(folio, 1); return 0; } bio_add_folio_nofail(bio, folio, len, offset); @@ -1258,8 +1254,8 @@ static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio, if (bio_add_hw_folio(q, bio, folio, len, offset, queue_max_zone_append_sectors(q), &same_page) != len) return -EINVAL; - if (same_page) - bio_release_page(bio, folio_page(folio, 0)); + if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_folio(folio, 1); return 0; } From cc089684664ebd379f1451aab65eb50f4008b381 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Sep 2024 22:41:24 +0100 Subject: [PATCH 118/120] blk_iocost: make read-only static array vrate_adj_pct const The static array vrate_adj_pct is read-only, so make it const as well. Signed-off-by: Colin Ian King Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240911214124.197403-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5a6098a3db57..9dc9323f84ac 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -648,7 +648,7 @@ static const struct ioc_params autop[] = { * vrate adjust percentages indexed by ioc->busy_level. We adjust up on * vtime credit shortage and down on device saturation. */ -static u32 vrate_adj_pct[] = +static const u32 vrate_adj_pct[] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, From 26e197b7f9240a4ac301dd0ad520c0c697c2ea7d Mon Sep 17 00:00:00 2001 From: Riyan Dhiman Date: Wed, 11 Sep 2024 18:59:54 +0530 Subject: [PATCH 119/120] block: fix potential invalid pointer dereference in blk_add_partition The blk_add_partition() function initially used a single if-condition (IS_ERR(part)) to check for errors when adding a partition. This was modified to handle the specific case of -ENXIO separately, allowing the function to proceed without logging the error in this case. However, this change unintentionally left a path where md_autodetect_dev() could be called without confirming that part is a valid pointer. 
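
Roughly, the hazardous shape was the following (simplified sketch of the
old control flow, not the exact code; the closing comment marks the
problem):

	part = add_partition(disk, p, from, size, state->parts[p].flags,
			     &state->parts[p].info);
	if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
		printk(KERN_ERR " %s: p%d could not be added: %pe\n",
		       disk->disk_name, p, part);
		return true;
	}
	/* part may still be ERR_PTR(-ENXIO) here, yet later code uses it */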
This commit separates the error handling logic by splitting the initial
if-condition, improving code readability and handling specific error
scenarios explicitly. The function now distinguishes the general error
case from -ENXIO without altering the existing behavior of
md_autodetect_dev() calls.

Fixes: b72053072c0b ("block: allow partitions on host aware zone devices")
Signed-off-by: Riyan Dhiman
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20240911132954.5874-1-riyandhiman14@gmail.com
Signed-off-by: Jens Axboe
---
 block/partitions/core.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/partitions/core.c b/block/partitions/core.c
index ab76e64f0f6c..5bd7a603092e 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -555,9 +555,11 @@ static bool blk_add_partition(struct gendisk *disk,
 	part = add_partition(disk, p, from, size, state->parts[p].flags,
 			     &state->parts[p].info);
-	if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
-		printk(KERN_ERR " %s: p%d could not be added: %pe\n",
-		       disk->disk_name, p, part);
+	if (IS_ERR(part)) {
+		if (PTR_ERR(part) != -ENXIO) {
+			printk(KERN_ERR " %s: p%d could not be added: %pe\n",
+			       disk->disk_name, p, part);
+		}
 		return true;
 	}

From 83bdfcbdbe5d901c5fa432decf12e1725a840a56 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 11 Sep 2024 10:39:59 -0700
Subject: [PATCH 120/120] nvme-pci: qdepth 1 quirk

Another device has been reported to be unreliable if we have more than
one outstanding command. In this new case, data corruption may occur.
Since we now have two devices needing this quirky behavior, make a
generic quirk flag. The same Apple quirk is clearly not "temporary", so
update the comment while moving it.

Link: https://lore.kernel.org/linux-nvme/191d810a4e3.fcc6066c765804.973611676137075390@collabora.com/
Reported-by: Robert Beckett
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/nvme.h |  5 +++++
 drivers/nvme/host/pci.c  | 18 +++++++++---------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index af22368800e7..69a3fb6de1e9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -90,6 +90,11 @@ enum nvme_quirks {
 	 */
 	NVME_QUIRK_NO_DEEPEST_PS		= (1 << 5),

+	/*
+	 * Problems seen with concurrent commands
+	 */
+	NVME_QUIRK_QDEPTH_ONE			= (1 << 6),
+
 	/*
 	 * Set MEDIUM priority on SQ creation
 	 */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6cd9395ba9ec..0ec0f3e5291c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2557,15 +2557,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 	else
 		dev->io_sqes = NVME_NVM_IOSQES;

-	/*
-	 * Temporary fix for the Apple controller found in the MacBook8,1 and
-	 * some MacBook7,1 to avoid controller resets and data loss.
- */ - if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) { dev->q_depth = 2; - dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " - "set queue depth=%u to work around controller resets\n", - dev->q_depth); } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && (pdev->device == 0xa821 || pdev->device == 0xa822) && NVME_CAP_MQES(dev->ctrl.cap) == 0) { @@ -3425,6 +3418,8 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ + .driver_data = NVME_QUIRK_QDEPTH_ONE }, { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_BOGUS_NID, }, @@ -3559,7 +3554,12 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), - .driver_data = NVME_QUIRK_SINGLE_VECTOR }, + /* + * Fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. + */ + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_QDEPTH_ONE }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), .driver_data = NVME_QUIRK_SINGLE_VECTOR |