From fbdee71bb5d8d054e1bdb5af4c540f2cb86fe296 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Jan 2022 08:16:47 +0100 Subject: [PATCH 001/127] block: deprecate autoloading based on dev_t Make the legacy dev_t based autoloading optional and add a deprecation warning. This kind of autoloading has ceased to be useful about 20 years ago. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220104071647.164918-1-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 12 ++++++++++++ block/bdev.c | 9 ++++++--- block/genhd.c | 6 ++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index d5d4197b7ed2..205f8d01c695 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,6 +26,18 @@ menuconfig BLOCK if BLOCK +config BLOCK_LEGACY_AUTOLOAD + bool "Legacy autoloading support" + help + Enable loading modules and creating block device instances based on + accesses through their device special file. This is a historic Linux + feature and makes no sense in a udev world where device files are + created on demand. + + Say N here unless booting or other functionality broke without it, in + which case you should also send a report to your distribution and + linux-block@vger.kernel.org. + config BLK_RQ_ALLOC_TIME bool diff --git a/block/bdev.c b/block/bdev.c index 102837a37051..c68772644566 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -733,12 +733,15 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct inode *inode; inode = ilookup(blockdev_superblock, dev); - if (!inode) { + if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { blk_request_module(dev); inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; + if (inode) + pr_warn_ratelimited( +"block device autoloading is deprecated. It will be removed in Linux 5.19\n"); } + if (!inode) + return NULL; /* switch from the inode reference to a device mode one: */ bdev = &BDEV_I(inode)->bdev; diff --git a/block/genhd.c b/block/genhd.c index 626c8406f21a..6ae990ff0266 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -185,7 +185,9 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void (*probe)(dev_t devt); +#endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); static DEFINE_SPINLOCK(major_names_spinlock); @@ -275,7 +277,9 @@ int __register_blkdev(unsigned int major, const char *name, } p->major = major; +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; +#endif strlcpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); @@ -679,6 +683,7 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); @@ -698,6 +703,7 @@ void blk_request_module(dev_t devt) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); } +#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ /* * print a full list of all partitions - intended for places where the root From 926597ffce0e3e2f785475df18e1636194209910 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:39:11 +0100 Subject: [PATCH 002/127] block: move disk_{block,unblock,flush}_events to blk.h No need to have these declarations in a public header. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20220124093913.742411-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 3 +++ include/linux/genhd.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk.h b/block/blk.h index 8bd43b3ad33d..2cba50d7e6cb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -445,6 +445,9 @@ int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); +void disk_block_events(struct gendisk *disk); +void disk_unblock_events(struct gendisk *disk); +void disk_flush_events(struct gendisk *disk, unsigned int mask); extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6906a45bc761..504f9a6674ac 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -185,9 +185,6 @@ static inline int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } -extern void disk_block_events(struct gendisk *disk); -extern void disk_unblock_events(struct gendisk *disk); -extern void disk_flush_events(struct gendisk *disk, unsigned int mask); bool set_capacity_and_notify(struct gendisk *disk, sector_t size); bool disk_force_media_change(struct gendisk *disk, unsigned int events); From e7243285c0fc87054990fcde630583586ff8ed5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:39:12 +0100 Subject: [PATCH 003/127] block: move blk_drop_partitions to blk.h No need to have this declaration in a public header. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220124093913.742411-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 + include/linux/genhd.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk.h b/block/blk.h index 2cba50d7e6cb..800c5ae387a0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -426,6 +426,7 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); +void blk_drop_partitions(struct gendisk *disk); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 504f9a6674ac..aa4bd985dbe5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -219,7 +219,6 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) } int bdev_disk_changed(struct gendisk *disk, bool invalidate); -void blk_drop_partitions(struct gendisk *disk); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); From 322cbb50de711814c42fb088f6d31901502c711a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:39:13 +0100 Subject: [PATCH 004/127] block: remove genhd.h There is no good reason to keep genhd.h separate from the main blkdev.h header that includes it. So fold the contents of genhd.h into blkdev.h and remove genhd.h entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20220124093913.742411-4-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/capability.rst | 2 +- arch/m68k/atari/stdma.c | 1 - arch/m68k/bvme6000/config.c | 1 - arch/m68k/emu/nfblock.c | 1 - arch/m68k/kernel/setup_mm.c | 1 - arch/m68k/mvme147/config.c | 1 - arch/m68k/mvme16x/config.c | 1 - block/blk-cgroup.c | 1 - block/disk-events.c | 2 +- block/genhd.c | 1 - block/holder.c | 2 +- block/partitions/check.h | 1 - block/partitions/core.c | 1 - block/partitions/efi.h | 1 - block/partitions/ldm.h | 1 - block/sed-opal.c | 2 +- drivers/base/class.c | 2 +- drivers/base/core.c | 2 +- drivers/base/devtmpfs.c | 2 +- drivers/block/aoe/aoeblk.c | 1 - drivers/block/aoe/aoecmd.c | 1 - drivers/block/drbd/drbd_int.h | 1 - drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/block/mtip32xx/mtip32xx.h | 1 - drivers/block/rnbd/rnbd-srv-sysfs.c | 1 - drivers/block/sunvdc.c | 1 - drivers/block/zram/zram_drv.c | 1 - drivers/cdrom/gdrom.c | 1 - drivers/char/random.c | 2 +- drivers/md/bcache/super.c | 1 - drivers/md/dm-core.h | 1 - drivers/mtd/mtdswap.c | 2 +- drivers/mtd/nand/raw/sharpsl.c | 1 - drivers/nvdimm/blk.c | 1 - drivers/nvdimm/btt.c | 1 - drivers/nvdimm/btt_devs.c | 1 - drivers/nvdimm/bus.c | 1 - drivers/nvdimm/pfn_devs.c | 1 - drivers/s390/block/dasd_int.h | 1 - drivers/s390/block/scm_blk.c | 1 - drivers/s390/block/scm_blk.h | 1 - drivers/scsi/scsi_debug.c | 1 - drivers/scsi/scsicam.c | 1 - drivers/scsi/sd.c | 1 - drivers/scsi/sr.h | 1 - drivers/target/target_core_iblock.c | 1 - drivers/target/target_core_pscsi.c | 1 - fs/btrfs/check-integrity.c | 1 - fs/dax.c | 1 - fs/gfs2/sys.c | 2 +- fs/hfs/mdb.c | 2 +- fs/hfsplus/wrapper.c | 1 - fs/ksmbd/vfs.c | 1 - fs/nfs/blocklayout/rpc_pipefs.c | 1 - fs/nfsd/blocklayout.c | 1 - include/linux/blkdev.h | 273 +++++++++++++++++++++++++- include/linux/genhd.h | 287 ---------------------------- include/linux/part_stat.h | 2 +- init/do_mounts.c | 1 - kernel/power/hibernate.c | 1 - kernel/power/swap.c | 1 - security/integrity/ima/ima_policy.c | 1 - 62 files changed, 282 insertions(+), 350 deletions(-) delete mode 100644 include/linux/genhd.h diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst index 160a5148b915..2ae7f064736a 100644 --- a/Documentation/block/capability.rst +++ b/Documentation/block/capability.rst @@ -7,4 +7,4 @@ This file documents the sysfs file ``block//capability``. ``capability`` is a bitfield, printed in hexadecimal, indicating which capabilities a specific block device supports: -.. kernel-doc:: include/linux/genhd.h +.. 
kernel-doc:: include/linux/blkdev.h diff --git a/arch/m68k/atari/stdma.c b/arch/m68k/atari/stdma.c index ba65f942d0c7..ce6818eff75e 100644 --- a/arch/m68k/atari/stdma.c +++ b/arch/m68k/atari/stdma.c @@ -30,7 +30,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c index 0c6feafbbd11..0fe0f3e888fb 100644 --- a/arch/m68k/bvme6000/config.c +++ b/arch/m68k/bvme6000/config.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 9c57b245dc12..267b02cc5655 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 49e573b94326..ee268055bdce 100644 --- a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index dfd6202fd403..db1430dc411f 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index b4422c2dfbbf..45a07ab3123a 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 650f7e27989f..671debbae941 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/block/disk-events.c b/block/disk-events.c index 8d5496e7592a..aee25a7e1ab7 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -4,7 +4,7 @@ */ #include #include -#include +#include #include "blk.h" struct disk_events { diff --git a/block/genhd.c b/block/genhd.c index 6ae990ff0266..9589d1d59afa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/block/holder.c b/block/holder.c index 27cddce1b446..8d750281a1cd 100644 --- a/block/holder.c +++ b/block/holder.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include struct bd_holder_disk { diff --git a/block/partitions/check.h b/block/partitions/check.h index d5b28e309d64..4ffa2359b1a3 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include -#include #include "../blk.h" /* diff --git a/block/partitions/core.c b/block/partitions/core.c index c2a1635922b1..2ef8dfa1e5c8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 8cc2b88d0aa8..84b9f36b9e47 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index 8693704dcf5e..0a747a0c782d 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -14,7 +14,6 @@ #include #include -#include #include #include #include diff --git a/block/sed-opal.c b/block/sed-opal.c index daafadbb88ca..9700197000f2 100644 --- a/block/sed-opal.c +++ 
b/block/sed-opal.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/base/class.c b/drivers/base/class.c index 7476f393df97..8feb85e186e3 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "base.h" diff --git a/drivers/base/core.c b/drivers/base/core.c index 7bb957b11861..3d6430eb0c6a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index f41063ac1aee..db5a03a0618e 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 52484bcdedb9..8a91fcac6f82 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 6af111f568e4..cc11f89a0928 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f27d5b0f9a0b..acb1ad3c0603 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index e6005c232328..cba956881d55 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 88f4206310e4..6816beb45352 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -15,7 +15,6 @@ #include #include #include -#include /* Offset of Subsystem Device ID in pci confoguration space */ #define PCI_SUBSYSTEM_DEVICEID 0x2E diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 4db98e0e76f0..feaa76c5a342 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 146d85d80e0e..dd0a1a6fed29 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index cb253d80d72b..342dbcb3f220 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index faead41709bc..8e78b37d0f6a 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/random.c b/drivers/char/random.c index 68613f0b6887..f206c87c6202 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -330,7 +330,7 @@ #include #include #include -#include +#include #include #include 
#include diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 140f35dc0c45..c31a62b963f0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index b855fef4f38a..72d18c3fbf1f 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index e86b04bc1d6b..dc7f1532a37f 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/sharpsl.c b/drivers/mtd/nand/raw/sharpsl.c index 5612ee628425..52ce5162538a 100644 --- a/drivers/mtd/nand/raw/sharpsl.c +++ b/drivers/mtd/nand/raw/sharpsl.c @@ -6,7 +6,6 @@ * Based on Sharp's NAND driver sharp_sl.c */ -#include #include #include #include diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 228c33b8d1d6..c1db43524d75 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index da3f007a1211..cbd994f7f1fe 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 8b52e5144f08..e5a58520d398 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -4,7 +4,6 @@ */ #include #include -#include #include #include #include diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 9dc7f3edd42b..5bbe31b08581 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 58eda16f5c53..c31e184bfa45 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 8b458010f88a..3b7af00a7825 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 61ecdcb2cc6a..2a9c0ddcade5 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index a05a4297cfae..af82b3214774 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 2104973a35cd..911cc72dd7ac 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 0ffdb8f2995f..acdc0aceca5e 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 
62eb9921cc94..2d648d27bfd7 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h index 339c624e04d8..1609f02ed29a 100644 --- a/drivers/scsi/sr.h +++ b/drivers/scsi/sr.h @@ -18,7 +18,6 @@ #ifndef _SR_H #define _SR_H -#include #include #include diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index bf8ae4825a06..6045678365a5 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 807d06ecadee..0fae71ac5cc8 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7e9f90fa0388..abac86a75840 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -78,7 +78,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dax.c b/fs/dax.c index cd03485867a7..ab0978739eaa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a6002b2d146d..d87ea98cf535 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "gfs2.h" #include "incore.h" diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 5beb82652435..8082eb01127c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 51ae6f1eb4a5..4688cc7b3692 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "hfsplus_fs.h" diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 19d36393974c..9cebb6ba555b 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index ef9db135c649..6c977288cc28 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -27,7 +27,6 @@ */ #include -#include #include #include "blocklayout.h" diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e5c0982a381d..b6d01d51a746 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -4,7 +4,6 @@ */ #include #include -#include #include #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f35aea98bc35..99a4384bb8a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions Copyright (C) 1992 Drew Eckhardt + */ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H -#include -#include +#include +#include +#include #include #include #include @@ -12,11 +16,15 @@ #include #include #include +#include #include #include #include +#include #include #include +#include +#include struct module; struct request_queue; @@ -33,6 +41,10 @@ struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; +extern const struct device_type disk_type; +extern struct device_type part_type; +extern struct class block_class; + /* Must be consistent with 
blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -45,6 +57,144 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 +#define DISK_MAX_PARTS 256 +#define DISK_NAME_LEN 32 + +#define PARTITION_META_INFO_VOLNAMELTH 64 +/* + * Enough for the string representation of any kind of UUID plus NULL. + * EFI UUID is 36 characters. MSDOS UUID is 11 characters. + */ +#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) + +struct partition_meta_info { + char uuid[PARTITION_META_INFO_UUIDLTH]; + u8 volname[PARTITION_META_INFO_VOLNAMELTH]; +}; + +/** + * DOC: genhd capability flags + * + * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to + * removable media. When set, the device remains present even when media is not + * inserted. Shall not be set for devices which are removed entirely when the + * media is removed. + * + * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, + * doesn't appear in sysfs, and can't be opened from userspace or using + * blkdev_get*. Used for the underlying components of multipath devices. + * + * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not + * scan for partitions from add_disk, and users can't add partitions manually. + * + */ +enum { + GENHD_FL_REMOVABLE = 1 << 0, + GENHD_FL_HIDDEN = 1 << 1, + GENHD_FL_NO_PART = 1 << 2, +}; + +enum { + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ +}; + +enum { + /* Poll even if events_poll_msecs is unset */ + DISK_EVENT_FLAG_POLL = 1 << 0, + /* Forward events to udev */ + DISK_EVENT_FLAG_UEVENT = 1 << 1, + /* Block event polling when open for exclusive write */ + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, +}; + +struct disk_events; +struct badblocks; + +struct blk_integrity { + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; +}; + +struct gendisk { + /* + * major/first_minor/minors should not be set by any new driver, the + * block core will take care of allocating them automatically. + */ + int major; + int first_minor; + int minors; + + char disk_name[DISK_NAME_LEN]; /* name of major driver */ + + unsigned short events; /* supported events */ + unsigned short event_flags; /* flags related to event processing */ + + struct xarray part_tbl; + struct block_device *part0; + + const struct block_device_operations *fops; + struct request_queue *queue; + void *private_data; + + int flags; + unsigned long state; +#define GD_NEED_PART_SCAN 0 +#define GD_READ_ONLY 1 +#define GD_DEAD 2 +#define GD_NATIVE_CAPACITY 3 + + struct mutex open_mutex; /* open/close mutex */ + unsigned open_partitions; /* number of open partitions */ + + struct backing_dev_info *bdi; + struct kobject *slave_dir; +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + struct list_head slave_bdevs; +#endif + struct timer_rand_state *random; + atomic_t sync_io; /* RAID */ + struct disk_events *ev; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct kobject integrity_kobj; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#if IS_ENABLED(CONFIG_CDROM) + struct cdrom_device_info *cdi; +#endif + int node_id; + struct badblocks *bb; + struct lockdep_map lockdep_map; + u64 diskseq; +}; + +static inline bool disk_live(struct gendisk *disk) +{ + return !inode_unhashed(disk->part0->bd_inode); +} + +/* + * The gendisk is refcounted by the part0 block_device, and the bd_device + * therein is also used for device model presentation in sysfs. 
+ */ +#define dev_to_disk(device) \ + (dev_to_bdev(device)->bd_disk) +#define disk_to_dev(disk) \ + (&((disk)->part0->bd_device)) + +#if IS_REACHABLE(CONFIG_CDROM) +#define disk_to_cdi(disk) ((disk)->cdi) +#else +#define disk_to_cdi(disk) NULL +#endif + +static inline dev_t disk_devt(struct gendisk *disk) +{ + return MKDEV(disk->major, disk->first_minor); +} + static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) @@ -596,6 +746,118 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); +static inline int __must_check add_disk(struct gendisk *disk) +{ + return device_add_disk(NULL, disk, NULL); +} +void del_gendisk(struct gendisk *gp); +void invalidate_disk(struct gendisk *disk); +void set_disk_ro(struct gendisk *disk, bool read_only); +void disk_uevent(struct gendisk *disk, enum kobject_action action); + +static inline int get_disk_ro(struct gendisk *disk) +{ + return disk->part0->bd_read_only || + test_bit(GD_READ_ONLY, &disk->state); +} + +static inline int bdev_read_only(struct block_device *bdev) +{ + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); +} + +bool set_capacity_and_notify(struct gendisk *disk, sector_t size); +bool disk_force_media_change(struct gendisk *disk, unsigned int events); + +void add_disk_randomness(struct gendisk *disk) __latent_entropy; +void rand_initialize_disk(struct gendisk *disk); + +static inline sector_t get_start_sect(struct block_device *bdev) +{ + return bdev->bd_start_sect; +} + +static inline sector_t bdev_nr_sectors(struct block_device *bdev) +{ + return bdev->bd_nr_sectors; +} + +static inline loff_t bdev_nr_bytes(struct block_device *bdev) +{ + return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; +} + +static inline sector_t get_capacity(struct gendisk *disk) +{ + return bdev_nr_sectors(disk->part0); +} + +static inline u64 sb_bdev_nr_blocks(struct super_block *sb) +{ + return bdev_nr_sectors(sb->s_bdev) >> + (sb->s_blocksize_bits - SECTOR_SHIFT); +} + +int bdev_disk_changed(struct gendisk *disk, bool invalidate); + +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); +void put_disk(struct gendisk *disk); +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); + +/** + * blk_alloc_disk - allocate a gendisk structure + * @node_id: numa node to allocate on + * + * Allocate and pre-initialize a gendisk structure for use with BIO based + * drivers. 
+ * + * Context: can sleep + */ +#define blk_alloc_disk(node_id) \ +({ \ + static struct lock_class_key __key; \ + \ + __blk_alloc_disk(node_id, &__key); \ +}) +void blk_cleanup_disk(struct gendisk *disk); + +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)); +#define register_blkdev(major, name) \ + __register_blkdev(major, name, NULL) +void unregister_blkdev(unsigned int major, const char *name); + +bool bdev_check_media_change(struct block_device *bdev); +int __invalidate_device(struct block_device *bdev, bool kill_dirty); +void set_capacity(struct gendisk *disk, sector_t size); + +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); +int bd_register_pending_holders(struct gendisk *disk); +#else +static inline int bd_link_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + return 0; +} +static inline void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ +} +static inline int bd_register_pending_holders(struct gendisk *disk) +{ + return 0; +} +#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ + +dev_t part_devt(struct gendisk *disk, u8 partno); +void inc_diskseq(struct gendisk *disk); +dev_t blk_lookup_devt(const char *name, int partno); +void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); @@ -1311,6 +1573,7 @@ void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); +void printk_all_partitions(void); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1326,7 +1589,11 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -#endif +static inline void printk_all_partitions(void) +{ +} +#endif /* CONFIG_BLOCK */ + int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); diff --git a/include/linux/genhd.h b/include/linux/genhd.h deleted file mode 100644 index aa4bd985dbe5..000000000000 --- a/include/linux/genhd.h +++ /dev/null @@ -1,287 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_GENHD_H -#define _LINUX_GENHD_H - -/* - * genhd.h Copyright (C) 1992 Drew Eckhardt - * Generic hard disk header file by - * Drew Eckhardt - * - * - */ - -#include -#include -#include -#include -#include -#include - -extern const struct device_type disk_type; -extern struct device_type part_type; -extern struct class block_class; - -#define DISK_MAX_PARTS 256 -#define DISK_NAME_LEN 32 - -#define PARTITION_META_INFO_VOLNAMELTH 64 -/* - * Enough for the string representation of any kind of UUID plus NULL. - * EFI UUID is 36 characters. MSDOS UUID is 11 characters. - */ -#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) - -struct partition_meta_info { - char uuid[PARTITION_META_INFO_UUIDLTH]; - u8 volname[PARTITION_META_INFO_VOLNAMELTH]; -}; - -/** - * DOC: genhd capability flags - * - * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to - * removable media. When set, the device remains present even when media is not - * inserted. Shall not be set for devices which are removed entirely when the - * media is removed. 
- * - * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, - * doesn't appear in sysfs, and can't be opened from userspace or using - * blkdev_get*. Used for the underlying components of multipath devices. - * - * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not - * scan for partitions from add_disk, and users can't add partitions manually. - * - */ -enum { - GENHD_FL_REMOVABLE = 1 << 0, - GENHD_FL_HIDDEN = 1 << 1, - GENHD_FL_NO_PART = 1 << 2, -}; - -enum { - DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ - DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ -}; - -enum { - /* Poll even if events_poll_msecs is unset */ - DISK_EVENT_FLAG_POLL = 1 << 0, - /* Forward events to udev */ - DISK_EVENT_FLAG_UEVENT = 1 << 1, - /* Block event polling when open for exclusive write */ - DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, -}; - -struct disk_events; -struct badblocks; - -struct blk_integrity { - const struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; -}; - -struct gendisk { - /* - * major/first_minor/minors should not be set by any new driver, the - * block core will take care of allocating them automatically. - */ - int major; - int first_minor; - int minors; - - char disk_name[DISK_NAME_LEN]; /* name of major driver */ - - unsigned short events; /* supported events */ - unsigned short event_flags; /* flags related to event processing */ - - struct xarray part_tbl; - struct block_device *part0; - - const struct block_device_operations *fops; - struct request_queue *queue; - void *private_data; - - int flags; - unsigned long state; -#define GD_NEED_PART_SCAN 0 -#define GD_READ_ONLY 1 -#define GD_DEAD 2 -#define GD_NATIVE_CAPACITY 3 - - struct mutex open_mutex; /* open/close mutex */ - unsigned open_partitions; /* number of open partitions */ - - struct backing_dev_info *bdi; - struct kobject *slave_dir; -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED - struct list_head slave_bdevs; -#endif - struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ - struct disk_events *ev; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct kobject integrity_kobj; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ -#if IS_ENABLED(CONFIG_CDROM) - struct cdrom_device_info *cdi; -#endif - int node_id; - struct badblocks *bb; - struct lockdep_map lockdep_map; - u64 diskseq; -}; - -static inline bool disk_live(struct gendisk *disk) -{ - return !inode_unhashed(disk->part0->bd_inode); -} - -/* - * The gendisk is refcounted by the part0 block_device, and the bd_device - * therein is also used for device model presentation in sysfs. 
- */ -#define dev_to_disk(device) \ - (dev_to_bdev(device)->bd_disk) -#define disk_to_dev(disk) \ - (&((disk)->part0->bd_device)) - -#if IS_REACHABLE(CONFIG_CDROM) -#define disk_to_cdi(disk) ((disk)->cdi) -#else -#define disk_to_cdi(disk) NULL -#endif - -static inline dev_t disk_devt(struct gendisk *disk) -{ - return MKDEV(disk->major, disk->first_minor); -} - -void disk_uevent(struct gendisk *disk, enum kobject_action action); - -/* block/genhd.c */ -int __must_check device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups); -static inline int __must_check add_disk(struct gendisk *disk) -{ - return device_add_disk(NULL, disk, NULL); -} -extern void del_gendisk(struct gendisk *gp); - -void invalidate_disk(struct gendisk *disk); - -void set_disk_ro(struct gendisk *disk, bool read_only); - -static inline int get_disk_ro(struct gendisk *disk) -{ - return disk->part0->bd_read_only || - test_bit(GD_READ_ONLY, &disk->state); -} - -static inline int bdev_read_only(struct block_device *bdev) -{ - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); -} - -bool set_capacity_and_notify(struct gendisk *disk, sector_t size); -bool disk_force_media_change(struct gendisk *disk, unsigned int events); - -/* drivers/char/random.c */ -extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; -extern void rand_initialize_disk(struct gendisk *disk); - -static inline sector_t get_start_sect(struct block_device *bdev) -{ - return bdev->bd_start_sect; -} - -static inline sector_t bdev_nr_sectors(struct block_device *bdev) -{ - return bdev->bd_nr_sectors; -} - -static inline loff_t bdev_nr_bytes(struct block_device *bdev) -{ - return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; -} - -static inline sector_t get_capacity(struct gendisk *disk) -{ - return bdev_nr_sectors(disk->part0); -} - -static inline u64 sb_bdev_nr_blocks(struct super_block *sb) -{ - return bdev_nr_sectors(sb->s_bdev) >> - (sb->s_blocksize_bits - SECTOR_SHIFT); -} - -int bdev_disk_changed(struct gendisk *disk, bool invalidate); - -struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, - struct lock_class_key *lkclass); -extern void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); - -/** - * blk_alloc_disk - allocate a gendisk structure - * @node_id: numa node to allocate on - * - * Allocate and pre-initialize a gendisk structure for use with BIO based - * drivers. 
- *
- * Context: can sleep
- */
-#define blk_alloc_disk(node_id)					\
-({									\
-	static struct lock_class_key __key;				\
-									\
-	__blk_alloc_disk(node_id, &__key);				\
-})
-void blk_cleanup_disk(struct gendisk *disk);
-
-int __register_blkdev(unsigned int major, const char *name,
-		void (*probe)(dev_t devt));
-#define register_blkdev(major, name) \
-	__register_blkdev(major, name, NULL)
-void unregister_blkdev(unsigned int major, const char *name);
-
-bool bdev_check_media_change(struct block_device *bdev);
-int __invalidate_device(struct block_device *bdev, bool kill_dirty);
-void set_capacity(struct gendisk *disk, sector_t size);
-
-#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
-int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
-void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
-int bd_register_pending_holders(struct gendisk *disk);
-#else
-static inline int bd_link_disk_holder(struct block_device *bdev,
-				      struct gendisk *disk)
-{
-	return 0;
-}
-static inline void bd_unlink_disk_holder(struct block_device *bdev,
-					 struct gendisk *disk)
-{
-}
-static inline int bd_register_pending_holders(struct gendisk *disk)
-{
-	return 0;
-}
-#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */
-
-dev_t part_devt(struct gendisk *disk, u8 partno);
-void inc_diskseq(struct gendisk *disk);
-dev_t blk_lookup_devt(const char *name, int partno);
-void blk_request_module(dev_t devt);
-#ifdef CONFIG_BLOCK
-void printk_all_partitions(void);
-#else /* CONFIG_BLOCK */
-static inline void printk_all_partitions(void)
-{
-}
-#endif /* CONFIG_BLOCK */
-
-#endif /* _LINUX_GENHD_H */
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 6f7949b2fd8d..abeba356bc3f 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_PART_STAT_H
 #define _LINUX_PART_STAT_H
 
-#include
+#include
 #include
 
 struct disk_stats {
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 762b534978d9..7058e14ad5f7 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e6af502c2fd7..a94044197c4a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,7 +28,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ad10359030a4..f1bd03129575 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -16,7 +16,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 320ca80aacab..02882526ba9a 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -16,7 +16,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 

From d5f68a42da7a4516e7503c281a54a58727f07dc3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 24 Jan 2022 10:10:49 +0100
Subject: [PATCH 005/127] fs: remove mpage_alloc

Open code mpage_alloc in its two callers and simplify the results
because of the context:

 - __mpage_writepage always passes GFP_NOFS and can thus always sleep,
   so it will never get a NULL return from bio_alloc.
 - do_mpage_readpage can only get a non-sleeping context for readahead,
   which never sets PF_MEMALLOC and thus does not need the retry loop
   either.

Both cases will never have __GFP_HIGH set.
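As a sketch of the resulting calling convention (illustrative only, not
part of the patch; it assumes the two-argument bio_alloc(gfp_mask,
nr_vecs) API used throughout this series, and the function name is made
up, as the patch open codes this directly in the two callers):

	/*
	 * With a mask that allows direct reclaim (GFP_NOFS here),
	 * bio_alloc() is backed by a mempool and cannot return NULL,
	 * so the caller can initialize the bio unconditionally instead
	 * of retrying with a smaller nr_vecs.
	 */
	static struct bio *example_mpage_bio(struct block_device *bdev,
					     sector_t first_sector,
					     unsigned short nr_vecs)
	{
		struct bio *bio = bio_alloc(GFP_NOFS, nr_vecs);

		bio_set_dev(bio, bdev);
		bio->bi_iter.bi_sector = first_sector;
		return bio;
	}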
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/mpage.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index 87f5cfef6caa..06e95d777e94 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -66,29 +66,6 @@ static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) return NULL; } -static struct bio * -mpage_alloc(struct block_device *bdev, - sector_t first_sector, int nr_vecs, - gfp_t gfp_flags) -{ - struct bio *bio; - - /* Restrict the given (page cache) mask for slab allocations */ - gfp_flags &= GFP_KERNEL; - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = first_sector; - } - return bio; -} - /* * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into @@ -296,10 +273,11 @@ alloc_new: page)) goto out; } - args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - bio_max_segs(args->nr_pages), gfp); + args->bio = bio_alloc(gfp, bio_max_segs(args->nr_pages)); if (args->bio == NULL) goto confused; + bio_set_dev(args->bio, bdev); + args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); } length = first_hole << blkbits; @@ -608,10 +586,9 @@ alloc_new: page, wbc)) goto out; } - bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH); - if (bio == NULL) - goto confused; + bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; From f0d911927b3c7cf5f9edb5941d0287144a602d0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:50 +0100 Subject: [PATCH 006/127] nilfs2: remove nilfs_alloc_seg_bio bio_alloc will never fail when it can sleep. Remove the now simple nilfs_alloc_seg_bio helper and open code it in the only caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/nilfs2/segbuf.c | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 43287b0d3e9b..53b7c6d21cdd 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -371,29 +371,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, return err; } -/** - * nilfs_alloc_seg_bio - allocate a new bio for writing log - * @nilfs: nilfs object - * @start: start block number of the bio - * @nr_vecs: request size of page vector. - * - * Return Value: On success, pointer to the struct bio is returned. - * On error, NULL is returned. 
- */ -static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, - int nr_vecs) -{ - struct bio *bio; - - bio = bio_alloc(GFP_NOIO, nr_vecs); - if (likely(bio)) { - bio_set_dev(bio, nilfs->ns_bdev); - bio->bi_iter.bi_sector = - start << (nilfs->ns_blocksize_bits - 9); - } - return bio; -} - static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi) { @@ -414,10 +391,10 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { - wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end, - wi->nr_vecs); - if (unlikely(!wi->bio)) - return -ENOMEM; + wi->bio = bio_alloc(GFP_NOIO, wi->nr_vecs); + bio_set_dev(wi->bio, wi->nilfs->ns_bdev); + wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) << + (wi->nilfs->ns_blocksize_bits - 9); } len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); From 5d2ca2132f889bc2c90d6d07fc9fc129cfee8955 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:51 +0100 Subject: [PATCH 007/127] nfs/blocklayout: remove bl_alloc_init_bio bio_alloc will never fail when it can sleep. Remove the now simple bl_alloc_init_bio helper and open code it in the only caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-4-hch@lst.de Signed-off-by: Jens Axboe --- fs/nfs/blocklayout/blocklayout.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index fe860c538747..38e063af7e98 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -115,23 +115,6 @@ bl_submit_bio(struct bio *bio) return NULL; } -static struct bio *bl_alloc_init_bio(unsigned int npg, - struct block_device *bdev, sector_t disk_sector, - bio_end_io_t end_io, struct parallel_io *par) -{ - struct bio *bio; - - npg = bio_max_segs(npg); - bio = bio_alloc(GFP_NOIO, npg); - if (bio) { - bio->bi_iter.bi_sector = disk_sector; - bio_set_dev(bio, bdev); - bio->bi_end_io = end_io; - bio->bi_private = par; - } - return bio; -} - static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) { return offset >= map->start && offset < map->start + map->len; @@ -171,10 +154,11 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, retry: if (!bio) { - bio = bl_alloc_init_bio(npg, map->bdev, - disk_addr >> SECTOR_SHIFT, end_io, par); - if (!bio) - return ERR_PTR(-ENOMEM); + bio = bio_alloc(GFP_NOIO, bio_max_segs(npg)); + bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT; + bio_set_dev(bio, map->bdev); + bio->bi_end_io = end_io; + bio->bi_private = par; bio_set_op_attrs(bio, rw, 0); } if (bio_add_page(bio, page, *len, offset) < *len) { From 39146b6f66ba5c107d5c5758a17f290846165b4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:52 +0100 Subject: [PATCH 008/127] ntfs3: remove ntfs_alloc_bio bio_alloc will never fail if it is allowed to sleep, so there is no need for this loop. Also remove the __GFP_HIGH specifier as it doesn't make sense here given that we'll always fall back to the mempool anyway. 
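For context, the chaining pattern in the callers looks roughly like this
once the allocation cannot fail (a hedged sketch; example_extend_chain
is a made-up name, the real code is inline in ntfs_bio_pages and
ntfs_bio_fill_1):

	/*
	 * Allocate the next bio, chain the previous one to it and
	 * submit the previous one; a sleeping bio_alloc(GFP_NOFS, ...)
	 * cannot return NULL, so no error path is needed here.
	 */
	static struct bio *example_extend_chain(struct bio *prev,
						unsigned short nr_vecs)
	{
		struct bio *new = bio_alloc(GFP_NOFS, nr_vecs);

		if (prev) {
			bio_chain(prev, new);
			submit_bio(prev);
		}
		return new;
	}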
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-5-hch@lst.de Signed-off-by: Jens Axboe --- fs/ntfs3/fsntfs.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 4de9acb16968..4a255e21ecf5 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1443,17 +1443,6 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, return err; } -static inline struct bio *ntfs_alloc_bio(u32 nr_vecs) -{ - struct bio *bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs); - - if (!bio && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs); - } - return bio; -} - /* * ntfs_bio_pages - Read/write pages from/to disk. */ @@ -1496,11 +1485,7 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; new_bio: - new = ntfs_alloc_bio(nr_pages - page_idx); - if (!new) { - err = -ENOMEM; - goto out; - } + new = bio_alloc(GFP_NOFS, nr_pages - page_idx); if (bio) { bio_chain(bio, new); submit_bio(bio); @@ -1599,11 +1584,7 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run) lbo = (u64)lcn << cluster_bits; len = (u64)clen << cluster_bits; new_bio: - new = ntfs_alloc_bio(BIO_MAX_VECS); - if (!new) { - err = -ENOMEM; - break; - } + new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); if (bio) { bio_chain(bio, new); submit_bio(bio); From 53db984e004c7116ce69e2f4a163664453336ae1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:53 +0100 Subject: [PATCH 009/127] dm: bio_alloc can't fail if it is allowed to sleep Remove handling of NULL returns from sleeping bio_alloc calls given that those can't fail. 
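The rule the dm conversions rely on, as a minimal sketch (the function
is illustrative, not from the patch):

	static void example_alloc_rules(struct bio_set *bs)
	{
		struct bio *bio;

		/* Sleeping mask: backed by the bio_set's mempool, never NULL. */
		bio = bio_alloc_bioset(GFP_NOIO, 1, bs);
		bio_put(bio);

		/* Non-sleeping mask: may still fail and must be checked. */
		bio = bio_alloc_bioset(GFP_NOWAIT, 1, bs);
		if (bio)
			bio_put(bio);
	}

This is also why kcryptd_io_read() in the next patch keeps its NULL
check: it can be called with a non-sleeping gfp mask.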
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 5 +---- drivers/md/dm-log-writes.c | 18 ------------------ drivers/md/dm-thin.c | 25 +++++++++---------------- drivers/md/dm-zoned-metadata.c | 11 ----------- drivers/md/dm.c | 2 -- 5 files changed, 10 insertions(+), 51 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d4ae31558826..20abe3486aba 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1673,9 +1673,6 @@ retry: mutex_lock(&cc->bio_alloc_lock); clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs); - if (!clone) - goto out; - clone_init(io, clone); remaining_size = size; @@ -1702,7 +1699,7 @@ retry: bio_put(clone); clone = NULL; } -out: + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_unlock(&cc->bio_alloc_lock); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 139b09b06eda..25f5e8d2d417 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -218,10 +218,6 @@ static int write_metadata(struct log_writes_c *lc, void *entry, size_t ret; bio = bio_alloc(GFP_KERNEL, 1); - if (!bio) { - DMERR("Couldn't alloc log bio"); - goto error; - } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; bio_set_dev(bio, lc->logdev->bdev); @@ -276,11 +272,6 @@ static int write_inline_data(struct log_writes_c *lc, void *entry, atomic_inc(&lc->io_blocks); bio = bio_alloc(GFP_KERNEL, bio_pages); - if (!bio) { - DMERR("Couldn't alloc inline data bio"); - goto error; - } - bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; bio_set_dev(bio, lc->logdev->bdev); @@ -322,7 +313,6 @@ static int write_inline_data(struct log_writes_c *lc, void *entry, error_bio: bio_free_pages(bio); bio_put(bio); -error: put_io_block(lc); return -1; } @@ -364,10 +354,6 @@ static int log_one_block(struct log_writes_c *lc, atomic_inc(&lc->io_blocks); bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt)); - if (!bio) { - DMERR("Couldn't alloc log bio"); - goto error; - } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; bio_set_dev(bio, lc->logdev->bdev); @@ -387,10 +373,6 @@ static int log_one_block(struct log_writes_c *lc, submit_bio(bio); bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt - i)); - if (!bio) { - DMERR("Couldn't alloc log bio"); - goto error; - } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; bio_set_dev(bio, lc->logdev->bdev); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ec119d2422d5..76a9c2e9aeee 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1180,24 +1180,17 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) } discard_parent = bio_alloc(GFP_NOIO, 1); - if (!discard_parent) { - DMWARN("%s: unable to allocate top level discard bio for passdown. 
Skipping passdown.", - dm_device_name(tc->pool->pool_md)); - queue_passdown_pt2(m); + discard_parent->bi_end_io = passdown_endio; + discard_parent->bi_private = m; - } else { - discard_parent->bi_end_io = passdown_endio; - discard_parent->bi_private = m; + if (m->maybe_shared) + passdown_double_checking_shared_status(m, discard_parent); + else { + struct discard_op op; - if (m->maybe_shared) - passdown_double_checking_shared_status(m, discard_parent); - else { - struct discard_op op; - - begin_discard(&op, tc, discard_parent); - r = issue_discard(&op, m->data_block, data_end); - end_discard(&op, r); - } + begin_discard(&op, tc, discard_parent); + r = issue_discard(&op, m->data_block, data_end); + end_discard(&op, r); } } diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index ee4626d08557..5718b83cc718 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -551,10 +551,6 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, return ERR_PTR(-ENOMEM); bio = bio_alloc(GFP_NOIO, 1); - if (!bio) { - dmz_free_mblock(zmd, mblk); - return ERR_PTR(-ENOMEM); - } spin_lock(&zmd->mblk_lock); @@ -726,10 +722,6 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, return -EIO; bio = bio_alloc(GFP_NOIO, 1); - if (!bio) { - set_bit(DMZ_META_ERROR, &mblk->state); - return -ENOMEM; - } set_bit(DMZ_META_WRITING, &mblk->state); @@ -760,9 +752,6 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op, return -EIO; bio = bio_alloc(GFP_NOIO, 1); - if (!bio) - return -ENOMEM; - bio->bi_iter.bi_sector = dmz_blk2sect(block); bio_set_dev(bio, dev->bdev); bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dcbd6d201619..e431f72c10bf 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -520,8 +520,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct bio *clone; clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs); - if (!clone) - return NULL; tio = container_of(clone, struct dm_target_io, clone); tio->inside_dm_io = true; From 3f868c09ea8f40f800c4c644c072d91c9eee0d71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:54 +0100 Subject: [PATCH 010/127] dm-crypt: remove clone_init Just open code it next to the bio allocations, which saves a few lines of code, prepares for future changes and allows to remove the duplicate bi_opf assignment for the bio_clone_fast case in kcryptd_io_read. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 20abe3486aba..3c5ecd35d348 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -234,7 +234,7 @@ static volatile unsigned long dm_crypt_pages_per_client; #define DM_CRYPT_MEMORY_PERCENT 2 #define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16) -static void clone_init(struct dm_crypt_io *, struct bio *); +static void crypt_endio(struct bio *clone); static void kcryptd_queue_crypt(struct dm_crypt_io *io); static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, struct scatterlist *sg); @@ -1673,7 +1673,10 @@ retry: mutex_lock(&cc->bio_alloc_lock); clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs); - clone_init(io, clone); + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + bio_set_dev(clone, cc->dev->bdev); + clone->bi_opf = io->base_bio->bi_opf; remaining_size = size; @@ -1826,16 +1829,6 @@ static void crypt_endio(struct bio *clone) crypt_dec_pending(io); } -static void clone_init(struct dm_crypt_io *io, struct bio *clone) -{ - struct crypt_config *cc = io->cc; - - clone->bi_private = io; - clone->bi_end_io = crypt_endio; - bio_set_dev(clone, cc->dev->bdev); - clone->bi_opf = io->base_bio->bi_opf; -} - static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->cc; @@ -1850,10 +1843,12 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) clone = bio_clone_fast(io->base_bio, gfp, &cc->bs); if (!clone) return 1; + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + bio_set_dev(clone, cc->dev->bdev); crypt_inc_pending(io); - clone_init(io, clone); clone->bi_iter.bi_sector = cc->start + io->sector; if (dm_crypt_integrity_io_alloc(io, clone)) { From eba33b8ef1b90d8996eceb0569c06a4f784ef2b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:55 +0100 Subject: [PATCH 011/127] dm-snap: use blkdev_issue_flush instead of open coding it Use blkdev_issue_flush, which uses an on-stack bio instead of an opencoded version with a bio embedded into struct dm_snapshot. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-snap.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index dcf34c6b05ad..0d336b5ec571 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -141,11 +141,6 @@ struct dm_snapshot { * for them to be committed. */ struct bio_list bios_queued_during_merge; - - /* - * Flush data after merge. 
- */ - struct bio flush_bio; }; /* @@ -1127,17 +1122,6 @@ shut: static void error_bios(struct bio *bio); -static int flush_data(struct dm_snapshot *s) -{ - struct bio *flush_bio = &s->flush_bio; - - bio_reset(flush_bio); - bio_set_dev(flush_bio, s->origin->bdev); - flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - return submit_bio_wait(flush_bio); -} - static void merge_callback(int read_err, unsigned long write_err, void *context) { struct dm_snapshot *s = context; @@ -1151,7 +1135,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) goto shut; } - if (flush_data(s) < 0) { + if (blkdev_issue_flush(s->origin->bdev) < 0) { DMERR("Flush after merge failed: shutting down merge"); goto shut; } @@ -1340,7 +1324,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->first_merging_chunk = 0; s->num_merging_chunks = 0; bio_list_init(&s->bios_queued_during_merge); - bio_init(&s->flush_bio, NULL, 0); /* Allocate hash table for COW data */ if (init_hash_tables(s)) { @@ -1528,8 +1511,6 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); - bio_uninit(&s->flush_bio); - dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); From 28d7d128aad5cd2178b158900d58365d1fd3de94 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:56 +0100 Subject: [PATCH 012/127] dm-thin: use blkdev_issue_flush instead of open coding it Use blkdev_issue_flush, which uses an on-stack bio instead of an open-coded version with a bio embedded into struct pool. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-thin.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 76a9c2e9aeee..411a3f56ed90 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -282,8 +282,6 @@ struct pool { struct dm_bio_prison_cell **cell_sort_array; mempool_t mapping_pool; - - struct bio flush_bio; }; static void metadata_operation_failed(struct pool *pool, const char *op, int r); @@ -2906,7 +2904,6 @@ static void __pool_destroy(struct pool *pool) if (pool->next_mapping) mempool_free(pool->next_mapping, &pool->mapping_pool); mempool_exit(&pool->mapping_pool); - bio_uninit(&pool->flush_bio); dm_deferred_set_destroy(pool->shared_read_ds); dm_deferred_set_destroy(pool->all_io_ds); kfree(pool); @@ -2987,7 +2984,6 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->low_water_triggered = false; pool->suspended = true; pool->out_of_data_space = false; - bio_init(&pool->flush_bio, NULL, 0); pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -3194,13 +3190,8 @@ static void metadata_low_callback(void *context) static int metadata_pre_commit_callback(void *context) { struct pool *pool = context; - struct bio *flush_bio = &pool->flush_bio; - bio_reset(flush_bio); - bio_set_dev(flush_bio, pool->data_dev); - flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - return submit_bio_wait(flush_bio); + return blkdev_issue_flush(pool->data_dev); } static sector_t get_dev_size(struct block_device *bdev) From 4b1dc86d1857f1007865cab759f2285280692eee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:57 +0100 Subject: [PATCH 013/127] drbd: bio_alloc can't fail if it is allowed to sleep Remove handling of NULL returns from sleeping bio_alloc calls given that those can't fail.
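The rule this and the following conversions rely on, sketched with the bio_alloc signature as it stands at this point in the series: bio_alloc draws from the mempool-backed fs_bio_set, and a mempool allocation whose gfp mask allows direct reclaim (GFP_NOIO, GFP_KERNEL and friends) waits for a free element rather than failing, so only non-sleeping masks can observe NULL:

	struct bio *bio;

	bio = bio_alloc(GFP_NOIO, nr_pages);	/* sleeps until it succeeds; never NULL */

	bio = bio_alloc(GFP_NOWAIT, nr_pages);	/* may fail and must be checked */
	if (!bio)
		return -ENOMEM;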
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6df2539e215b..fb59b263deee 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1281,14 +1281,13 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont { struct bio *bio = bio_alloc(GFP_NOIO, 0); struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); - if (!bio || !octx) { - drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n"); + + if (!octx) { + drbd_warn(device, "Could not allocate an octx, CANNOT ISSUE FLUSH\n"); /* FIXME: what else can I do now? disconnecting or detaching * really does not help to improve the state of the world, either. */ - kfree(octx); - if (bio) - bio_put(bio); + bio_put(bio); ctx->error = -ENOMEM; put_ldev(device); @@ -1646,7 +1645,6 @@ int drbd_submit_peer_request(struct drbd_device *device, unsigned data_size = peer_req->i.size; unsigned n_bios = 0; unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; - int err = -ENOMEM; /* TRIM/DISCARD: for now, always use the helper function * blkdev_issue_zeroout(..., discard=true). */ next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); - if (!bio) { - drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); - goto fail; - } /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; bio_set_dev(bio, device->ldev->backing_bdev); @@ -1726,14 +1720,6 @@ next_bio: drbd_submit_bio_noacct(device, fault_type, bio); } while (bios); return 0; - -fail: - while (bios) { - bio = bios; - bios = bios->bi_next; - bio_put(bio); - } - return err; } static void drbd_remove_epoch_entry_interval(struct drbd_device *device, From 1fe0640ff94feae6d21417e2f4f2829b882274b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:58 +0100 Subject: [PATCH 014/127] rnbd-srv: simplify bio mapping in process_rdma The memory mapped in process_rdma is contiguous, so there is no need to loop over bio_add_page. Remove rnbd_bio_map_kern and just open code the bio allocation and mapping in the caller. Signed-off-by: Christoph Hellwig Reviewed-by: Jack Wang Tested-by: Jack Wang Link: https://lore.kernel.org/r/20220124091107.642561-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-dev.c | 57 ------------------------------- drivers/block/rnbd/rnbd-srv-dev.h | 5 --- drivers/block/rnbd/rnbd-srv.c | 23 +++++++++---- 3 files changed, 16 insertions(+), 69 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c index b241a099aeae..98d3e591a088 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ b/drivers/block/rnbd/rnbd-srv-dev.c @@ -44,60 +44,3 @@ void rnbd_dev_close(struct rnbd_dev *dev) blkdev_put(dev->bdev, dev->blk_open_flags); kfree(dev); } - -void rnbd_dev_bi_end_io(struct bio *bio) -{ - struct rnbd_dev_blk_io *io = bio->bi_private; - - rnbd_endio(io->priv, blk_status_to_errno(bio->bi_status)); - bio_put(bio); -} - -/** - * rnbd_bio_map_kern - map kernel address into bio - * @data: pointer to buffer to map - * @bs: bio_set to use.
- * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; - struct bio *bio; - - bio = bio_alloc_bioset(gfp_mask, nr_pages, bs); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (bio_add_page(bio, virt_to_page(data), bytes, - offset) < bytes) { - /* we don't support partial mappings */ - bio_put(bio); - return ERR_PTR(-EINVAL); - } - - data += bytes; - len -= bytes; - offset = 0; - } - - return bio; -} diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index 0eb23850afb9..1a14ece0be72 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -41,11 +41,6 @@ void rnbd_dev_close(struct rnbd_dev *dev); void rnbd_endio(void *priv, int error); -void rnbd_dev_bi_end_io(struct bio *bio); - -struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, - unsigned int len, gfp_t gfp_mask); - static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) { return queue_max_segments(bdev_get_queue(dev->bdev)); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 1ee808fc600c..6d228af1dcc3 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -114,6 +114,14 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) return sess_dev; } +static void rnbd_dev_bi_end_io(struct bio *bio) +{ + struct rnbd_dev_blk_io *io = bio->bi_private; + + rnbd_endio(io->priv, blk_status_to_errno(bio->bi_status)); + bio_put(bio); +} + static int process_rdma(struct rnbd_srv_session *srv_sess, struct rtrs_srv_op *id, void *data, u32 datalen, const void *usr, size_t usrlen) @@ -144,12 +152,12 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - /* Generate bio with pages pointing to the rdma buffer */ - bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); - rnbd_srv_err(sess_dev, "Failed to generate bio, err: %d\n", err); - goto sess_dev_put; + bio = bio_alloc_bioset(GFP_KERNEL, 1, sess_dev->rnbd_dev->ibd_bio_set); + if (bio_add_page(bio, virt_to_page(data), datalen, + offset_in_page(data)) != datalen) { + rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); + err = -EINVAL; + goto bio_put; } io = container_of(bio, struct rnbd_dev_blk_io, bio); @@ -170,7 +178,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, return 0; -sess_dev_put: +bio_put: + bio_put(bio); rnbd_put_sess_dev(sess_dev); err: kfree(priv); From d7b78de2b1552e3e7ce3a069f075cc2729aa5c34 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:10:59 +0100 Subject: [PATCH 015/127] rnbd-srv: remove struct rnbd_dev_blk_io Only the priv field of rnbd_dev_blk_io is used, so store the value of that in bio->bi_private directly and remove the entire bio_set overhead. 
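For context, this retires the bioset front_pad idiom, which only pays off when more than a single pointer has to travel with the bio. A sketch of both variants (struct blk_io and the surrounding identifiers are illustrative, not the removed code):

	/* front_pad: embed the bio in a per-I/O struct, recover it in the endio */
	struct blk_io {
		void *priv;
		struct bio bio;		/* must be last for front_pad to work */
	};

	bioset_init(&bs, depth, offsetof(struct blk_io, bio), BIOSET_NEED_BVECS);
	/* ... in the endio handler ... */
	struct blk_io *io = container_of(bio, struct blk_io, bio);

	/* with a single pointer, bi_private does the same job with no bio_set at all */
	bio->bi_private = priv;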
Signed-off-by: Christoph Hellwig Reviewed-by: Jack Wang Link: https://lore.kernel.org/r/20220124091107.642561-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-dev.c | 4 +--- drivers/block/rnbd/rnbd-srv-dev.h | 13 ++----------- drivers/block/rnbd/rnbd-srv.c | 27 ++++----------------------- drivers/block/rnbd/rnbd-srv.h | 1 - 4 files changed, 7 insertions(+), 38 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c index 98d3e591a088..c5d0a0391165 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ b/drivers/block/rnbd/rnbd-srv-dev.c @@ -12,8 +12,7 @@ #include "rnbd-srv-dev.h" #include "rnbd-log.h" -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, - struct bio_set *bs) +struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags) { struct rnbd_dev *dev; int ret; @@ -30,7 +29,6 @@ struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, dev->blk_open_flags = flags; bdevname(dev->bdev, dev->name); - dev->ibd_bio_set = bs; return dev; diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index 1a14ece0be72..2c3df02b5e8e 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -14,25 +14,16 @@ struct rnbd_dev { struct block_device *bdev; - struct bio_set *ibd_bio_set; fmode_t blk_open_flags; char name[BDEVNAME_SIZE]; }; -struct rnbd_dev_blk_io { - struct rnbd_dev *dev; - void *priv; - /* have to be last member for front_pad usage of bioset_init */ - struct bio bio; -}; - /** * rnbd_dev_open() - Open a device + * @path: path to open * @flags: open flags - * @bs: bio_set to use during block io, */ -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, - struct bio_set *bs); +struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags); /** * rnbd_dev_close() - Close a device diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 6d228af1dcc3..ff9b38997607 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -116,9 +116,7 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) static void rnbd_dev_bi_end_io(struct bio *bio) { - struct rnbd_dev_blk_io *io = bio->bi_private; - - rnbd_endio(io->priv, blk_status_to_errno(bio->bi_status)); + rnbd_endio(bio->bi_private, blk_status_to_errno(bio->bi_status)); bio_put(bio); } @@ -131,7 +129,6 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, struct rnbd_srv_sess_dev *sess_dev; u32 dev_id; int err; - struct rnbd_dev_blk_io *io; struct bio *bio; short prio; @@ -152,7 +149,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc_bioset(GFP_KERNEL, 1, sess_dev->rnbd_dev->ibd_bio_set); + bio = bio_alloc(GFP_KERNEL, 1); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); @@ -160,12 +157,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, goto bio_put; } - io = container_of(bio, struct rnbd_dev_blk_io, bio); - io->dev = sess_dev->rnbd_dev; - io->priv = priv; - bio->bi_end_io = rnbd_dev_bi_end_io; - bio->bi_private = io; + bio->bi_private = priv; bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); @@ -260,7 +253,6 @@ static void destroy_sess(struct rnbd_srv_session *srv_sess) out: xa_destroy(&srv_sess->index_idr); - bioset_exit(&srv_sess->sess_bio_set); 
pr_info("RTRS Session %s disconnected\n", srv_sess->sessname); @@ -289,16 +281,6 @@ static int create_sess(struct rtrs_srv_sess *rtrs) return -ENOMEM; srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs); - err = bioset_init(&srv_sess->sess_bio_set, srv_sess->queue_depth, - offsetof(struct rnbd_dev_blk_io, bio), - BIOSET_NEED_BVECS); - if (err) { - pr_err("Allocating srv_session for path %s failed\n", - pathname); - kfree(srv_sess); - return err; - } - xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC); INIT_LIST_HEAD(&srv_sess->sess_dev_list); mutex_init(&srv_sess->lock); @@ -747,8 +729,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - rnbd_dev = rnbd_dev_open(full_path, open_flags, - &srv_sess->sess_bio_set); + rnbd_dev = rnbd_dev_open(full_path, open_flags); if (IS_ERR(rnbd_dev)) { pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n", full_path, srv_sess->sessname, PTR_ERR(rnbd_dev)); diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index e5604bce123a..be2ae486d407 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -23,7 +23,6 @@ struct rnbd_srv_session { struct rtrs_srv_sess *rtrs; char sessname[NAME_MAX]; int queue_depth; - struct bio_set sess_bio_set; struct xarray index_idr; /* List of struct rnbd_srv_sess_dev */ From 7d8d0c658d48705fca35238a8ff601b7c5cbc0de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:00 +0100 Subject: [PATCH 016/127] xen-blkback: bio_alloc can't fail if it is allow to sleep Remove handling of NULL returns from sleeping bio_alloc calls given that those can't fail. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/xen-blkback/blkback.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 14e452896d04..6bb2ad769206 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1327,9 +1327,6 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, seg[i].nsec << 9, seg[i].offset) == 0)) { bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i)); - if (unlikely(bio == NULL)) - goto fail_put_bio; - biolist[nbio++] = bio; bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; @@ -1346,9 +1343,6 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, BUG_ON(operation_flags != REQ_PREFLUSH); bio = bio_alloc(GFP_KERNEL, 0); - if (unlikely(bio == NULL)) - goto fail_put_bio; - biolist[nbio++] = bio; bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; @@ -1381,14 +1375,6 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; - - fail_put_bio: - for (i = 0; i < nbio; i++) - bio_put(biolist[i]); - atomic_set(&pending_req->pendcnt, 1); - __end_block_io_op(pending_req, BLK_STS_RESOURCE); - msleep(1); /* back off a bit */ - return -EIO; } From 3b005bf6acf009abd700e2c652c86e5c209cf63d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:01 +0100 Subject: [PATCH 017/127] block: move blk_next_bio to bio.c Keep blk_next_bio next to the core bio infrastructure. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-14-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 13 +++++++++++++ block/blk-lib.c | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/bio.c b/block/bio.c index 4312a8085396..1536579ed490 100644 --- a/block/bio.c +++ b/block/bio.c @@ -344,6 +344,19 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); +struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) +{ + struct bio *new = bio_alloc(gfp, nr_pages); + + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + + return new; +} +EXPORT_SYMBOL_GPL(blk_next_bio); + static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); diff --git a/block/blk-lib.c b/block/blk-lib.c index 9f09beadcbe3..9245b300ef73 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -10,19 +10,6 @@ #include "blk.h" -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) -{ - struct bio *new = bio_alloc(gfp, nr_pages); - - if (bio) { - bio_chain(bio, new); - submit_bio(bio); - } - - return new; -} -EXPORT_SYMBOL_GPL(blk_next_bio); - int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop) From 0a3140ea0fae377c9eaa031b7db1670ae422ed47 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 24 Jan 2022 10:11:02 +0100 Subject: [PATCH 018/127] block: pass a block_device and opf to blk_next_bio All callers need to set the block_device and operation, so lift that into the common code. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-15-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 6 +++++- block/blk-lib.c | 19 +++++-------------- block/blk-zoned.c | 9 +++------ block/blk.h | 2 -- drivers/nvme/target/zns.c | 6 +++--- include/linux/bio.h | 3 ++- 6 files changed, 18 insertions(+), 27 deletions(-) diff --git a/block/bio.c b/block/bio.c index 1536579ed490..a0166f29a05c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -344,10 +344,14 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, unsigned int opf, gfp_t gfp) { struct bio *new = bio_alloc(gfp, nr_pages); + bio_set_dev(new, bdev); + new->bi_opf = opf; + if (bio) { bio_chain(bio, new); submit_bio(bio); diff --git a/block/blk-lib.c b/block/blk-lib.c index 9245b300ef73..1b8ced45e4e5 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -82,11 +82,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, WARN_ON_ONCE((req_sects << 9) > UINT_MAX); - bio = blk_next_bio(bio, 0, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, op, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, op, 0); - bio->bi_iter.bi_size = req_sects << 9; sector += req_sects; nr_sects -= req_sects; @@ -176,14 +173,12 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, max_write_same_sectors = bio_allowed_max_sectors(q); while (nr_sects) { - bio = blk_next_bio(bio, 1, gfp_mask); + bio = blk_next_bio(bio, bdev, 1, REQ_OP_WRITE_SAME, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); bio->bi_vcnt = 
1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0); if (nr_sects > max_write_same_sectors) { bio->bi_iter.bi_size = max_write_same_sectors << 9; @@ -252,10 +247,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, return -EOPNOTSUPP; while (nr_sects) { - bio = blk_next_bio(bio, 0, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE_ZEROES; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; @@ -303,11 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, return -EPERM; while (nr_sects != 0) { - bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); + bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects), + REQ_OP_WRITE, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE, nr_sects << 9); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 774ecc598bee..5ab755d792c8 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -215,9 +215,8 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, continue; } - bio = blk_next_bio(bio, 0, gfp_mask); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC; + bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, + gfp_mask); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -306,9 +305,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, } while (sector < end_sector) { - bio = blk_next_bio(bio, 0, gfp_mask); - bio_set_dev(bio, bdev); - bio->bi_opf = op | REQ_SYNC; + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); bio->bi_iter.bi_sector = sector; sector += zone_sectors; diff --git a/block/blk.h b/block/blk.h index 800c5ae387a0..abb663a2a147 100644 --- a/block/blk.h +++ b/block/blk.h @@ -406,8 +406,6 @@ extern int blk_iolatency_init(struct request_queue *q); static inline int blk_iolatency_init(struct request_queue *q) { return 0; } #endif -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); - #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); void blk_queue_clear_zone_settings(struct request_queue *q); diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 46bc30fe85d2..247de74247fa 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -412,10 +412,10 @@ static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req) while (sector < get_capacity(bdev->bd_disk)) { if (test_bit(blk_queue_zone_no(q, sector), d.zbitmap)) { - bio = blk_next_bio(bio, 0, GFP_KERNEL); - bio->bi_opf = zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC; + bio = blk_next_bio(bio, bdev, 0, + zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC, + GFP_KERNEL); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); /* This may take a while, so be nice to others */ cond_resched(); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 117d7f248ac9..edeae54074ed 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -790,6 +790,7 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) bio->bi_opf |= REQ_NOWAIT; } -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + 
unsigned int nr_pages, unsigned int opf, gfp_t gfp); #endif /* __LINUX_BIO_H */ From 609be1066731fea86436f5f91022f82e592ab456 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:03 +0100 Subject: [PATCH 019/127] block: pass a block_device and opf to bio_alloc_bioset Pass the block_device and operation that we plan to use this bio for to bio_alloc_bioset to optimize the assignment. NULL/0 can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Also move the gfp_mask argument after the nr_vecs argument for a much more logical calling convention matching what most of the kernel does. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-16-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 30 +++++++++++++++++------------ block/bounce.c | 6 ++---- drivers/block/drbd/drbd_actlog.c | 5 ++--- drivers/block/drbd/drbd_bitmap.c | 7 +++---- drivers/md/bcache/request.c | 12 +++++------- drivers/md/dm-crypt.c | 5 ++--- drivers/md/dm-io.c | 5 ++--- drivers/md/dm-writecache.c | 7 ++++--- drivers/md/dm.c | 5 +++-- drivers/md/md.c | 16 +++++++-------- drivers/md/raid1.c | 3 ++- drivers/md/raid10.c | 6 ++---- drivers/md/raid5-cache.c | 8 ++++---- drivers/md/raid5-ppl.c | 11 +++++------ drivers/target/target_core_iblock.c | 6 ++---- fs/btrfs/extent_io.c | 2 +- fs/f2fs/data.c | 7 +++---- fs/iomap/buffered-io.c | 6 +++--- include/linux/bio.h | 7 ++++--- 19 files changed, 75 insertions(+), 79 deletions(-) diff --git a/block/bio.c b/block/bio.c index a0166f29a05c..9afc0c2aca6e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -417,8 +417,10 @@ static void punt_bios_to_rescuer(struct bio_set *bs) /** * bio_alloc_bioset - allocate a bio for I/O + * @bdev: block device to allocate the bio for (can be %NULL) + * @nr_vecs: number of bvecs to pre-allocate + * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. @@ -447,15 +449,16 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * * Returns: Pointer to new bio on success, NULL on failure.
*/ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, +struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, + unsigned int opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + /* should not use nobvec bioset for nr_vecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; /* @@ -492,26 +495,29 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, return NULL; bio = p + bs->front_pad; - if (nr_iovecs > BIO_INLINE_VECS) { + if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, nr_iovecs); - } else if (nr_iovecs) { + bio_init(bio, bvl, nr_vecs); + } else if (nr_vecs) { bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); } else { bio_init(bio, NULL, 0); } bio->bi_pool = bs; + if (bdev) + bio_set_dev(bio, bdev); + bio->bi_opf = opf; return bio; err_free: @@ -767,7 +773,7 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) { struct bio *b; - b = bio_alloc_bioset(gfp_mask, 0, bs); + b = bio_alloc_bioset(NULL, 0, 0, gfp_mask, bs); if (!b) return NULL; @@ -1743,7 +1749,7 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, struct bio *bio; if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) - return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); + return bio_alloc_bioset(NULL, nr_vecs, 0, GFP_KERNEL, bs); cache = per_cpu_ptr(bs->cache, get_cpu()); if (cache->free_list) { @@ -1757,7 +1763,7 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, return bio; } put_cpu(); - bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); + bio = bio_alloc_bioset(NULL, nr_vecs, 0, GFP_KERNEL, bs); bio_set_flag(bio, BIO_PERCPU_CACHE); return bio; } diff --git a/block/bounce.c b/block/bounce.c index 7af1a72835b9..330ddde25b46 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -165,12 +165,10 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * asking for trouble and would force extra work on * __bio_clone_fast() anyways. 
*/ - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), - &bounce_bio_set); - bio->bi_bdev = bio_src->bi_bdev; + bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src), + bio_src->bi_opf, GFP_NOIO, &bounce_bio_set); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 72cf7603d51f..f5bcded3640d 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -138,15 +138,14 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, op_flags |= REQ_FUA | REQ_PREFLUSH; op_flags |= REQ_SYNC; - bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); - bio_set_dev(bio, bdev->md_bdev); + bio = bio_alloc_bioset(bdev->md_bdev, 1, op | op_flags, GFP_NOIO, + &drbd_md_io_bio_set); bio->bi_iter.bi_sector = sector; err = -EIO; if (bio_add_page(bio, device->md_io.page, size, 0) != size) goto out; bio->bi_private = device; bio->bi_end_io = drbd_md_endio; - bio_set_op_attrs(bio, op, op_flags); if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL) /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index c1f816f896a8..df25eecf80af 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -976,12 +976,13 @@ static void drbd_bm_endio(struct bio *bio) static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); struct drbd_device *device = ctx->device; + unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE; + struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, + GFP_NOIO, &drbd_md_io_bio_set); struct drbd_bitmap *b = device->bitmap; struct page *page; unsigned int len; - unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE; sector_t on_disk_sector = device->ldev->md.md_offset + device->ldev->md.bm_offset; @@ -1006,14 +1007,12 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; - bio_set_dev(bio, device->ldev->md_bdev); bio->bi_iter.bi_sector = on_disk_sector; /* bio_add_page of a single page to an empty bio will always succeed, * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = drbd_bm_endio; - bio_set_op_attrs(bio, op, 0); if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { bio_io_error(bio); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index d15aae6c51c1..c4b7e434de8a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -913,14 +913,13 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, /* btree_search_recurse()'s btree iterator is no good anymore */ ret = miss == bio ? 
MAP_DONE : -EINTR; - cache_bio = bio_alloc_bioset(GFP_NOWAIT, + cache_bio = bio_alloc_bioset(miss->bi_bdev, DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), - &dc->disk.bio_split); + 0, GFP_NOWAIT, &dc->disk.bio_split); if (!cache_bio) goto out_submit; cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; - bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = backing_request_endio; @@ -1025,16 +1024,15 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) */ struct bio *flush; - flush = bio_alloc_bioset(GFP_NOIO, 0, - &dc->disk.bio_split); + flush = bio_alloc_bioset(bio->bi_bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, + GFP_NOIO, &dc->disk.bio_split); if (!flush) { s->iop.status = BLK_STS_RESOURCE; goto insert_data; } - bio_copy_dev(flush, bio); flush->bi_end_io = backing_request_endio; flush->bi_private = cl; - flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; /* I/O request sent to backing device */ closure_bio_submit(s->iop.c, flush, cl); } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3c5ecd35d348..f7e4435b7439 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1672,11 +1672,10 @@ retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_lock(&cc->bio_alloc_lock); - clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs); + clone = bio_alloc_bioset(cc->dev->bdev, nr_iovecs, io->base_bio->bi_opf, + GFP_NOIO, &cc->bs); clone->bi_private = io; clone->bi_end_io = crypt_endio; - bio_set_dev(clone, cc->dev->bdev); - clone->bi_opf = io->base_bio->bi_opf; remaining_size = size; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2d3cda0acacb..23e038f8dc84 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -345,11 +345,10 @@ static void do_region(int op, int op_flags, unsigned region, (PAGE_SIZE >> SECTOR_SHIFT))); } - bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios); + bio = bio_alloc_bioset(where->bdev, num_bvecs, op | op_flags, + GFP_NOIO, &io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); - bio_set_dev(bio, where->bdev); bio->bi_end_io = endio; - bio_set_op_attrs(bio, op, op_flags); store_io_and_region_in_bio(bio, io, region); if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) { diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 4f31591d2d25..5630b470ba42 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1821,11 +1821,11 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba max_pages = e->wc_list_contiguous; - bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); + bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE, + GFP_NOIO, &wc->bio_set); wb = container_of(bio, struct writeback_struct, bio); wb->wc = wc; bio->bi_end_io = writecache_writeback_endio; - bio_set_dev(bio, wc->dev->bdev); bio->bi_iter.bi_sector = read_original_sector(wc, e); if (max_pages <= WB_LIST_INLINE || unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), @@ -1852,7 +1852,8 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba wb->wc_list[wb->wc_list_n++] = f; e = f; } - bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); + if (WC_MODE_FUA(wc)) + bio->bi_opf |= REQ_FUA; if (writecache_has_error(wc)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e431f72c10bf..069e29013b6b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ 
-519,7 +519,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs); + clone = bio_alloc_bioset(NULL, 0, 0, GFP_NOIO, &md->io_bs); tio = container_of(clone, struct dm_target_io, clone); tio->inside_dm_io = true; @@ -552,7 +552,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t /* the dm_target_io embedded in ci->io is available */ tio = &ci->io->tio; } else { - struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs); + struct bio *clone = bio_alloc_bioset(NULL, 0, 0, gfp_mask, + &ci->io->md->bs); if (!clone) return NULL; diff --git a/drivers/md/md.c b/drivers/md/md.c index 5881d05a76eb..40fc1f7e65c5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -562,11 +562,11 @@ static void submit_flushes(struct work_struct *ws) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set); + bi = bio_alloc_bioset(rdev->bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, + GFP_NOIO, &mddev->bio_set); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; - bio_set_dev(bi, rdev->bdev); - bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; atomic_inc(&mddev->flush_pending); submit_bio(bi); rcu_read_lock(); @@ -955,7 +955,6 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, * If an error occurred, call md_error */ struct bio *bio; - int ff = 0; if (!page) return; @@ -963,11 +962,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); + bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, + 1, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, + GFP_NOIO, &mddev->sync_set); atomic_inc(&rdev->nr_pending); - bio_set_dev(bio, rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev); bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -976,8 +977,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && test_bit(FailFast, &rdev->flags) && !test_bit(LastDev, &rdev->flags)) - ff = MD_FAILFAST; - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff; + bio->bi_opf |= MD_FAILFAST; atomic_inc(&mddev->pending_writes); submit_bio(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2d8acb1e988..43276f8fdc81 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1126,7 +1126,8 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, int i = 0; struct bio *behind_bio = NULL; - behind_bio = bio_alloc_bioset(GFP_NOIO, vcnt, &r1_bio->mddev->bio_set); + behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO, + &r1_bio->mddev->bio_set); if (!behind_bio) return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2b969f70a31f..cb7c58050708 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4892,14 +4892,12 @@ read_more: return sectors_done; } - read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set); - - bio_set_dev(read_bio, rdev->bdev); + read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ, + GFP_KERNEL, &mddev->bio_set); read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_reshape_read; - bio_set_op_attrs(read_bio, REQ_OP_READ, 0); r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0b5dcaabbc15..66313adf9987 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -735,10 +735,9 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &log->bs); + struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS, + REQ_OP_WRITE, GFP_NOIO, &log->bs); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; return bio; @@ -1634,7 +1633,8 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, { struct page *page; - ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_VECS, &log->bs); + ctx->ra_bio = bio_alloc_bioset(NULL, BIO_MAX_VECS, 0, GFP_KERNEL, + &log->bs); if (!ctx->ra_bio) return -ENOMEM; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 4ab417915d7f..054d3bb252d4 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -496,11 +496,10 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { struct bio *prev = bio; - bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, + bio = bio_alloc_bioset(prev->bi_bdev, BIO_MAX_VECS, + prev->bi_opf, GFP_NOIO, &ppl_conf->bs); - bio->bi_opf = prev->bi_opf; bio->bi_write_hint = prev->bi_write_hint; - bio_copy_dev(bio, prev); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); @@ -637,10 +636,10 @@ static void ppl_do_flush(struct ppl_io_unit *io) struct bio *bio; char b[BDEVNAME_SIZE]; - bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs); - bio_set_dev(bio, bdev); + bio = bio_alloc_bioset(bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, + &ppl_conf->flush_bs);
bio->bi_private = io; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; bio->bi_end_io = ppl_flush_endio; pr_debug("%s: dev: %s\n", __func__, diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 6045678365a5..3c92ba374819 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -352,18 +352,16 @@ static struct bio *iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, * Only allocate as many vector entries as the bio code allows us to, * we'll loop later on until we have handled the whole request. */ - bio = bio_alloc_bioset(GFP_NOIO, bio_max_segs(sg_num), - &ib_dev->ibd_bio_set); + bio = bio_alloc_bioset(ib_dev->ibd_bd, bio_max_segs(sg_num), opf, + GFP_NOIO, &ib_dev->ibd_bio_set); if (!bio) { pr_err("Unable to allocate memory for bio\n"); return NULL; } - bio_set_dev(bio, ib_dev->ibd_bd); bio->bi_private = cmd; bio->bi_end_io = &iblock_bio_done; bio->bi_iter.bi_sector = lba; - bio->bi_opf = opf; return bio; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 409bad3928db..421d921a0571 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3143,7 +3143,7 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) struct bio *bio; ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); btrfs_bio_init(btrfs_bio(bio)); return bio; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c417864c66a..e71dde8de0db 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -394,7 +394,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); + bio = bio_alloc_bioset(NULL, npages, 0, GFP_NOIO, &f2fs_bioset); f2fs_target_device(sbi, fio->new_blkaddr, bio); if (is_read_io(fio->op)) { @@ -985,8 +985,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; - bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, - bio_max_segs(nr_pages), &f2fs_bioset); + bio = bio_alloc_bioset(NULL, bio_max_segs(nr_pages), REQ_OP_READ, + for_write ? 
GFP_NOIO : GFP_KERNEL, &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); @@ -994,7 +994,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= STEP_DECRYPT; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c938bbad075e..340d373cb1bf 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1196,10 +1196,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset); - bio_set_dev(bio, wpc->iomap.bdev); + bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, + REQ_OP_WRITE | wbc_to_write_flags(wbc), + GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = sector; - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); bio->bi_write_hint = inode->i_write_hint; wbc_init_bio(wbc, bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index edeae54074ed..2f63ae9a71e1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,8 +405,9 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); -struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, - struct bio_set *bs); +struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, + unsigned int opf, gfp_t gfp_mask, + struct bio_set *bs); struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, struct bio_set *bs); struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); @@ -419,7 +420,7 @@ extern struct bio_set fs_bio_set; static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) { - return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); + return bio_alloc_bioset(NULL, nr_iovecs, 0, gfp_mask, &fs_bio_set); } void submit_bio(struct bio *bio); From b77c88c2100ce6a5ec8126c13599b5a7f6663e32 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:04 +0100 Subject: [PATCH 020/127] block: pass a block_device and opf to bio_alloc_kiocb Pass the block_device and operation that we plan to use this bio for to bio_alloc_kiocb to optimize the assignment. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-17-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 12 ++++++++---- block/fops.c | 17 ++++++++--------- include/linux/bio.h | 4 ++-- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/block/bio.c b/block/bio.c index 9afc0c2aca6e..6c3efb0fd12b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1731,7 +1731,9 @@ EXPORT_SYMBOL(bioset_init_from_src); /** * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb * @kiocb: kiocb describing the IO + * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of iovecs to pre-allocate + * @opf: operation and flags for bio * @bs: bio_set to allocate from * * Description: @@ -1742,14 +1744,14 @@ EXPORT_SYMBOL(bioset_init_from_src); * MUST be done from process context, not hard/soft IRQ.
* */ -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, - struct bio_set *bs) +struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) - return bio_alloc_bioset(NULL, nr_vecs, 0, GFP_KERNEL, bs); + return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); cache = per_cpu_ptr(bs->cache, get_cpu()); if (cache->free_list) { @@ -1758,12 +1760,14 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, cache->nr--; put_cpu(); bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); + bio_set_dev(bio, bdev); + bio->bi_opf = opf; bio->bi_pool = bs; bio_set_flag(bio, BIO_PERCPU_CACHE); return bio; } put_cpu(); - bio = bio_alloc_bioset(NULL, nr_vecs, 0, GFP_KERNEL, bs); + bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); bio_set_flag(bio, BIO_PERCPU_CACHE); return bio; } diff --git a/block/fops.c b/block/fops.c index 26bf15c770d2..3a62b8b91275 100644 --- a/block/fops.c +++ b/block/fops.c @@ -190,6 +190,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; + unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; @@ -197,7 +198,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); @@ -223,7 +224,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); for (;;) { - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; @@ -238,11 +238,9 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } if (is_read) { - bio->bi_opf = REQ_OP_READ; if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { - bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_NOWAIT) @@ -259,6 +257,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); + bio_set_dev(bio, bdev); + bio->bi_opf = opf; } blk_finish_plug(&plug); @@ -311,6 +311,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, unsigned int nr_pages) { struct block_device *bdev = iocb->ki_filp->private_data; + bool is_read = iov_iter_rw(iter) == READ; + unsigned int opf = is_read ? 
REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; @@ -320,11 +322,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = iocb->ki_hint; bio->bi_end_io = blkdev_bio_end_io_async; @@ -347,14 +348,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, } dio->size = bio->bi_iter.bi_size; - if (iov_iter_rw(iter) == READ) { - bio->bi_opf = REQ_OP_READ; + if (is_read) { if (iter_is_iovec(iter)) { dio->flags |= DIO_SHOULD_DIRTY; bio_set_pages_dirty(bio); } } else { - bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 2f63ae9a71e1..5c5ada2ebb27 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,8 +408,8 @@ extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask, struct bio_set *bs); -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, - struct bio_set *bs); +struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, struct bio_set *bs); struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); From 07888c665b405b1cd3577ddebfeb74f4717a84c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:05 +0100 Subject: [PATCH 021/127] block: pass a block_device and opf to bio_alloc Pass the block_device and operation that we plan to use this bio for to bio_alloc to optimize the assignment. NULL/0 can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Also move the gfp_mask argument after the nr_vecs argument for a much more logical calling convention matching what most of the kernel does. 
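The mechanical conversion applied throughout the diff below is, in sketch form:

	/* old convention: allocate, then target and tag the bio by hand */
	bio = bio_alloc(GFP_KERNEL, nr_pages);
	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;

	/* new convention: one call covers all three */
	bio = bio_alloc(bdev, nr_pages, REQ_OP_READ, GFP_KERNEL);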
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-18-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 5 +---- block/fops.c | 4 +--- drivers/block/drbd/drbd_receiver.c | 10 ++++------ drivers/block/rnbd/rnbd-srv.c | 5 ++--- drivers/block/xen-blkback/blkback.c | 11 +++++------ drivers/block/zram/zram_drv.c | 11 ++++------- drivers/md/dm-log-writes.c | 21 ++++++++------------- drivers/md/dm-thin.c | 9 ++++----- drivers/md/dm-zoned-metadata.c | 15 ++++++--------- drivers/nvdimm/nd_virtio.c | 6 +++--- drivers/nvme/target/io-cmd-bdev.c | 12 ++++++------ drivers/nvme/target/passthru.c | 5 +++-- drivers/nvme/target/zns.c | 6 +++--- drivers/scsi/ufs/ufshpb.c | 4 ++-- drivers/target/target_core_iblock.c | 5 ++--- fs/btrfs/disk-io.c | 6 +++--- fs/buffer.c | 14 ++++++-------- fs/crypto/bio.c | 13 +++++++------ fs/direct-io.c | 5 +---- fs/erofs/zdata.c | 5 ++--- fs/ext4/page-io.c | 3 +-- fs/ext4/readpage.c | 8 ++++---- fs/gfs2/lops.c | 8 +++----- fs/gfs2/meta_io.c | 4 +--- fs/gfs2/ops_fstype.c | 4 +--- fs/hfsplus/wrapper.c | 4 +--- fs/iomap/buffered-io.c | 16 ++++++++-------- fs/iomap/direct-io.c | 8 ++------ fs/jfs/jfs_logmgr.c | 11 ++--------- fs/jfs/jfs_metapage.c | 9 +++------ fs/mpage.c | 7 +++---- fs/nfs/blocklayout/blocklayout.c | 4 +--- fs/nilfs2/segbuf.c | 4 ++-- fs/ntfs3/fsntfs.c | 8 ++------ fs/ocfs2/cluster/heartbeat.c | 4 +--- fs/squashfs/block.c | 11 ++++++----- fs/xfs/xfs_bio_io.c | 10 ++++------ fs/xfs/xfs_buf.c | 4 +--- fs/zonefs/super.c | 5 ++--- include/linux/bio.h | 5 +++-- kernel/power/swap.c | 5 ++--- mm/page_io.c | 10 ++++------ 42 files changed, 130 insertions(+), 194 deletions(-) diff --git a/block/bio.c b/block/bio.c index 6c3efb0fd12b..b73c9babd583 100644 --- a/block/bio.c +++ b/block/bio.c @@ -347,10 +347,7 @@ EXPORT_SYMBOL(bio_chain); struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, unsigned int opf, gfp_t gfp) { - struct bio *new = bio_alloc(gfp, nr_pages); - - bio_set_dev(new, bdev); - new->bi_opf = opf; + struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); if (bio) { bio_chain(bio, new); diff --git a/block/fops.c b/block/fops.c index 3a62b8b91275..c68359684773 100644 --- a/block/fops.c +++ b/block/fops.c @@ -256,9 +256,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } atomic_inc(&dio->ref); submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); - bio_set_dev(bio, bdev); - bio->bi_opf = opf; + bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fb59b263deee..04e3ec12d8b4 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1279,7 +1279,8 @@ static void one_flush_endio(struct bio *bio) static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) { - struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0, + REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO); struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); if (!octx) { @@ -1297,10 +1298,8 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont octx->device = device; octx->ctx = ctx; - bio_set_dev(bio, device->ldev->backing_bdev); bio->bi_private = octx; bio->bi_end_io = one_flush_endio; - bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; device->flush_jif = jiffies; set_bit(FLUSH_PENDING, &device->flags); @@ 
-1685,11 +1684,10 @@ int drbd_submit_peer_request(struct drbd_device *device, * generated bio, but a bio allocated on behalf of the peer. */ next_bio: - bio = bio_alloc(GFP_NOIO, nr_pages); + bio = bio_alloc(device->ldev->backing_bdev, nr_pages, op | op_flags, + GFP_NOIO); /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, device->ldev->backing_bdev); - bio_set_op_attrs(bio, op, op_flags); bio->bi_private = peer_req; bio->bi_end_io = drbd_peer_request_endio; diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index ff9b38997607..132e950685d5 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -149,7 +149,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(sess_dev->rnbd_dev->bdev, 1, + rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); @@ -159,13 +160,11 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; - bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); bio_set_prio(bio, prio); - bio_set_dev(bio, sess_dev->rnbd_dev->bdev); submit_bio(bio); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6bb2ad769206..d1e26461a64e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1326,13 +1326,13 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, pages[i]->page, seg[i].nsec << 9, seg[i].offset) == 0)) { - bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i)); + bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i), + operation | operation_flags, + GFP_KERNEL); biolist[nbio++] = bio; - bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio->bi_iter.bi_sector = preq.sector_number; - bio_set_op_attrs(bio, operation, operation_flags); } preq.sector_number += seg[i].nsec; @@ -1342,12 +1342,11 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, if (!bio) { BUG_ON(operation_flags != REQ_PREFLUSH); - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(preq.bdev, 0, operation | operation_flags, + GFP_KERNEL); biolist[nbio++] = bio; - bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; - bio_set_op_attrs(bio, operation, operation_flags); } atomic_set(&pending_req->pendcnt, nbio); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 342dbcb3f220..f3fe0ea8aa80 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -616,24 +616,21 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, { struct bio *bio; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(zram->bdev, 1, parent ? 
parent->bi_opf : REQ_OP_READ, + GFP_NOIO); if (!bio) return -ENOMEM; bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); - bio_set_dev(bio, zram->bdev); if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { bio_put(bio); return -EIO; } - if (!parent) { - bio->bi_opf = REQ_OP_READ; + if (!parent) bio->bi_end_io = zram_page_end_io; - } else { - bio->bi_opf = parent->bi_opf; + else bio_chain(bio, parent); - } submit_bio(bio); return 1; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 25f5e8d2d417..c9d036d6bb2e 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -217,14 +217,12 @@ static int write_metadata(struct log_writes_c *lc, void *entry, void *ptr; size_t ret; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(lc->logdev->bdev, 1, REQ_OP_WRITE, GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ? log_end_super : log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); page = alloc_page(GFP_KERNEL); if (!page) { @@ -271,13 +269,12 @@ static int write_inline_data(struct log_writes_c *lc, void *entry, atomic_inc(&lc->io_blocks); - bio = bio_alloc(GFP_KERNEL, bio_pages); + bio = bio_alloc(lc->logdev->bdev, bio_pages, REQ_OP_WRITE, + GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); for (i = 0; i < bio_pages; i++) { pg_datalen = min_t(int, datalen, PAGE_SIZE); @@ -353,13 +350,12 @@ static int log_one_block(struct log_writes_c *lc, goto out; atomic_inc(&lc->io_blocks); - bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt)); + bio = bio_alloc(lc->logdev->bdev, bio_max_segs(block->vec_cnt), + REQ_OP_WRITE, GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); for (i = 0; i < block->vec_cnt; i++) { /* @@ -371,14 +367,13 @@ static int log_one_block(struct log_writes_c *lc, if (ret != block->vecs[i].bv_len) { atomic_inc(&lc->io_blocks); submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, - bio_max_segs(block->vec_cnt - i)); + bio = bio_alloc(lc->logdev->bdev, + bio_max_segs(block->vec_cnt - i), + REQ_OP_WRITE, GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, block->vecs[i].bv_page, block->vecs[i].bv_len, 0); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 411a3f56ed90..f4234d615aa1 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1177,13 +1177,12 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) return; } - discard_parent = bio_alloc(GFP_NOIO, 1); + discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO); discard_parent->bi_end_io = passdown_endio; discard_parent->bi_private = m; - - if (m->maybe_shared) - passdown_double_checking_shared_status(m, discard_parent); - else { + if (m->maybe_shared) + passdown_double_checking_shared_status(m, discard_parent); + else { struct discard_op op; begin_discard(&op, tc, discard_parent); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 5718b83cc718..e5f1eb27ce2e 100644 --- a/drivers/md/dm-zoned-metadata.c 
+++ b/drivers/md/dm-zoned-metadata.c @@ -550,7 +550,8 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, if (!mblk) return ERR_PTR(-ENOMEM); - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(dev->bdev, 1, REQ_OP_READ | REQ_META | REQ_PRIO, + GFP_NOIO); spin_lock(&zmd->mblk_lock); @@ -574,10 +575,8 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, /* Submit read BIO */ bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); @@ -721,15 +720,14 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, if (dmz_bdev_is_dying(dev)) return -EIO; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(dev->bdev, 1, REQ_OP_WRITE | REQ_META | REQ_PRIO, + GFP_NOIO); set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); @@ -751,10 +749,9 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op, if (dmz_bdev_is_dying(dev)) return -EIO; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOIO); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, dev->bdev); - bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); bio_put(bio); diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c index 10351d5b49fa..c6a648fd8744 100644 --- a/drivers/nvdimm/nd_virtio.c +++ b/drivers/nvdimm/nd_virtio.c @@ -105,12 +105,12 @@ int async_pmem_flush(struct nd_region *nd_region, struct bio *bio) * parent bio. Otherwise directly call nd_region flush. 
*/ if (bio && bio->bi_iter.bi_sector != -1) { - struct bio *child = bio_alloc(GFP_ATOMIC, 0); + struct bio *child = bio_alloc(bio->bi_bdev, 0, REQ_PREFLUSH, + GFP_ATOMIC); if (!child) return -ENOMEM; - bio_copy_dev(child, bio); - child->bi_opf = REQ_PREFLUSH; + bio_clone_blkg_association(child, bio); child->bi_iter.bi_sector = -1; bio_chain(child, bio); submit_bio(child); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 70ca9dfc1771..e092af3abc71 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -268,14 +268,15 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) if (nvmet_use_inline_bvec(req)) { bio = &req->b.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio_set_dev(bio, req->ns->bdev); + bio->bi_opf = op; } else { - bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); + bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), op, + GFP_KERNEL); } - bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; - bio->bi_opf = op; blk_start_plug(&plug); if (req->metadata_len) @@ -296,10 +297,9 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) } } - bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); - bio_set_dev(bio, req->ns->bdev); + bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), + op, GFP_KERNEL); bio->bi_iter.bi_sector = sector; - bio->bi_opf = op; bio_chain(bio, prev); submit_bio(prev); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 9e5b89ae29df..38f72968c3fd 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -207,11 +207,12 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) if (nvmet_use_inline_bvec(req)) { bio = &req->p.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio->bi_opf = req_op(rq); } else { - bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt)); + bio = bio_alloc(NULL, bio_max_segs(req->sg_cnt), req_op(rq), + GFP_KERNEL); bio->bi_end_io = bio_put; } - bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 247de74247fa..62c53e8f26d3 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -522,6 +522,7 @@ static void nvmet_bdev_zone_append_bio_done(struct bio *bio) void nvmet_bdev_execute_zone_append(struct nvmet_req *req) { sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); + const unsigned int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; u16 status = NVME_SC_SUCCESS; unsigned int total_len = 0; struct scatterlist *sg; @@ -552,13 +553,12 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) if (nvmet_use_inline_bvec(req)) { bio = &req->z.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio->bi_opf = op; } else { - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); + bio = bio_alloc(req->ns->bdev, req->sg_cnt, op, GFP_KERNEL); } - bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; bio->bi_end_io = nvmet_bdev_zone_append_bio_done; - bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sect; bio->bi_private = req; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index 2d36a0715fca..8970068314ef 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -494,7 +494,7 @@ 
static struct ufshpb_req *ufshpb_get_map_req(struct ufshpb_lu *hpb, if (!map_req) return NULL; - bio = bio_alloc(GFP_KERNEL, hpb->pages_per_srgn); + bio = bio_alloc(NULL, hpb->pages_per_srgn, 0, GFP_KERNEL); if (!bio) { ufshpb_put_req(hpb, map_req); return NULL; @@ -2050,7 +2050,7 @@ static int ufshpb_pre_req_mempool_init(struct ufshpb_lu *hpb) INIT_LIST_HEAD(&pre_req->list_req); pre_req->req = NULL; - pre_req->bio = bio_alloc(GFP_KERNEL, 1); + pre_req->bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); if (!pre_req->bio) goto release_mem; diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3c92ba374819..87ede165ddba 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -415,10 +415,9 @@ iblock_execute_sync_cache(struct se_cmd *cmd) if (immed) target_complete_cmd(cmd, SAM_STAT_GOOD); - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(ib_dev->ibd_bd, 0, REQ_OP_WRITE | REQ_PREFLUSH, + GFP_KERNEL); bio->bi_end_io = iblock_end_io_flush; - bio_set_dev(bio, ib_dev->ibd_bd); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; if (!immed) bio->bi_private = cmd; submit_bio(bio); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..f45aa506f9a6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4029,8 +4029,9 @@ static int write_dev_supers(struct btrfs_device *device, * to do I/O, so we don't lose the ability to do integrity * checking. */ - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, device->bdev); + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; @@ -4042,7 +4043,6 @@ static int write_dev_supers(struct btrfs_device *device, * go down lazy and there's a short window where the on-disk * copies might still contain the older version. */ - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd371..a17c386a142c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3024,12 +3024,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); - bio = bio_alloc(GFP_NOIO, 1); + if (buffer_meta(bh)) + op_flags |= REQ_META; + if (buffer_prio(bh)) + op_flags |= REQ_PRIO; + + bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -3038,12 +3042,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - if (buffer_meta(bh)) - op_flags |= REQ_META; - if (buffer_prio(bh)) - op_flags |= REQ_PRIO; - bio_set_op_attrs(bio, op, op_flags); - /* Take care of bh's that straddle the end of the device */ guard_bio_eod(bio); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index bfc2a5b74ed3..755e985a42e0 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -54,7 +54,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int num_pages = 0; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. 
*/ - bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); + bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, + GFP_NOFS); while (len) { unsigned int blocks_this_page = min(len, blocks_per_page); @@ -62,10 +63,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, if (num_pages == 0) { fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); - bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - SECTOR_SHIFT); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); } ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); if (WARN_ON(ret != bytes_this_page)) { @@ -82,6 +81,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, if (err) goto out; bio_reset(bio); + bio_set_dev(bio, inode->i_sb->s_bdev); + bio->bi_opf = REQ_OP_WRITE; num_pages = 0; } } @@ -150,12 +151,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return -EINVAL; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ - bio = bio_alloc(GFP_NOFS, nr_pages); + bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS); do { - bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - 9); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); i = 0; offset = 0; @@ -183,6 +182,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, if (err) goto out; bio_reset(bio); + bio_set_dev(bio, inode->i_sb->s_bdev); + bio->bi_opf = REQ_OP_WRITE; } while (len != 0); err = 0; out: diff --git a/fs/direct-io.c b/fs/direct-io.c index 654443558047..38bca4980a1c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -396,11 +396,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * bio_alloc() is guaranteed to return a bio when allowed to sleep and * we request a valid number of vectors. */ - bio = bio_alloc(GFP_KERNEL, nr_vecs); - - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL); bio->bi_iter.bi_sector = first_sector; - bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 498b7666efe8..db7de2dbac73 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1371,15 +1371,14 @@ submit_bio_retry: } if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, mdev.m_bdev); last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; - bio->bi_opf = REQ_OP_READ; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1d370364230e..125398226873 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -398,10 +398,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). 
*/ - bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, 0, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 4cd62f1d848c..1aa26d6634fc 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -365,15 +365,15 @@ int ext4_mpage_readpages(struct inode *inode, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ - bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages)); + bio = bio_alloc(bdev, bio_max_segs(nr_pages), + REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, page->index); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, - rac ? REQ_RAHEAD : 0); + if (rac) + bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ca0bb3a73912..4ae1eefae616 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -265,10 +265,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO); bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; - bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -489,10 +488,9 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) { struct bio *new; - new = bio_alloc(GFP_NOIO, nr_iovecs); - bio_copy_dev(new, prev); + new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO); + bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; new->bi_write_hint = prev->bi_write_hint; bio_chain(new, prev); submit_bio(prev); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 72d30a682ece..a580b90b7522 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -222,9 +222,8 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], struct buffer_head *bh = *bhs; struct bio *bio; - bio = bio_alloc(GFP_NOIO, num); + bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); while (num > 0) { bh = *bhs; if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { @@ -235,7 +234,6 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], num--; } bio->bi_end_io = gfs2_meta_read_endio; - bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); } } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7f8410d8fdc1..c9b423c874a3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -251,14 +251,12 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) ClearPageDirty(page); lock_page(page); - bio = bio_alloc(GFP_NOFS, 1); + bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio_set_dev(bio, sb->s_bdev); bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); 
submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 4688cc7b3692..0b8ad6586df5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -63,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, offset = start & (io_size - 1); sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, sb->s_bdev); - bio_set_op_attrs(bio, op, op_flags); if (op != WRITE && data) *data = (u8 *)buf + offset; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 340d373cb1bf..70f3657a6ec0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -290,19 +290,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs)); + ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), + REQ_OP_READ, gfp); /* * If the bio_alloc fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates * what do_mpage_readpage does. */ - if (!ctx->bio) - ctx->bio = bio_alloc(orig_gfp, 1); - ctx->bio->bi_opf = REQ_OP_READ; + if (!ctx->bio) { + ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, + orig_gfp); + } if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; - bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = iomap_read_end_io; bio_add_folio(ctx->bio, folio, plen, poff); } @@ -1226,10 +1227,9 @@ iomap_chain_bio(struct bio *prev) { struct bio *new; - new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); - bio_copy_dev(new, prev);/* also copies over blkcg information */ + new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); + bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; new->bi_write_hint = prev->bi_write_hint; bio_chain(prev, new); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 03ea367df19a..e2ba13645ef2 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -183,15 +183,13 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, iter->iomap.bdev); + bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, flags); iomap_dio_submit_bio(iter, dio, bio, pos); } @@ -309,14 +307,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio = bio_alloc(GFP_KERNEL, nr_pages); - bio_set_dev(bio, iomap->bdev); + bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - bio->bi_opf = bio_opf; ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 78fd136ac13b..997c81fcea34 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1980,17 +1980,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) 
bp->l_flag |= lbmREAD; - bio = bio_alloc(GFP_NOFS, 1); - + bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_set_dev(bio, log->bdev); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2125,16 +2121,13 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO"); - bio = bio_alloc(GFP_NOFS, 1); + bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_set_dev(bio, log->bdev); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 104ae698443e..fde1a9cf902e 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -417,12 +417,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) } len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, inode->i_sb->s_bdev); + bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); /* Don't call bio_add_page yet, we may add to this vec */ bio_offset = offset; @@ -497,13 +495,12 @@ static int metapage_readpage(struct file *fp, struct page *page) if (bio) submit_bio(bio); - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, inode->i_sb->s_bdev); + bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ, + GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, 0); len = xlen << inode->i_blkbits; offset = block_offset << inode->i_blkbits; if (bio_add_page(bio, page, len, offset) < len) diff --git a/fs/mpage.c b/fs/mpage.c index 06e95d777e94..dbfc02e23d97 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -273,10 +273,10 @@ alloc_new: page)) goto out; } - args->bio = bio_alloc(gfp, bio_max_segs(args->nr_pages)); + args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), 0, + gfp); if (args->bio == NULL) goto confused; - bio_set_dev(args->bio, bdev); args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); } @@ -586,8 +586,7 @@ alloc_new: page, wbc)) goto out; } - bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, BIO_MAX_VECS, 0, GFP_NOFS); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); wbc_init_bio(wbc, bio); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 38e063af7e98..79a8b451791f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -154,12 +154,10 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, retry: if (!bio) { - bio = bio_alloc(GFP_NOIO, bio_max_segs(npg)); + bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO); bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT; - bio_set_dev(bio, map->bdev); bio->bi_end_io = end_io; bio->bi_private = par; - bio_set_op_attrs(bio, rw, 0); } if (bio_add_page(bio, 
page, *len, offset) < *len) { bio = bl_submit_bio(bio); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 53b7c6d21cdd..4f71faacd825 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -391,8 +391,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { - wi->bio = bio_alloc(GFP_NOIO, wi->nr_vecs); - bio_set_dev(wi->bio, wi->nilfs->ns_bdev); + wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs, 0, + GFP_NOIO); wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) << (wi->nilfs->ns_blocksize_bits - 9); } diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 4a255e21ecf5..0660a07c5a96 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1485,15 +1485,13 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; new_bio: - new = bio_alloc(GFP_NOFS, nr_pages - page_idx); + new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = lbo >> 9; - bio->bi_opf = op; while (len) { off = vbo & (PAGE_SIZE - 1); @@ -1584,14 +1582,12 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run) lbo = (u64)lcn << cluster_bits; len = (u64)clen << cluster_bits; new_bio: - new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); + new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE; bio->bi_iter.bi_sector = lbo >> 9; for (;;) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a17be1618bf7..ea0e70c0fce0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -518,7 +518,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, 16); + bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -527,10 +527,8 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, /* Must put everything in 512 byte sectors for the bio... 
*/ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio_set_dev(bio, reg->hr_bdev); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - bio_set_op_attrs(bio, op, op_flags); vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2db8bcf7ff85..622c844f6d11 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -86,16 +86,17 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_VECS) - bio = bio_alloc(GFP_NOIO, page_count); - else + if (page_count <= BIO_MAX_VECS) { + bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO); + } else { bio = bio_kmalloc(GFP_NOIO, page_count); + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = REQ_OP_READ; + } if (!bio) return -ENOMEM; - bio_set_dev(bio, sb->s_bdev); - bio->bi_opf = READ; bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 667e297f59b1..eff4a9f21dcf 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -61,10 +61,9 @@ xfs_rw_bdev( if (is_vmalloc && op == REQ_OP_WRITE) flush_kernel_vmap_range(data, count); - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, + GFP_KERNEL); bio->bi_iter.bi_sector = sector; - bio->bi_opf = op | REQ_META | REQ_SYNC; do { struct page *page = kmem_to_page(data); @@ -74,10 +73,9 @@ xfs_rw_bdev( while (bio_add_page(bio, page, len, off) != len) { struct bio *prev = bio; - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_copy_dev(bio, prev); + bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), + prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); - bio->bi_opf = prev->bi_opf; bio_chain(prev, bio); submit_bio(prev); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b45e0d50a405..ae87fd95b17e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1440,12 +1440,10 @@ next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = bio_max_segs(total_nr_pages); - bio = bio_alloc(GFP_NOIO, nr_pages); - bio_set_dev(bio, bp->b_target->bt_bdev); + bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - bio->bi_opf = op; for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b76dfb310ab6..c0fc2c326dce 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -692,12 +692,11 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) if (!nr_pages) return 0; - bio = bio_alloc(GFP_NOFS, nr_pages); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; bio->bi_write_hint = iocb->ki_hint; bio->bi_ioprio = iocb->ki_ioprio; - bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; if (iocb->ki_flags & IOCB_DSYNC) bio->bi_opf |= REQ_FUA; diff --git a/include/linux/bio.h b/include/linux/bio.h index 5c5ada2ebb27..be6ac92913d4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -418,9 +418,10 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) +static 
inline struct bio *bio_alloc(struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask) { - return bio_alloc_bioset(NULL, nr_iovecs, 0, gfp_mask, &fs_bio_set); + return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set); } void submit_bio(struct bio *bio); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f1bd03129575..6c4f983cbacc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -276,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); + bio = bio_alloc(hib_resume_bdev, 1, op | op_flags, + GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio_set_dev(bio, hib_resume_bdev); - bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { pr_err("Adding page to bio failed at %llu\n", diff --git a/mm/page_io.c b/mm/page_io.c index 0bf8e40f4e57..61c792f916fa 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,10 +338,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = bio_alloc(GFP_NOIO, 1); - bio_set_dev(bio, sis->bdev); + bio = bio_alloc(sis->bdev, 1, + REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), + GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); bio->bi_end_io = end_write_func; bio_add_page(bio, page, thp_size(page), 0); @@ -403,9 +403,7 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, sis->bdev); - bio->bi_opf = REQ_OP_READ; + bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; bio_add_page(bio, page, thp_size(page), 0); From 49add4966d79244013fce35f95c6833fae82b8b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:06 +0100 Subject: [PATCH 022/127] block: pass a block_device and opf to bio_init Pass the block_device that we plan to use this bio for and the operation to bio_init to optimize the assignment. A NULL block_device can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. 
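For illustration only (not part of the patch itself), a minimal sketch of the caller-side conversion; "bdev" and "bvec" here stand in for whatever block device and bio_vec the caller already holds:

	struct bio bio;
	struct bio_vec bvec;

	/* old style: initialize, then set the device and operation separately */
	bio_init(&bio, &bvec, 1);
	bio_set_dev(&bio, bdev);
	bio.bi_opf = REQ_OP_READ;

	/* new style: one call covers all three steps */
	bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);

As the block/bio.c hunk below shows, when a non-NULL bdev is passed the new bio_init() can also associate the bio with the right blkcg immediately, which is what makes the up-front assignment an optimization.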
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-19-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 27 +++++++++++++-------------- block/blk-flush.c | 4 +--- block/blk-zoned.c | 5 +---- block/fops.c | 18 +++++++++--------- drivers/block/floppy.c | 4 +--- drivers/block/zram/zram_drv.c | 5 ++--- drivers/md/bcache/io.c | 3 ++- drivers/md/bcache/journal.c | 4 +--- drivers/md/bcache/movinggc.c | 4 ++-- drivers/md/bcache/request.c | 2 +- drivers/md/bcache/super.c | 8 +++----- drivers/md/bcache/writeback.c | 4 ++-- drivers/md/dm.c | 5 ++--- drivers/md/md-multipath.c | 2 +- drivers/md/md.c | 8 +++----- drivers/md/raid5-cache.c | 2 +- drivers/md/raid5-ppl.c | 2 +- drivers/md/raid5.c | 4 ++-- drivers/nvme/target/io-cmd-bdev.c | 10 ++++------ drivers/nvme/target/passthru.c | 4 ++-- drivers/nvme/target/zns.c | 4 ++-- fs/iomap/buffered-io.c | 4 +--- fs/xfs/xfs_bio_io.c | 4 +--- fs/xfs/xfs_log.c | 14 +++++++------- fs/zonefs/super.c | 4 +--- include/linux/bio.h | 4 ++-- 26 files changed, 68 insertions(+), 91 deletions(-) diff --git a/block/bio.c b/block/bio.c index b73c9babd583..b2133d86e885 100644 --- a/block/bio.c +++ b/block/bio.c @@ -249,12 +249,12 @@ static void bio_free(struct bio *bio) * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ -void bio_init(struct bio *bio, struct bio_vec *table, - unsigned short max_vecs) +void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, + unsigned short max_vecs, unsigned int opf) { bio->bi_next = NULL; - bio->bi_bdev = NULL; - bio->bi_opf = 0; + bio->bi_bdev = bdev; + bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; @@ -268,6 +268,8 @@ void bio_init(struct bio *bio, struct bio_vec *table, #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->bi_issue.value = 0; + if (bdev) + bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif @@ -504,17 +506,14 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, nr_vecs); + bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { - bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); } else { - bio_init(bio, NULL, 0); + bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; - if (bdev) - bio_set_dev(bio, bdev); - bio->bi_opf = opf; return bio; err_free: @@ -542,7 +541,8 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); if (unlikely(!bio)) return NULL; - bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); + bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, + 0); bio->bi_pool = NULL; return bio; } @@ -1756,9 +1756,8 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, cache->free_list = bio->bi_next; cache->nr--; put_cpu(); - bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); - bio_set_dev(bio, bdev); - bio->bi_opf = opf; + bio_init(bio, bdev, nr_vecs ? 
bio->bi_inline_vecs : NULL, + nr_vecs, opf); bio->bi_pool = bs; bio_set_flag(bio, BIO_PERCPU_CACHE); return bio; diff --git a/block/blk-flush.c b/block/blk-flush.c index e4df894189ce..c68968724870 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -460,9 +460,7 @@ int blkdev_issue_flush(struct block_device *bdev) { struct bio bio; - bio_init(&bio, NULL, 0); - bio_set_dev(&bio, bdev); - bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 5ab755d792c8..602bef54c813 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -238,10 +238,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) { struct bio bio; - bio_init(&bio, NULL, 0); - bio_set_dev(&bio, bdev); - bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; - + bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); return submit_bio_wait(&bio); } diff --git a/block/fops.c b/block/fops.c index c68359684773..3696665e586a 100644 --- a/block/fops.c +++ b/block/fops.c @@ -75,8 +75,13 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, return -ENOMEM; } - bio_init(&bio, vecs, nr_pages); - bio_set_dev(&bio, bdev); + if (iov_iter_rw(iter) == READ) { + bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); + if (iter_is_iovec(iter)) + should_dirty = true; + } else { + bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); + } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; @@ -88,14 +93,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, goto out; ret = bio.bi_iter.bi_size; - if (iov_iter_rw(iter) == READ) { - bio.bi_opf = REQ_OP_READ; - if (iter_is_iovec(iter)) - should_dirty = true; - } else { - bio.bi_opf = dio_bio_write_op(iocb); + if (iov_iter_rw(iter) == WRITE) task_io_account_write(ret); - } + if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index e611411a934c..19c2d0327e15 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4129,15 +4129,13 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) cbdata.drive = drive; - bio_init(&bio, &bio_vec, 1); - bio_set_dev(&bio, bdev); + bio_init(&bio, bdev, &bio_vec, 1, REQ_OP_READ); bio_add_page(&bio, page, block_size(bdev), 0); bio.bi_iter.bi_sector = 0; bio.bi_flags |= (1 << BIO_QUIET); bio.bi_private = &cbdata; bio.bi_end_io = floppy_rb0_cb; - bio_set_op_attrs(&bio, REQ_OP_READ, 0); init_completion(&cbdata.complete); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f3fe0ea8aa80..a3a5e1e71326 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -743,10 +743,9 @@ static ssize_t writeback_store(struct device *dev, continue; } - bio_init(&bio, &bio_vec, 1); - bio_set_dev(&bio, zram->bdev); + bio_init(&bio, zram->bdev, &bio_vec, 1, + REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); - bio.bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(&bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 9c6f9ec55b72..020712c5203f 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,7 +26,8 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = 
&b->bio; - bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->cache->sb)); + bio_init(bio, NULL, bio->bi_inline_vecs, + meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 61bd79babf7a..6d26c5b06e2b 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -611,11 +611,9 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, bio->bi_inline_vecs, 1); - bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); + bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index b9c3d27ec093..99499d1f6e66 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,8 +79,8 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, bio->bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS)); + bio_init(bio, NULL, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index c4b7e434de8a..d4b98ebffd94 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -685,7 +685,7 @@ static void do_bio_hook(struct search *s, { struct bio *bio = &s->bio.bio; - bio_init(bio, NULL, 0); + bio_init(bio, NULL, NULL, 0, 0); __bio_clone_fast(bio, orig_bio); /* * bi_end_io can be set separately somewhere else, e.g. 
the diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c31a62b963f0..bf3de149d3c9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -342,8 +342,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) down(&dc->sb_write_mutex); closure_init(cl, parent); - bio_init(bio, dc->sb_bv, 1); - bio_set_dev(bio, dc->bdev); + bio_init(bio, dc->bdev, dc->sb_bv, 1, 0); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; @@ -386,8 +385,7 @@ void bcache_write_super(struct cache_set *c) if (ca->sb.version < version) ca->sb.version = version; - bio_init(bio, ca->sb_bv, 1); - bio_set_dev(bio, ca->bdev); + bio_init(bio, ca->bdev, ca->sb_bv, 1, 0); bio->bi_end_io = write_super_endio; bio->bi_private = ca; @@ -2239,7 +2237,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8); + bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); /* * when ca->sb.njournal_buckets is not zero, journal exists, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c7560f66dca8..d42301e6309d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -292,8 +292,8 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, bio->bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)); + bio_init(bio, NULL, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 069e29013b6b..fa596b654c99 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1303,9 +1303,8 @@ static int __send_empty_flush(struct clone_info *ci) * need to reference it after submit. It's just used as * the basis for the clone(s). 
*/ - bio_init(&flush_bio, NULL, 0); - flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - bio_set_dev(&flush_bio, ci->io->md->disk->part0); + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); ci->bio = &flush_bio; ci->sector_count = 0; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index e7d6486f090f..5e15940634d8 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -121,7 +121,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) } multipath = conf->multipaths + mp_bh->path; - bio_init(&mp_bh->bio, NULL, 0); + bio_init(&mp_bh->bio, NULL, NULL, 0, 0); __bio_clone_fast(&mp_bh->bio, bio); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; diff --git a/drivers/md/md.c b/drivers/md/md.c index 40fc1f7e65c5..0a89f072dae0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -998,13 +998,11 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct bio bio; struct bio_vec bvec; - bio_init(&bio, &bvec, 1); - if (metadata_op && rdev->meta_bdev) - bio_set_dev(&bio, rdev->meta_bdev); + bio_init(&bio, rdev->meta_bdev, &bvec, 1, op | op_flags); else - bio_set_dev(&bio, rdev->bdev); - bio.bi_opf = op | op_flags; + bio_init(&bio, rdev->bdev, &bvec, 1, op | op_flags); + if (metadata_op) bio.bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 66313adf9987..98b9ca11c28d 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3108,7 +3108,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) INIT_LIST_HEAD(&log->io_end_ios); INIT_LIST_HEAD(&log->flushing_ios); INIT_LIST_HEAD(&log->finished_ios); - bio_init(&log->flush_bio, NULL, 0); + bio_init(&log->flush_bio, NULL, NULL, 0, 0); log->io_kc = KMEM_CACHE(r5l_io_unit, 0); if (!log->io_kc) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 054d3bb252d4..3446797fa0ac 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -250,7 +250,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, INIT_LIST_HEAD(&io->stripe_list); atomic_set(&io->pending_stripes, 0); atomic_set(&io->pending_flushes, 0); - bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); + bio_init(&io->bio, NULL, io->biovec, PPL_IO_INLINE_BVECS, 0); pplhdr = page_address(io->header_page); clear_page(pplhdr); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ffe720c73b0a..a9dcc5bc9c32 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2310,8 +2310,8 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, for (i = 0; i < disks; i++) { struct r5dev *dev = &sh->dev[i]; - bio_init(&dev->req, &dev->vec, 1); - bio_init(&dev->rreq, &dev->rvec, 1); + bio_init(&dev->req, NULL, &dev->vec, 1, 0); + bio_init(&dev->rreq, NULL, &dev->rvec, 1, 0); } if (raid5_has_ppl(conf)) { diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index e092af3abc71..95c2bbb0b2f5 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -267,9 +267,8 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) if (nvmet_use_inline_bvec(req)) { bio = &req->b.inline_bio; - bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); - bio_set_dev(bio, req->ns->bdev); - bio->bi_opf = op; + bio_init(bio, req->ns->bdev, req->inline_bvec, + ARRAY_SIZE(req->inline_bvec), op); } else { bio = 
bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), op, GFP_KERNEL); @@ -328,11 +327,10 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) if (!nvmet_check_transfer_len(req, 0)) return; - bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); - bio_set_dev(bio, req->ns->bdev); + bio_init(bio, req->ns->bdev, req->inline_bvec, + ARRAY_SIZE(req->inline_bvec), REQ_OP_WRITE | REQ_PREFLUSH); bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(bio); } diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 38f72968c3fd..a810bf569fff 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -206,8 +206,8 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) if (nvmet_use_inline_bvec(req)) { bio = &req->p.inline_bio; - bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); - bio->bi_opf = req_op(rq); + bio_init(bio, NULL, req->inline_bvec, + ARRAY_SIZE(req->inline_bvec), req_op(rq)); } else { bio = bio_alloc(NULL, bio_max_segs(req->sg_cnt), req_op(rq), GFP_KERNEL); diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 62c53e8f26d3..3e421217a7ad 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -552,8 +552,8 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) if (nvmet_use_inline_bvec(req)) { bio = &req->z.inline_bio; - bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); - bio->bi_opf = op; + bio_init(bio, req->ns->bdev, req->inline_bvec, + ARRAY_SIZE(req->inline_bvec), op); } else { bio = bio_alloc(req->ns->bdev, req->sg_cnt, op, GFP_KERNEL); } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 70f3657a6ec0..491534e90861 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -549,10 +549,8 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, struct bio_vec bvec; struct bio bio; - bio_init(&bio, &bvec, 1); - bio.bi_opf = REQ_OP_READ; + bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_set_dev(&bio, iomap->bdev); bio_add_folio(&bio, folio, plen, poff); return submit_bio_wait(&bio); } diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index eff4a9f21dcf..32fa02945f73 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -36,9 +36,7 @@ xfs_flush_bdev_async( return; } - bio_init(bio, NULL, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + bio_init(bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); bio->bi_private = done; bio->bi_end_io = xfs_flush_bdev_async_endio; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 89fec9a18c34..16f9edbda4eb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1883,19 +1883,19 @@ xlog_write_iclog( return; } - bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); - bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); - iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; - iclog->ic_bio.bi_end_io = xlog_bio_end_io; - iclog->ic_bio.bi_private = iclog; - /* * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more * IOs coming immediately after this one. This prevents the block layer * writeback throttle from throttling log writes behind background * metadata writeback and causing priority inversions. 
*/ - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; + bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec, + howmany(count, PAGE_SIZE), + REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE); + iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; + iclog->ic_bio.bi_end_io = xlog_bio_end_io; + iclog->ic_bio.bi_private = iclog; + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) { iclog->ic_bio.bi_opf |= REQ_PREFLUSH; /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index c0fc2c326dce..d331b52592a0 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1540,10 +1540,8 @@ static int zonefs_read_super(struct super_block *sb) if (!page) return -ENOMEM; - bio_init(&bio, &bio_vec, 1); + bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = 0; - bio.bi_opf = REQ_OP_READ; - bio_set_dev(&bio, sb->s_bdev); bio_add_page(&bio, page, PAGE_SIZE, 0); ret = submit_bio_wait(&bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index be6ac92913d4..41bedf727f59 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -456,8 +456,8 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) struct request_queue; extern int submit_bio_wait(struct bio *bio); -extern void bio_init(struct bio *bio, struct bio_vec *table, - unsigned short max_vecs); +void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, + unsigned short max_vecs, unsigned int opf); extern void bio_uninit(struct bio *); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); From a7c50c940477bae89fb2b4f51bd969a2d95d7512 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:07 +0100 Subject: [PATCH 023/127] block: pass a block_device and opf to bio_reset Pass the block_device that we plan to use this bio for and the operation to bio_reset to optimize the assignment. A NULL block_device can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-20-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 6 +++++- drivers/block/pktcdvd.c | 8 ++------ drivers/md/bcache/journal.c | 12 ++++-------- drivers/md/bcache/request.c | 4 ++-- drivers/md/raid1.c | 5 ++--- drivers/md/raid10.c | 8 +++----- drivers/md/raid5-cache.c | 9 +++------ drivers/md/raid5.c | 8 ++++---- fs/btrfs/disk-io.c | 4 +--- fs/crypto/bio.c | 8 ++------ include/linux/bio.h | 9 +-------- 11 files changed, 29 insertions(+), 52 deletions(-) diff --git a/block/bio.c b/block/bio.c index b2133d86e885..03cefe81950f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -295,6 +295,8 @@ EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset + * @bdev: block device to use the bio for + * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly @@ -302,11 +304,13 @@ EXPORT_SYMBOL(bio_init); * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. 
*/ -void bio_reset(struct bio *bio) +void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); + bio->bi_bdev = bdev; + bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 2b6b70a39e76..3aa595442946 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1020,9 +1020,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) continue; bio = pkt->r_bios[f]; - bio_reset(bio); + bio_reset(bio, pd->bdev, REQ_OP_READ); bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); - bio_set_dev(bio, pd->bdev); bio->bi_end_io = pkt_end_io_read; bio->bi_private = pkt; @@ -1034,7 +1033,6 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) BUG(); atomic_inc(&pkt->io_wait); - bio_set_op_attrs(bio, REQ_OP_READ, 0); pkt_queue_bio(pd, bio); frames_read++; } @@ -1235,9 +1233,8 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) { int f; - bio_reset(pkt->w_bio); + bio_reset(pkt->w_bio, pd->bdev, REQ_OP_WRITE); pkt->w_bio->bi_iter.bi_sector = pkt->sector; - bio_set_dev(pkt->w_bio, pd->bdev); pkt->w_bio->bi_end_io = pkt_end_io_packet_write; pkt->w_bio->bi_private = pkt; @@ -1270,7 +1267,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) /* Start the write request */ atomic_set(&pkt->io_wait, 1); - bio_set_op_attrs(pkt->w_bio, REQ_OP_WRITE, 0); pkt_queue_bio(pd, pkt->w_bio); } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6d26c5b06e2b..7c2ca52ca3e4 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -53,14 +53,12 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, reread: left = ca->sb.bucket_size - offset; len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); - bio_reset(bio); + bio_reset(bio, ca->bdev, REQ_OP_READ); bio->bi_iter.bi_sector = bucket + offset; - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, 0); bch_bio_map(bio, data); closure_bio_submit(ca->set, bio, &cl); @@ -771,16 +769,14 @@ static void journal_write_unlocked(struct closure *cl) atomic_long_add(sectors, &ca->meta_sectors_written); - bio_reset(bio); + bio_reset(bio, ca->bdev, REQ_OP_WRITE | + REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); + bch_bio_map(bio, w->data); bio->bi_iter.bi_sector = PTR_OFFSET(k, i); - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = w; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); - bch_bio_map(bio, w->data); trace_bcache_journal_write(bio, w->data->keys); bio_list_add(&list, bio); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index d4b98ebffd94..7ba59d08ed87 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -831,11 +831,11 @@ static void cached_dev_read_done(struct closure *cl) */ if (s->iop.bio) { - bio_reset(s->iop.bio); + bio_reset(s->iop.bio, s->cache_miss->bi_bdev, REQ_OP_READ); s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; - bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + bio_clone_blkg_association(s->iop.bio, s->cache_miss); bch_bio_map(s->iop.bio, NULL); 
bio_copy_data(s->cache_miss, s->iop.bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 43276f8fdc81..e7710fb5befb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2166,11 +2166,10 @@ static void process_checks(struct r1bio *r1_bio) continue; /* fixup the bio for reuse, but preserve errno */ status = b->bi_status; - bio_reset(b); + bio_reset(b, conf->mirrors[i].rdev->bdev, REQ_OP_READ); b->bi_status = status; b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; - bio_set_dev(b, conf->mirrors[i].rdev->bdev); b->bi_end_io = end_sync_read; rp->raid_bio = r1_bio; b->bi_private = rp; @@ -2651,7 +2650,7 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) for (i = conf->poolinfo->raid_disks; i--; ) { bio = r1bio->bios[i]; rps = bio->bi_private; - bio_reset(bio); + bio_reset(bio, NULL, 0); bio->bi_private = rps; } r1bio->master_bio = NULL; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index cb7c58050708..da07bcbc06d0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2422,7 +2422,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * bi_vecs, as the read request might have corrupted these */ rp = get_resync_pages(tbio); - bio_reset(tbio); + bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE); md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); @@ -2430,7 +2430,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tbio->bi_private = rp; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; tbio->bi_end_io = end_sync_write; - bio_set_op_attrs(tbio, REQ_OP_WRITE, 0); bio_copy_data(tbio, fbio); @@ -2441,7 +2440,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) tbio->bi_opf |= MD_FAILFAST; tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; - bio_set_dev(tbio, conf->mirrors[d].rdev->bdev); submit_bio_noacct(tbio); } @@ -3160,12 +3158,12 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) for (i = 0; i < nalloc; i++) { bio = r10bio->devs[i].bio; rp = bio->bi_private; - bio_reset(bio); + bio_reset(bio, NULL, 0); bio->bi_private = rp; bio = r10bio->devs[i].repl_bio; if (bio) { rp = bio->bi_private; - bio_reset(bio); + bio_reset(bio, NULL, 0); bio->bi_private = rp; } } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 98b9ca11c28d..86e2bb89d9c7 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1301,10 +1301,9 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) if (!do_flush) return; - bio_reset(&log->flush_bio); - bio_set_dev(&log->flush_bio, log->rdev->bdev); + bio_reset(&log->flush_bio, log->rdev->bdev, + REQ_OP_WRITE | REQ_PREFLUSH); log->flush_bio.bi_end_io = r5l_log_flush_endio; - log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); } @@ -1678,9 +1677,7 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx, sector_t offset) { - bio_reset(ctx->ra_bio); - bio_set_dev(ctx->ra_bio, log->rdev->bdev); - bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); + bio_reset(ctx->ra_bio, log->rdev->bdev, REQ_OP_READ); ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; ctx->valid_pages = 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a9dcc5bc9c32..7c119208a214 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2677,7 +2677,7 @@ static void raid5_end_read_request(struct bio * bi) (unsigned long long)sh->sector, 
i, atomic_read(&sh->count), bi->bi_status); if (i == disks) { - bio_reset(bi); + bio_reset(bi, NULL, 0); BUG(); return; } @@ -2785,7 +2785,7 @@ static void raid5_end_read_request(struct bio * bi) } } rdev_dec_pending(rdev, conf->mddev); - bio_reset(bi); + bio_reset(bi, NULL, 0); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); @@ -2823,7 +2823,7 @@ static void raid5_end_write_request(struct bio *bi) (unsigned long long)sh->sector, i, atomic_read(&sh->count), bi->bi_status); if (i == disks) { - bio_reset(bi); + bio_reset(bi, NULL, 0); BUG(); return; } @@ -2860,7 +2860,7 @@ static void raid5_end_write_request(struct bio *bi) if (sh->batch_head && bi->bi_status && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); - bio_reset(bi); + bio_reset(bi, NULL, 0); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f45aa506f9a6..505ba21230b1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4154,10 +4154,8 @@ static void write_dev_flush(struct btrfs_device *device) return; #endif - bio_reset(bio); + bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; - bio_set_dev(bio, device->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 755e985a42e0..2217fe5ece6f 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -80,9 +80,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, err = submit_bio_wait(bio); if (err) goto out; - bio_reset(bio); - bio_set_dev(bio, inode->i_sb->s_bdev); - bio->bi_opf = REQ_OP_WRITE; + bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); num_pages = 0; } } @@ -181,9 +179,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, err = submit_bio_wait(bio); if (err) goto out; - bio_reset(bio); - bio_set_dev(bio, inode->i_sb->s_bdev); - bio->bi_opf = REQ_OP_WRITE; + bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); } while (len != 0); err = 0; out: diff --git a/include/linux/bio.h b/include/linux/bio.h index 41bedf727f59..18cfe5bb41ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -459,7 +459,7 @@ extern int submit_bio_wait(struct bio *bio); void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, unsigned int opf); extern void bio_uninit(struct bio *); -extern void bio_reset(struct bio *); +void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf); void bio_chain(struct bio *, struct bio *); int bio_add_page(struct bio *, struct page *, unsigned len, unsigned off); @@ -517,13 +517,6 @@ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) bio_associate_blkg(bio); } -static inline void bio_copy_dev(struct bio *dst, struct bio *src) -{ - bio_clear_flag(dst, BIO_REMAPPED); - dst->bi_bdev = src->bi_bdev; - bio_clone_blkg_association(dst, src); -} - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * From b1f866b013e6e5583f2f0bf4a61d13eddb9a1799 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 08:05:48 +0100 Subject: [PATCH 024/127] block: remove blk_needs_flush_plug blk_needs_flush_plug fails to account for the cb_list, which needs flushing as well. 
Remove it and just check if there is a plug instead of poking into the internals of the plug structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220127070549.1377856-1-hch@lst.de Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/blkdev.h | 13 ------------- kernel/exit.c | 2 +- kernel/sched/core.c | 2 +- 4 files changed, 3 insertions(+), 16 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f8d7fe6db989..f4ce38f6fc31 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2301,7 +2301,7 @@ void wakeup_flusher_threads(enum wb_reason reason) /* * If we are expecting writeback progress we must submit plugged IO. */ - if (blk_needs_flush_plug(current)) + if (current->plug) blk_flush_plug(current->plug, true); rcu_read_lock(); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99a4384bb8a5..f902a1c2fac0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1055,14 +1055,6 @@ extern void blk_finish_plug(struct blk_plug *); void blk_flush_plug(struct blk_plug *plug, bool from_schedule); -static inline bool blk_needs_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - return plug && - (plug->mq_list || !list_empty(&plug->cb_list)); -} - int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -1086,11 +1078,6 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } -static inline bool blk_needs_flush_plug(struct task_struct *tsk) -{ - return false; -} - static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/kernel/exit.c b/kernel/exit.c index b00a25bb4ab9..11fc6c9df9f2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -735,7 +735,7 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - WARN_ON(blk_needs_flush_plug(tsk)); + WARN_ON(tsk->plug); /* * If do_dead is called because this processes oopsed, it's possible diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 848eaa0efe0e..3487bb92d1f2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6344,7 +6344,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. */ - if (blk_needs_flush_plug(tsk)) + if (tsk->plug) blk_flush_plug(tsk->plug, true); } From aa8dcccaf32bfdc09f2aff089d5d60c37da5b7b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 08:05:49 +0100 Subject: [PATCH 025/127] block: check that there is a plug in blk_flush_plug Rename blk_flush_plug to __blk_flush_plug and add a wrapper that includes the NULL check instead of open coding that check everywhere. 
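To make the effect at the call sites concrete, here is a minimal before/after sketch (condensed from the hunks below, not an additional change):

    /* before: every caller open codes the NULL check */
    if (current->plug)
            blk_flush_plug(current->plug, false);

    /* after: the static inline wrapper hides the check */
    blk_flush_plug(current->plug, false);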
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220127070549.1377856-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++---- fs/fs-writeback.c | 6 ++---- include/linux/blkdev.h | 7 ++++++- kernel/sched/core.c | 7 ++----- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d93e3bb9a769..61f6a0dc4511 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -991,8 +991,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; - if (current->plug) - blk_flush_plug(current->plug, false); + blk_flush_plug(current->plug, false); if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) return 0; @@ -1274,7 +1273,7 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug(struct blk_plug *plug, bool from_schedule) +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); @@ -1303,7 +1302,7 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { - blk_flush_plug(plug, false); + __blk_flush_plug(plug, false); current->plug = NULL; } } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f4ce38f6fc31..33d54c9fbefc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1903,8 +1903,7 @@ static long writeback_sb_inodes(struct super_block *sb, * unplug, so get our IOs out the door before we * give up the CPU. */ - if (current->plug) - blk_flush_plug(current->plug, false); + blk_flush_plug(current->plug, false); cond_resched(); } @@ -2301,8 +2300,7 @@ void wakeup_flusher_threads(enum wb_reason reason) /* * If we are expecting writeback progress we must submit plugged IO. */ - if (current->plug) - blk_flush_plug(current->plug, true); + blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f902a1c2fac0..654163d3b903 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1053,7 +1053,12 @@ extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); -void blk_flush_plug(struct blk_plug *plug, bool from_schedule); +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule); +static inline void blk_flush_plug(struct blk_plug *plug, bool async) +{ + if (plug) + __blk_flush_plug(plug, async); +} int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3487bb92d1f2..46152982e400 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6344,8 +6344,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. 
*/ - if (tsk->plug) - blk_flush_plug(tsk->plug, true); + blk_flush_plug(tsk->plug, true); } static void sched_update_worker(struct task_struct *tsk) @@ -8371,9 +8370,7 @@ int io_schedule_prepare(void) int old_iowait = current->in_iowait; current->in_iowait = 1; - if (current->plug) - blk_flush_plug(current->plug, true); - + blk_flush_plug(current->plug, true); return old_iowait; } From b42c1fc3d55e077d36718ad9800d89100b2aff81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 07:41:25 +0100 Subject: [PATCH 026/127] block: fix the kerneldoc for bio_end_io_acct Document the actually existing parameter name. Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220127064125.1314347-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 654163d3b903..3bfc75a2a450 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1520,7 +1520,7 @@ void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, /** * bio_end_io_acct - end I/O accounting for bio based drivers * @bio: bio to end account for - * @start: start time returned by bio_start_io_acct() + * @start_time: start time returned by bio_start_io_acct() */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { From 3767c902719befe44067717fa893a9ef8cb8747d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 07:42:21 +0100 Subject: [PATCH 027/127] MAINTAINERS: add bio.h to the block section bio.h is part of the block layer, so list it in the MAINTAINERS file as such. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220127064221.1314477-1-hch@lst.de Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f41088418aae..f88a16ba8b47 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3436,6 +3436,7 @@ F: Documentation/ABI/stable/sysfs-block F: Documentation/block/ F: block/ F: drivers/block/ +F: include/linux/bio.h F: include/linux/blk* F: kernel/trace/blktrace.c F: lib/sbitmap.c From 455a844d6345639be682977b3f0126be446b702e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 28 Jan 2022 12:34:54 +0800 Subject: [PATCH 028/127] block: fix boolreturn.cocci warning Return statements in functions returning bool should use true/false instead of 1/0. ./block/bio.c:1081:9-10: WARNING: return of 0/1 in function 'bio_add_folio' with return type bool. 
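The boolreturn.cocci check flags any bool-returning function whose return statements use integer literals. A toy illustration of the pattern it reports (function name made up for this example):

    static bool is_small_sector(sector_t s)
    {
            if (s > 8)
                    return 0;       /* flagged: should be false */
            return 1;               /* flagged: should be true */
    }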
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220128043454.68927-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 03cefe81950f..2e19ca600fcd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1078,7 +1078,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { if (len > UINT_MAX || off > UINT_MAX) - return 0; + return false; return bio_add_page(bio, &folio->page, len, off) > 0; } From 365ab499153cdb2007d54e7e62bcbf2c67f7ab8f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Jan 2022 17:09:22 +0300 Subject: [PATCH 029/127] fs/ntfs3: remove unnecessary NULL check This code triggers a Smatch warning: fs/ntfs3/fsntfs.c:1606 ntfs_bio_fill_1() warn: variable dereferenced before check 'bio' (see line 1591) The "bio" pointer cannot be NULL so there is no need to check. Originally there was more extensive NULL checking but it was removed because bio_alloc() will never fail if it is allowed to sleep. Remove this check as well. Fixes: 39146b6f66ba ("ntfs3: remove ntfs_alloc_bio") Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220128140922.GA29766@kili Signed-off-by: Jens Axboe --- fs/ntfs3/fsntfs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 0660a07c5a96..3de5700a9b83 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1603,11 +1603,10 @@ new_bio: } } while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen)); - if (bio) { - if (!err) - err = submit_bio_wait(bio); - bio_put(bio); - } + if (!err) + err = submit_bio_wait(bio); + bio_put(bio); + blk_finish_plug(&plug); out: unlock_page(fill); From 2651bf680bc2ad9a078b7222b0873145ab4ece07 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 11:28:25 -0800 Subject: [PATCH 030/127] block: introduce BLK_STS_OFFLINE Currently, drivers report BLK_STS_IOERR for devices that are not fully online or are being removed. This behavior could cause confusion for users, as they are not really I/O errors from the device. Solve this issue with a new state BLK_STS_OFFLINE, which reports "device offline error" in dmesg instead of "I/O error". EIO is intentionally kept to avoid changing the user-visible return value. Signed-off-by: Song Liu Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20220203192827.1370270-2-song@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + include/linux/blk_types.h | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 61f6a0dc4511..24035dd2eef1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -164,6 +164,7 @@ static const struct { [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + [BLK_STS_OFFLINE] = { -EIO, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fe065c394fff..5561e58d158a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -153,6 +153,13 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +/* + * BLK_STS_OFFLINE is returned from the driver when the target device is offline + * or is being taken offline. This could help differentiate the case where a + * device is intentionally being shut down from a real I/O error. + */ +#define BLK_STS_OFFLINE ((__force blk_status_t)17) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with From 7d32c027a21ef7aa0a400763397644d44b3576a9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 11:28:26 -0800 Subject: [PATCH 031/127] block: return -ENODEV for BLK_STS_OFFLINE Change the user-visible return value for BLK_STS_OFFLINE to -ENODEV, which is more descriptive than the existing -EIO. Signed-off-by: Song Liu Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220203192827.1370270-3-song@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 24035dd2eef1..be8812f5489d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -164,7 +164,7 @@ static const struct { [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, - [BLK_STS_OFFLINE] = { -EIO, "device offline" }, + [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, From 9574d43479e16352e75bc875c9952ed8e129c9b2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 11:28:27 -0800 Subject: [PATCH 032/127] scsi: use BLK_STS_OFFLINE for not fully online devices The new error message for such a case looks like [ 172.809565] device offline error, dev sda, sector 3138208 ... which will not be confused with a regular I/O error (BLK_STS_IOERR). Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220203192827.1370270-4-song@kernel.org Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0a70aa763a96..e30bc51578e9 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1276,7 +1276,7 @@ scsi_device_state_check(struct scsi_device *sdev, struct request *req) * power management commands.
*/ if (req && !(req->rq_flags & RQF_PM)) - return BLK_STS_IOERR; + return BLK_STS_OFFLINE; return BLK_STS_OK; } } From 78e3437450be5236c4949e377c9b848bbcd4fcb0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Feb 2022 08:19:34 +0100 Subject: [PATCH 033/127] block: call bio_associate_blkg from bio_reset Call bio_associate_blkg just like bio_set_dev did in the callers before the conversion to set the block device in bio_reset. Fixes: a7c50c940477 ("block: pass a block_device and opf to bio_reset") Reported-by: syzbot+2b3f18414c37b42dcc94@syzkaller.appspotmail.com Tested-by: syzbot+2b3f18414c37b42dcc94@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Tested-by: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220204071934.168469-1-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bio.c b/block/bio.c index 2e19ca600fcd..d2f3c1035036 100644 --- a/block/bio.c +++ b/block/bio.c @@ -310,6 +310,8 @@ void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf) memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); bio->bi_bdev = bdev; + if (bio->bi_bdev) + bio_associate_blkg(bio); bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); From c347a787e34cba0e5a80a04082dacaf259105605 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:00:57 +0100 Subject: [PATCH 034/127] drbd: set ->bi_bdev in drbd_req_new Make sure the newly allocated bio has the correct bi_bdev set from the start. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3235532ae077..8d44e96c4c4e 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -31,6 +31,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio memset(req, 0, sizeof(*req)); req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); + bio_set_dev(req->private_bio, device->ldev->backing_bdev); req->private_bio->bi_private = req; req->private_bio->bi_end_io = drbd_request_endio; @@ -1151,8 +1152,6 @@ drbd_submit_req_private_bio(struct drbd_request *req) else type = DRBD_FAULT_DT_RD; - bio_set_dev(bio, device->ldev->backing_bdev); - /* State may have changed since we grabbed our reference on the * ->ldev member. Double check, and short-circuit to endio. * In case the last activity log transaction failed to get on From 6c23f0bd7f16d88c774db37b30c5da82811c41be Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:00:58 +0100 Subject: [PATCH 035/127] dm: add a clone_to_tio helper Add a helper to stop open coding the container_of operations to get from the clone bio to the tio structure. 
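The helper works because the clone bio is embedded in struct dm_target_io, so container_of() can step back from the member to its container. In miniature (structure layout heavily simplified here):

    struct dm_target_io {
            /* ... bookkeeping fields ... */
            struct bio clone;       /* embedded clone, kept at the end */
    };

    static inline struct dm_target_io *clone_to_tio(struct bio *clone)
    {
            return container_of(clone, struct dm_target_io, clone);
    }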
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fa596b654c99..5543e18f3c3b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -79,10 +79,14 @@ struct clone_info { #define DM_IO_BIO_OFFSET \ (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) +static inline struct dm_target_io *clone_to_tio(struct bio *clone) +{ + return container_of(clone, struct dm_target_io, clone); +} + void *dm_per_bio_data(struct bio *bio, size_t data_size) { - struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); - if (!tio->inside_dm_io) + if (!clone_to_tio(bio)->inside_dm_io) return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; return (char *)bio - DM_IO_BIO_OFFSET - data_size; } @@ -477,10 +481,7 @@ out: u64 dm_start_time_ns_from_clone(struct bio *bio) { - struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); - struct dm_io *io = tio->io; - - return jiffies_to_nsecs(io->start_time); + return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time); } EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); @@ -521,7 +522,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) clone = bio_alloc_bioset(NULL, 0, 0, GFP_NOIO, &md->io_bs); - tio = container_of(clone, struct dm_target_io, clone); + tio = clone_to_tio(clone); tio->inside_dm_io = true; tio->io = NULL; @@ -557,7 +558,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t if (!clone) return NULL; - tio = container_of(clone, struct dm_target_io, clone); + tio = clone_to_tio(clone); tio->inside_dm_io = false; } @@ -878,7 +879,7 @@ static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; - struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + struct dm_target_io *tio = clone_to_tio(bio); struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; @@ -1084,7 +1085,7 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, */ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { - struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + struct dm_target_io *tio = clone_to_tio(bio); unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; BUG_ON(bio->bi_opf & REQ_PREFLUSH); @@ -1257,10 +1258,8 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, if (bio_nr == num_bios) return; - while ((bio = bio_list_pop(blist))) { - tio = container_of(bio, struct dm_target_io, clone); - free_tio(tio); - } + while ((bio = bio_list_pop(blist))) + free_tio(clone_to_tio(bio)); } } @@ -1282,14 +1281,11 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, { struct bio_list blist = BIO_EMPTY_LIST; struct bio *bio; - struct dm_target_io *tio; alloc_multiple_bios(&blist, ci, ti, num_bios); - while ((bio = bio_list_pop(&blist))) { - tio = container_of(bio, struct dm_target_io, clone); - __clone_and_map_simple_bio(ci, tio, len); - } + while ((bio = bio_list_pop(&blist))) + __clone_and_map_simple_bio(ci, clone_to_tio(bio), len); } static int __send_empty_flush(struct clone_info *ci) From b1bee79237ce0ab43ef7fe66aa6e5c4783165012 Mon Sep 17 00:00:00 2001 
From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:00:59 +0100 Subject: [PATCH 036/127] dm: fold clone_bio into __clone_and_map_data_bio Fold clone_bio into its only caller to prepare for refactoring. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5543e18f3c3b..9384d250a3e4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1190,17 +1190,22 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) /* * Creates a bio that consists of range of complete bvecs. */ -static int clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned len) +static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, + sector_t sector, unsigned *len) { - struct bio *clone = &tio->clone; + struct bio *bio = ci->bio, *clone; + struct dm_target_io *tio; int r; + tio = alloc_tio(ci, ti, 0, GFP_NOIO); + tio->len_ptr = len; + + clone = &tio->clone; __bio_clone_fast(clone, bio); r = bio_crypt_clone(clone, bio, GFP_NOIO); if (r < 0) - return r; + goto free_tio; if (bio_integrity(bio)) { if (unlikely(!dm_target_has_integrity(tio->ti->type) && @@ -1208,21 +1213,26 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, DMWARN("%s: the target %s doesn't support integrity data.", dm_device_name(tio->io->md), tio->ti->type->name); - return -EIO; + r = -EIO; + goto free_tio; } r = bio_integrity_clone(clone, bio, GFP_NOIO); if (r < 0) - return r; + goto free_tio; } bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); - clone->bi_iter.bi_size = to_bytes(len); + clone->bi_iter.bi_size = to_bytes(*len); if (bio_integrity(bio)) bio_integrity_trim(clone); + __map_bio(tio); return 0; +free_tio: + free_tio(tio); + return r; } static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, @@ -1313,25 +1323,6 @@ static int __send_empty_flush(struct clone_info *ci) return 0; } -static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, unsigned *len) - { - struct bio *bio = ci->bio; - struct dm_target_io *tio; - int r; - - tio = alloc_tio(ci, ti, 0, GFP_NOIO); - tio->len_ptr = len; - r = clone_bio(tio, bio, sector, *len); - if (r < 0) { - free_tio(tio); - return r; - } - __map_bio(tio); - - return 0; -} - static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, unsigned num_bios) { From 8eabf5d0a7bd9226d6cc25402dde67f372aae838 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:00 +0100 Subject: [PATCH 037/127] dm: fold __clone_and_map_simple_bio into __send_duplicate_bios Fold __clone_and_map_simple_bio into its only caller to prepare for refactoring.
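The post-fold shape of the surviving function, reduced to a skeleton (details and locking elided):

    static void __send_duplicate_bios(struct clone_info *ci,
                    struct dm_target *ti, unsigned num_bios, unsigned *len)
    {
            struct bio_list blist = BIO_EMPTY_LIST;
            struct bio *clone;

            alloc_multiple_bios(&blist, ci, ti, num_bios);
            while ((clone = bio_list_pop(&blist))) {
                    /* former __clone_and_map_simple_bio body, now inline */
                    struct dm_target_io *tio = clone_to_tio(clone);

                    tio->len_ptr = len;
                    __bio_clone_fast(clone, ci->bio);
                    if (len)
                            bio_setup_sector(clone, ci->sector, *len);
                    __map_bio(tio);
            }
    }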
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9384d250a3e4..2527b287ead0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1273,29 +1273,24 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } } -static void __clone_and_map_simple_bio(struct clone_info *ci, - struct dm_target_io *tio, unsigned *len) -{ - struct bio *clone = &tio->clone; - - tio->len_ptr = len; - - __bio_clone_fast(clone, ci->bio); - if (len) - bio_setup_sector(clone, ci->sector, *len); - __map_bio(tio); -} - static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { struct bio_list blist = BIO_EMPTY_LIST; - struct bio *bio; + struct bio *clone; alloc_multiple_bios(&blist, ci, ti, num_bios); - while ((bio = bio_list_pop(&blist))) - __clone_and_map_simple_bio(ci, clone_to_tio(bio), len); + while ((clone = bio_list_pop(&blist))) { + struct dm_target_io *tio = clone_to_tio(clone); + + tio->len_ptr = len; + + __bio_clone_fast(clone, ci->bio); + if (len) + bio_setup_sector(clone, ci->sector, *len); + __map_bio(tio); + } } static int __send_empty_flush(struct clone_info *ci) From dc8e2021da71f6b2d5971f98ee3e528cf30c409c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:01 +0100 Subject: [PATCH 038/127] dm: move cloning the bio into alloc_tio Move the call to __bio_clone_fast and the assignment of ->len_ptr from the callers into alloc_tio to prepare for changes to the bio clone API. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2527b287ead0..90341b7fa580 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -545,7 +545,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io) } static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, gfp_t gfp_mask) + unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { struct dm_target_io *tio; @@ -561,11 +561,13 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t tio = clone_to_tio(clone); tio->inside_dm_io = false; } + __bio_clone_fast(&tio->clone, ci->bio); tio->magic = DM_TIO_MAGIC; tio->io = ci->io; tio->ti = ti; tio->target_bio_nr = target_bio_nr; + tio->len_ptr = len; return tio; } @@ -1197,11 +1199,8 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, struct dm_target_io *tio; int r; - tio = alloc_tio(ci, ti, 0, GFP_NOIO); - tio->len_ptr = len; - + tio = alloc_tio(ci, ti, 0, len, GFP_NOIO); clone = &tio->clone; - __bio_clone_fast(clone, bio); r = bio_crypt_clone(clone, bio, GFP_NOIO); if (r < 0) @@ -1236,7 +1235,8 @@ free_tio: } static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, - struct dm_target *ti, unsigned num_bios) + struct dm_target *ti, unsigned num_bios, + unsigned *len) { struct dm_target_io *tio; int try; @@ -1245,7 +1245,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, return; if (num_bios == 1) { - tio = alloc_tio(ci, ti, 0, GFP_NOIO); + tio = alloc_tio(ci, 
ti, 0, len, GFP_NOIO); bio_list_add(blist, &tio->clone); return; } @@ -1257,7 +1257,8 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, if (try) mutex_lock(&ci->io->md->table_devices_lock); for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { - tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT); + tio = alloc_tio(ci, ti, bio_nr, len, + try ? GFP_NOIO : GFP_NOWAIT); if (!tio) break; @@ -1279,14 +1280,11 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; - alloc_multiple_bios(&blist, ci, ti, num_bios); + alloc_multiple_bios(&blist, ci, ti, num_bios, len); while ((clone = bio_list_pop(&blist))) { struct dm_target_io *tio = clone_to_tio(clone); - tio->len_ptr = len; - - __bio_clone_fast(clone, ci->bio); if (len) bio_setup_sector(clone, ci->sector, *len); __map_bio(tio); From 1561b396106d759fdf5f9a71b412e068f74d2cc9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:02 +0100 Subject: [PATCH 039/127] dm: pass the bio instead of tio to __map_bio This simplifies the callers a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 90341b7fa580..a43d280e9bc5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1117,11 +1117,11 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) mutex_unlock(&md->swap_bios_lock); } -static void __map_bio(struct dm_target_io *tio) +static void __map_bio(struct bio *clone) { + struct dm_target_io *tio = clone_to_tio(clone); int r; sector_t sector; - struct bio *clone = &tio->clone; struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; @@ -1227,7 +1227,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, if (bio_integrity(bio)) bio_integrity_trim(clone); - __map_bio(tio); + __map_bio(clone); return 0; free_tio: free_tio(tio); @@ -1283,11 +1283,9 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, alloc_multiple_bios(&blist, ci, ti, num_bios, len); while ((clone = bio_list_pop(&blist))) { - struct dm_target_io *tio = clone_to_tio(clone); - if (len) bio_setup_sector(clone, ci->sector, *len); - __map_bio(tio); + __map_bio(clone); } } From 1d1068cecff70cb8e48c7cb0ba27cc3fd906eb31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:03 +0100 Subject: [PATCH 040/127] dm: return the clone bio from alloc_tio Return the clone bio embedded into the tio as that is what the callers actually want. Similar for the free side.
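A sketch of the resulting round trip at a call site: the allocator hands back the embedded clone, clone_to_tio() recovers the tio only where it is needed, and error paths hand back the clone rather than the tio:

    struct bio *clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
    struct dm_target_io *tio = clone_to_tio(clone);     /* container_of */

    /* ... on an error path, the clone (not the tio) is released: */
    free_tio(clone);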
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a43d280e9bc5..c05b6ff1bb95 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -544,7 +544,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io) bio_put(&io->tio.clone); } -static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti, +static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { struct dm_target_io *tio; @@ -569,14 +569,14 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t tio->target_bio_nr = target_bio_nr; tio->len_ptr = len; - return tio; + return &tio->clone; } -static void free_tio(struct dm_target_io *tio) +static void free_tio(struct bio *clone) { - if (tio->inside_dm_io) + if (clone_to_tio(clone)->inside_dm_io) return; - bio_put(&tio->clone); + bio_put(clone); } /* @@ -932,7 +932,7 @@ static void clone_endio(struct bio *bio) up(&md->swap_bios_semaphore); } - free_tio(tio); + free_tio(bio); dm_io_dec_pending(io, error); } @@ -1166,7 +1166,7 @@ static void __map_bio(struct bio *clone) struct mapped_device *md = io->md; up(&md->swap_bios_semaphore); } - free_tio(tio); + free_tio(clone); dm_io_dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: @@ -1174,7 +1174,7 @@ static void __map_bio(struct bio *clone) struct mapped_device *md = io->md; up(&md->swap_bios_semaphore); } - free_tio(tio); + free_tio(clone); dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); break; default: @@ -1196,17 +1196,17 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, sector_t sector, unsigned *len) { struct bio *bio = ci->bio, *clone; - struct dm_target_io *tio; int r; - tio = alloc_tio(ci, ti, 0, len, GFP_NOIO); - clone = &tio->clone; + clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); r = bio_crypt_clone(clone, bio, GFP_NOIO); if (r < 0) goto free_tio; if (bio_integrity(bio)) { + struct dm_target_io *tio = clone_to_tio(clone); + if (unlikely(!dm_target_has_integrity(tio->ti->type) && !dm_target_passes_integrity(tio->ti->type))) { DMWARN("%s: the target %s doesn't support integrity data.", @@ -1230,7 +1230,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, __map_bio(clone); return 0; free_tio: - free_tio(tio); + free_tio(clone); return r; } @@ -1238,31 +1238,30 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { - struct dm_target_io *tio; + struct bio *bio; int try; if (!num_bios) return; if (num_bios == 1) { - tio = alloc_tio(ci, ti, 0, len, GFP_NOIO); - bio_list_add(blist, &tio->clone); + bio = alloc_tio(ci, ti, 0, len, GFP_NOIO); + bio_list_add(blist, bio); return; } for (try = 0; try < 2; try++) { int bio_nr; - struct bio *bio; if (try) mutex_lock(&ci->io->md->table_devices_lock); for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { - tio = alloc_tio(ci, ti, bio_nr, len, + bio = alloc_tio(ci, ti, bio_nr, len, try ? 
GFP_NOIO : GFP_NOWAIT); - if (!tio) + if (!bio) break; - bio_list_add(blist, &tio->clone); + bio_list_add(blist, bio); } if (try) mutex_unlock(&ci->io->md->table_devices_lock); @@ -1270,7 +1269,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, return; while ((bio = bio_list_pop(blist))) - free_tio(clone_to_tio(bio)); + free_tio(bio); } } From 891fced644a7529bfd4b1436b2341527ce8f68ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:04 +0100 Subject: [PATCH 041/127] dm: simplify the single bio fast path in __send_duplicate_bios Most targets just need a single flush bio. Open code that case in __send_duplicate_bios without the need to add the bio to a list. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c05b6ff1bb95..78df75f57288 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1241,15 +1241,6 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct bio *bio; int try; - if (!num_bios) - return; - - if (num_bios == 1) { - bio = alloc_tio(ci, ti, 0, len, GFP_NOIO); - bio_list_add(blist, bio); - return; - } - for (try = 0; try < 2; try++) { int bio_nr; @@ -1279,12 +1270,23 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; - alloc_multiple_bios(&blist, ci, ti, num_bios, len); - - while ((clone = bio_list_pop(&blist))) { + switch (num_bios) { + case 0: + break; + case 1: + clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); if (len) bio_setup_sector(clone, ci->sector, *len); __map_bio(clone); + break; + default: + alloc_multiple_bios(&blist, ci, ti, num_bios, len); + while ((clone = bio_list_pop(&blist))) { + if (len) + bio_setup_sector(clone, ci->sector, *len); + __map_bio(clone); + } + break; } } From 3c4b455ef8acdacd0e5ecd33428d4f32f861637a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:05 +0100 Subject: [PATCH 042/127] dm-cache: remove __remap_to_origin_clear_discard Fold __remap_to_origin_clear_discard into the two callers to prepare for bio cloning refactoring. 
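Both call sites now spell out the short remap sequence directly (taken from the hunks below); the small duplication is accepted as preparation for the cloning rework:

    remap_to_origin(cache, bio);
    if (bio_data_dir(bio) == WRITE)
            clear_discard(cache, oblock_to_dblock(cache, oblock));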
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-cache-target.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 447d030036d1..1c37fe028e53 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -744,21 +744,14 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) spin_unlock_irq(&cache->lock); } -static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock, bool bio_has_pbd) -{ - if (bio_has_pbd) - check_if_tick_bio_needed(cache, bio); - remap_to_origin(cache, bio); - if (bio_data_dir(bio) == WRITE) - clear_discard(cache, oblock_to_dblock(cache, oblock)); -} - static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, dm_oblock_t oblock) { // FIXME: check_if_tick_bio_needed() is called way too much through this interface - __remap_to_origin_clear_discard(cache, bio, oblock, true); + check_if_tick_bio_needed(cache, bio); + remap_to_origin(cache, bio); + if (bio_data_dir(bio) == WRITE) + clear_discard(cache, oblock_to_dblock(cache, oblock)); } static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, @@ -831,11 +824,10 @@ static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, BUG_ON(!origin_bio); bio_chain(origin_bio, bio); - /* - * Passing false to __remap_to_origin_clear_discard() skips - * all code that might use per_bio_data (since clone doesn't have it) - */ - __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); + + remap_to_origin(cache, origin_bio); + if (bio_data_dir(origin_bio) == WRITE) + clear_discard(cache, oblock_to_dblock(cache, oblock)); submit_bio(origin_bio); remap_to_cache(cache, bio, cblock); From 56b4b5abcdab6daf71c5536fca2772f178590e06 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:06 +0100 Subject: [PATCH 043/127] block: clone crypto and integrity data in __bio_clone_fast __bio_clone_fast should also clone integrity and crypto data, as a clone without those is incomplete. Right now the only caller that can actually support crypto and integrity data (dm) does it manually for the one callchain that supports these, but we better do it properly in the core. Note that all callers except for the above mentioned one also don't need to handle failure at all, given that the integrity and crypto clones are based on mempool allocations that won't fail for sleeping allocations. 
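In practice this leaves two calling patterns; a hedged sketch (variable names illustrative, not taken from a specific call site):

    /* sleeping allocation: the mempool-backed clones cannot fail */
    __bio_clone_fast(clone, bio_src, GFP_NOIO);

    /* non-sleeping allocation: the crypto/integrity clone may fail */
    if (__bio_clone_fast(clone, bio_src, GFP_NOWAIT) < 0)
            return NULL;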
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-11-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 - block/bio.c | 26 +++++++++++++------------- block/blk-crypto.c | 1 - drivers/md/bcache/request.c | 2 +- drivers/md/dm.c | 33 ++++++--------------------------- drivers/md/md-multipath.c | 2 +- include/linux/bio.h | 2 +- 7 files changed, 22 insertions(+), 45 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index d25114715459..bd5453220065 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -420,7 +420,6 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } -EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { diff --git a/block/bio.c b/block/bio.c index d2f3c1035036..2a921875bb42 100644 --- a/block/bio.c +++ b/block/bio.c @@ -732,6 +732,7 @@ EXPORT_SYMBOL(bio_put); * __bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: destination bio * @bio_src: bio to clone + * @gfp: allocation flags * * Clone a &bio. Caller will own the returned bio, but not * the actual data it points to. Reference count of returned @@ -739,7 +740,7 @@ EXPORT_SYMBOL(bio_put); * * Caller must ensure that @bio_src is not freed before @bio. */ -void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp) { WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); @@ -761,6 +762,13 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); + + if (bio_crypt_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + if (bio_integrity(bio_src) && + bio_integrity_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + return 0; } EXPORT_SYMBOL(__bio_clone_fast); @@ -780,20 +788,12 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) if (!b) return NULL; - __bio_clone_fast(b, bio); - - if (bio_crypt_clone(b, bio, gfp_mask) < 0) - goto err_put; - - if (bio_integrity(bio) && - bio_integrity_clone(b, bio, gfp_mask) < 0) - goto err_put; + if (__bio_clone_fast(b, bio, gfp_mask < 0)) { + bio_put(b); + return NULL; + } return b; - -err_put: - bio_put(b); - return NULL; } EXPORT_SYMBOL(bio_clone_fast); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index ec9efeeeca91..773dae4c329b 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -111,7 +111,6 @@ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) *dst->bi_crypt_context = *src->bi_crypt_context; return 0; } -EXPORT_SYMBOL_GPL(__bio_crypt_clone); /* Increments @dun by @inc, treating @dun as a multi-limb integer. */ void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7ba59d08ed87..574b02b94f1a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -686,7 +686,7 @@ static void do_bio_hook(struct search *s, struct bio *bio = &s->bio.bio; bio_init(bio, NULL, NULL, 0, 0); - __bio_clone_fast(bio, orig_bio); + __bio_clone_fast(bio, orig_bio, GFP_NOIO); /* * bi_end_io can be set separately somewhere else, e.g. 
the * variants in, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 78df75f57288..0f8796159379 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -561,7 +561,12 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, tio = clone_to_tio(clone); tio->inside_dm_io = false; } - __bio_clone_fast(&tio->clone, ci->bio); + + if (__bio_clone_fast(&tio->clone, ci->bio, gfp_mask) < 0) { + if (ci->io->tio.io) + bio_put(&tio->clone); + return NULL; + } tio->magic = DM_TIO_MAGIC; tio->io = ci->io; @@ -1196,31 +1201,8 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, sector_t sector, unsigned *len) { struct bio *bio = ci->bio, *clone; - int r; clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); - - r = bio_crypt_clone(clone, bio, GFP_NOIO); - if (r < 0) - goto free_tio; - - if (bio_integrity(bio)) { - struct dm_target_io *tio = clone_to_tio(clone); - - if (unlikely(!dm_target_has_integrity(tio->ti->type) && - !dm_target_passes_integrity(tio->ti->type))) { - DMWARN("%s: the target %s doesn't support integrity data.", - dm_device_name(tio->io->md), - tio->ti->type->name); - r = -EIO; - goto free_tio; - } - - r = bio_integrity_clone(clone, bio, GFP_NOIO); - if (r < 0) - goto free_tio; - } - bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(*len); @@ -1229,9 +1211,6 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, __map_bio(clone); return 0; -free_tio: - free_tio(clone); - return r; } static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 5e15940634d8..010c759c741a 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -122,7 +122,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) multipath = conf->multipaths + mp_bh->path; bio_init(&mp_bh->bio, NULL, NULL, 0, 0); - __bio_clone_fast(&mp_bh->bio, bio); + __bio_clone_fast(&mp_bh->bio, bio, GFP_NOIO); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; bio_set_dev(&mp_bh->bio, multipath->rdev->bdev); diff --git a/include/linux/bio.h b/include/linux/bio.h index 18cfe5bb41ea..b814361c957b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -413,7 +413,7 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); -extern void __bio_clone_fast(struct bio *, struct bio *); +int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; From 92986f6b4c8a2c24d3a36b80140624f80fd93de4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:07 +0100 Subject: [PATCH 044/127] dm: use bio_clone_fast in alloc_io/alloc_tio Replace open coded bio_clone_fast implementations with the actual helper. Note that the bio allocated as part of the dm_io structure in alloc_io will only actually be used later in alloc_tio, making this earlier cloning of the information safe. 
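The substitution in alloc_tio, shown side by side (condensed from the hunk below):

    /* before: allocate an empty bio, then clone the state separately */
    clone = bio_alloc_bioset(NULL, 0, 0, gfp_mask, &ci->io->md->bs);
    __bio_clone_fast(&tio->clone, ci->bio, gfp_mask);

    /* after: one helper allocates and clones in a single step */
    clone = bio_clone_fast(ci->bio, gfp_mask, &ci->io->md->bs);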
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0f8796159379..862564a5df74 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -520,7 +520,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(NULL, 0, 0, GFP_NOIO, &md->io_bs); + clone = bio_clone_fast(bio, GFP_NOIO, &md->io_bs); tio = clone_to_tio(clone); tio->inside_dm_io = true; @@ -553,8 +553,8 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* the dm_target_io embedded in ci->io is available */ tio = &ci->io->tio; } else { - struct bio *clone = bio_alloc_bioset(NULL, 0, 0, gfp_mask, - &ci->io->md->bs); + struct bio *clone = bio_clone_fast(ci->bio, gfp_mask, + &ci->io->md->bs); if (!clone) return NULL; @@ -562,12 +562,6 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, tio->inside_dm_io = false; } - if (__bio_clone_fast(&tio->clone, ci->bio, gfp_mask) < 0) { - if (ci->io->tio.io) - bio_put(&tio->clone); - return NULL; - } - tio->magic = DM_TIO_MAGIC; tio->io = ci->io; tio->ti = ti; From a0e8de798dd6710a69d69ec57b246a0e34c4a695 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:08 +0100 Subject: [PATCH 045/127] block: initialize the target bio in __bio_clone_fast All callers of __bio_clone_fast initialize the bio first. Move that initialization into __bio_clone_fast instead. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-13-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 75 ++++++++++++++++++++----------------- drivers/md/bcache/request.c | 1 - drivers/md/md-multipath.c | 1 - 3 files changed, 40 insertions(+), 37 deletions(-) diff --git a/block/bio.c b/block/bio.c index 2a921875bb42..74f66e22ef63 100644 --- a/block/bio.c +++ b/block/bio.c @@ -728,37 +728,16 @@ void bio_put(struct bio *bio) } EXPORT_SYMBOL(bio_put); -/** - * __bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: destination bio - * @bio_src: bio to clone - * @gfp: allocation flags - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. - * - * Caller must ensure that @bio_src is not freed before @bio. 
- */ -int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp) +static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { - WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); - - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; - bio->bi_io_vec = bio_src->bi_io_vec; bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); @@ -770,33 +749,59 @@ int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp) return -ENOMEM; return 0; } -EXPORT_SYMBOL(__bio_clone_fast); /** - * bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from + * bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio_src: bio to clone from + * @gfp: allocation priority + * @bs: bio_set to allocate from * - * Like __bio_clone_fast, only also allocates the returned bio + * Allocate a new bio that is a clone of @bio_src. The caller owns the returned + * bio, but not the actual data it points to. + * + * The caller must ensure that the return bio is not freed before @bio_src. */ -struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +struct bio *bio_clone_fast(struct bio *bio_src, gfp_t gfp, struct bio_set *bs) { - struct bio *b; + struct bio *bio; - b = bio_alloc_bioset(NULL, 0, 0, gfp_mask, bs); - if (!b) + bio = bio_alloc_bioset(bio_src->bi_bdev, 0, bio_src->bi_opf, gfp, bs); + if (!bio) return NULL; - if (__bio_clone_fast(b, bio, gfp_mask < 0)) { - bio_put(b); + if (__bio_clone(bio, bio_src, gfp) < 0) { + bio_put(bio); return NULL; } + bio->bi_io_vec = bio_src->bi_io_vec; - return b; + return bio; } EXPORT_SYMBOL(bio_clone_fast); +/** + * __bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: bio to clone into + * @bio_src: bio to clone from + * @gfp: allocation priority + * + * Initialize a new bio in caller provided memory that is a clone of @bio_src. + * The caller owns the returned bio, but not the actual data it points to. + * + * The caller must ensure that @bio_src is not freed before @bio. + */ +int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp) +{ + int ret; + + bio_init(bio, bio_src->bi_bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); + ret = __bio_clone(bio, bio_src, gfp); + if (ret) + bio_uninit(bio); + return ret; +} +EXPORT_SYMBOL(__bio_clone_fast); + const char *bio_devname(struct bio *bio, char *buf) { return bdevname(bio->bi_bdev, buf); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 574b02b94f1a..d2cb853bf917 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -685,7 +685,6 @@ static void do_bio_hook(struct search *s, { struct bio *bio = &s->bio.bio; - bio_init(bio, NULL, NULL, 0, 0); __bio_clone_fast(bio, orig_bio, GFP_NOIO); /* * bi_end_io can be set separately somewhere else, e.g. 
the diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 010c759c741a..483a5500f83c 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -121,7 +121,6 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) } multipath = conf->multipaths + mp_bh->path; - bio_init(&mp_bh->bio, NULL, NULL, 0, 0); __bio_clone_fast(&mp_bh->bio, bio, GFP_NOIO); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; From abfc426d1b2fb2176df59851a64223b58ddae7e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:09 +0100 Subject: [PATCH 046/127] block: pass a block_device to bio_clone_fast Pass a block_device to bio_clone_fast and __bio_clone_fast and give the functions more suitable names. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-14-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/biodoc.rst | 5 ----- block/bio.c | 31 +++++++++++++++++------------ block/blk-mq.c | 4 ++-- block/bounce.c | 3 +-- drivers/block/drbd/drbd_req.c | 4 ++-- drivers/block/drbd/drbd_worker.c | 4 ++-- drivers/block/pktcdvd.c | 4 ++-- drivers/md/bcache/request.c | 5 +++-- drivers/md/dm-cache-target.c | 4 ++-- drivers/md/dm-crypt.c | 11 +++++------ drivers/md/dm-zoned-target.c | 3 +-- drivers/md/dm.c | 6 +++--- drivers/md/md-faulty.c | 4 ++-- drivers/md/md-multipath.c | 3 +-- drivers/md/md.c | 5 +++-- drivers/md/raid1.c | 34 ++++++++++++++++---------------- drivers/md/raid10.c | 16 +++++++-------- drivers/md/raid5.c | 4 ++-- fs/btrfs/extent_io.c | 4 ++-- include/linux/bio.h | 6 ++++-- 20 files changed, 80 insertions(+), 80 deletions(-) diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst index 2098477851a4..4fbc367e62f9 100644 --- a/Documentation/block/biodoc.rst +++ b/Documentation/block/biodoc.rst @@ -663,11 +663,6 @@ to i/o submission, if the bio fields are likely to be accessed after the i/o is issued (since the bio may otherwise get freed in case i/o completion happens in the meantime). -The bio_clone_fast() routine may be used to duplicate a bio, where the clone -shares the bio_vec_list with the original bio (i.e. both point to the -same bio_vec_list). This would typically be used for splitting i/o requests -in lvm or md. - 3.2 Generic bio helper Routines ------------------------------- diff --git a/block/bio.c b/block/bio.c index 74f66e22ef63..18d34b33351b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -733,7 +733,8 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); - if (bio_flagged(bio_src, BIO_REMAPPED)) + if (bio->bi_bdev == bio_src->bi_bdev && + bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; @@ -751,7 +752,8 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) } /** - * bio_clone_fast - clone a bio that shares the original bio's biovec + * bio_alloc_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto * @bio_src: bio to clone from * @gfp: allocation priority * @bs: bio_set to allocate from @@ -761,11 +763,12 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) * * The caller must ensure that the return bio is not freed before @bio_src. 
*/ -struct bio *bio_clone_fast(struct bio *bio_src, gfp_t gfp, struct bio_set *bs) +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs) { struct bio *bio; - bio = bio_alloc_bioset(bio_src->bi_bdev, 0, bio_src->bi_opf, gfp, bs); + bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); if (!bio) return NULL; @@ -777,10 +780,11 @@ struct bio *bio_clone_fast(struct bio *bio_src, gfp_t gfp, struct bio_set *bs) return bio; } -EXPORT_SYMBOL(bio_clone_fast); +EXPORT_SYMBOL(bio_alloc_clone); /** - * __bio_clone_fast - clone a bio that shares the original bio's biovec + * bio_init_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto * @bio: bio to clone into * @bio_src: bio to clone from * @gfp: allocation priority @@ -790,17 +794,18 @@ EXPORT_SYMBOL(bio_clone_fast); * * The caller must ensure that @bio_src is not freed before @bio. */ -int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp) +int bio_init_clone(struct block_device *bdev, struct bio *bio, + struct bio *bio_src, gfp_t gfp) { int ret; - bio_init(bio, bio_src->bi_bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); + bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); ret = __bio_clone(bio, bio_src, gfp); if (ret) bio_uninit(bio); return ret; } -EXPORT_SYMBOL(__bio_clone_fast); +EXPORT_SYMBOL(bio_init_clone); const char *bio_devname(struct bio *bio, char *buf) { @@ -1572,7 +1577,7 @@ struct bio *bio_split(struct bio *bio, int sectors, if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return NULL; - split = bio_clone_fast(bio, gfp, bs); + split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return NULL; @@ -1667,9 +1672,9 @@ EXPORT_SYMBOL(bioset_exit); * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated - * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast(). - * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to - * dispatch queued requests when the mempool runs out of space. + * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). + * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used + * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, diff --git a/block/blk-mq.c b/block/blk-mq.c index 1adfe4824ef5..4b868e792ba4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2975,10 +2975,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); + bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, + bs); if (!bio) goto free_and_out; - bio->bi_bdev = rq->q->disk->part0; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; diff --git a/block/bounce.c b/block/bounce.c index 330ddde25b46..3fd3bc6fd5db 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -162,8 +162,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * that does not own the bio - reason being drivers don't use it for * iterating over the biovec anymore, so expecting it to be kept up * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. + * asking for trouble and would force extra work. 
*/ bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src), bio_src->bi_opf, GFP_NOIO, &bounce_bio_set); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8d44e96c4c4e..c00ae8619519 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -30,8 +30,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio return NULL; memset(req, 0, sizeof(*req)); - req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); - bio_set_dev(req->private_bio, device->ldev->backing_bdev); + req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, bio_src, + GFP_NOIO, &drbd_io_bio_set); req->private_bio->bi_private = req; req->private_bio->bi_end_io = drbd_request_endio; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 64563bfdf0da..a5e04b38006b 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1523,9 +1523,9 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) drbd_al_begin_io(device, &req->i); - req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO, + req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, + req->master_bio, GFP_NOIO, &drbd_io_bio_set); - bio_set_dev(req->private_bio, device->ldev->backing_bdev); req->private_bio->bi_private = req; req->private_bio->bi_end_io = drbd_request_endio; submit_bio_noacct(req->private_bio); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3aa595442946..be749c686feb 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2294,12 +2294,12 @@ static void pkt_end_io_read_cloned(struct bio *bio) static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, &pkt_bio_set); + struct bio *cloned_bio = + bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set); struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); psd->pd = pd; psd->bio = bio; - bio_set_dev(cloned_bio, pd->bdev); cloned_bio->bi_private = psd; cloned_bio->bi_end_io = pkt_end_io_read_cloned; pd->stats.secs_r += bio_sectors(bio); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index d2cb853bf917..6869e010475a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -685,7 +685,7 @@ static void do_bio_hook(struct search *s, { struct bio *bio = &s->bio.bio; - __bio_clone_fast(bio, orig_bio, GFP_NOIO); + bio_init_clone(bio->bi_bdev, bio, orig_bio, GFP_NOIO); /* * bi_end_io can be set separately somewhere else, e.g. 
the * variants in, @@ -1036,7 +1036,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(s->iop.c, flush, cl); } } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split); + s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, + &dc->disk.bio_split); /* I/O request sent to backing device */ bio->bi_end_io = backing_request_endio; closure_bio_submit(s->iop.c, bio, cl); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1c37fe028e53..89fdfb49d564 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -819,13 +819,13 @@ static void issue_op(struct bio *bio, void *context) static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { - struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); + struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio, + GFP_NOIO, &cache->bs); BUG_ON(!origin_bio); bio_chain(origin_bio, bio); - remap_to_origin(cache, origin_bio); if (bio_data_dir(origin_bio) == WRITE) clear_discard(cache, oblock_to_dblock(cache, oblock)); submit_bio(origin_bio); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f7e4435b7439..a5006cb6ee8a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1834,17 +1834,16 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) struct bio *clone; /* - * We need the original biovec array in order to decrypt - * the whole bio data *afterwards* -- thanks to immutable - * biovecs we don't need to worry about the block layer - * modifying the biovec array; so leverage bio_clone_fast(). + * We need the original biovec array in order to decrypt the whole bio + * data *afterwards* -- thanks to immutable biovecs we don't need to + * worry about the block layer modifying the biovec array; so leverage + * bio_alloc_clone(). 
*/ - clone = bio_clone_fast(io->base_bio, gfp, &cc->bs); + clone = bio_alloc_clone(cc->dev->bdev, io->base_bio, gfp, &cc->bs); if (!clone) return 1; clone->bi_private = io; clone->bi_end_io = crypt_endio; - bio_set_dev(clone, cc->dev->bdev); crypt_inc_pending(io); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 166c4e9d99c9..a3f6d3ef3817 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -125,11 +125,10 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, if (dev->flags & DMZ_BDEV_DYING) return -EIO; - clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set); + clone = bio_alloc_clone(dev->bdev, bio, GFP_NOIO, &dmz->bio_set); if (!clone) return -ENOMEM; - bio_set_dev(clone, dev->bdev); bioctx->dev = dev; clone->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 862564a5df74..ab9cc91931f9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -520,7 +520,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct dm_target_io *tio; struct bio *clone; - clone = bio_clone_fast(bio, GFP_NOIO, &md->io_bs); + clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); tio = clone_to_tio(clone); tio->inside_dm_io = true; @@ -553,8 +553,8 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* the dm_target_io embedded in ci->io is available */ tio = &ci->io->tio; } else { - struct bio *clone = bio_clone_fast(ci->bio, gfp_mask, - &ci->io->md->bs); + struct bio *clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, + gfp_mask, &ci->io->md->bs); if (!clone) return NULL; diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index c0dc6f2ef4a3..50ad818978a4 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -205,9 +205,9 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, + &mddev->bio_set); - bio_set_dev(b, conf->rdev->bdev); b->bi_private = bio; b->bi_end_io = faulty_fail; bio = b; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 483a5500f83c..97fb948e3e74 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -121,10 +121,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) } multipath = conf->multipaths + mp_bh->path; - __bio_clone_fast(&mp_bh->bio, bio, GFP_NOIO); + bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; - bio_set_dev(&mp_bh->bio, multipath->rdev->bdev); mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; diff --git a/drivers/md/md.c b/drivers/md/md.c index 0a89f072dae0..f88a9e948f3e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8634,13 +8634,14 @@ static void md_end_io_acct(struct bio *bio) */ void md_account_bio(struct mddev *mddev, struct bio **bio) { + struct block_device *bdev = (*bio)->bi_bdev; struct md_io_acct *md_io_acct; struct bio *clone; - if (!blk_queue_io_stat((*bio)->bi_bdev->bd_disk->queue)) + if (!blk_queue_io_stat(bdev->bd_disk->queue)) return; - clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set); + clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); md_io_acct = container_of(clone, struct md_io_acct, bio_clone); 
md_io_acct->orig_bio = *bio; md_io_acct->start_time = bio_start_io_acct(*bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e7710fb5befb..c3288d46948d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1320,13 +1320,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) r1_bio->start_time = bio_start_io_acct(bio); - read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); + read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, + &mddev->bio_set); r1_bio->bios[rdisk] = read_bio; read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; - bio_set_dev(read_bio, mirror->rdev->bdev); read_bio->bi_end_io = raid1_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &mirror->rdev->flags) && @@ -1546,24 +1546,25 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 0; } - if (r1_bio->behind_master_bio) - mbio = bio_clone_fast(r1_bio->behind_master_bio, - GFP_NOIO, &mddev->bio_set); - else - mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); - if (r1_bio->behind_master_bio) { + mbio = bio_alloc_clone(rdev->bdev, + r1_bio->behind_master_bio, + GFP_NOIO, &mddev->bio_set); if (test_bit(CollisionCheck, &rdev->flags)) wait_for_serialization(rdev, r1_bio); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); - } else if (mddev->serialize_policy) - wait_for_serialization(rdev, r1_bio); + } else { + mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, + &mddev->bio_set); + + if (mddev->serialize_policy) + wait_for_serialization(rdev, r1_bio); + } r1_bio->bios[i] = mbio; mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); - bio_set_dev(mbio, rdev->bdev); mbio->bi_end_io = raid1_end_write_request; mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); if (test_bit(FailFast, &rdev->flags) && @@ -2416,12 +2417,12 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) /* Write at 'sector' for 'sectors'*/ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - wbio = bio_clone_fast(r1_bio->behind_master_bio, - GFP_NOIO, - &mddev->bio_set); + wbio = bio_alloc_clone(rdev->bdev, + r1_bio->behind_master_bio, + GFP_NOIO, &mddev->bio_set); } else { - wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, - &mddev->bio_set); + wbio = bio_alloc_clone(rdev->bdev, r1_bio->master_bio, + GFP_NOIO, &mddev->bio_set); } bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); @@ -2430,7 +2431,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; - bio_set_dev(wbio, rdev->bdev); if (submit_bio_wait(wbio) < 0) /* failure! 
*/ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index da07bcbc06d0..5dd2e17e1d0e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1208,14 +1208,13 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) r10_bio->start_time = bio_start_io_acct(bio); - read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); + read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); - bio_set_dev(read_bio, rdev->bdev); read_bio->bi_end_io = raid10_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &rdev->flags) && @@ -1255,7 +1254,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } else rdev = conf->mirrors[devnum].rdev; - mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); if (replacement) r10_bio->devs[n_copy].repl_bio = mbio; else @@ -1263,7 +1262,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); - bio_set_dev(mbio, rdev->bdev); mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); if (!replacement && test_bit(FailFast, @@ -1812,7 +1810,8 @@ retry_discard: */ if (r10_bio->devs[disk].bio) { struct md_rdev *rdev = conf->mirrors[disk].rdev; - mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, + &mddev->bio_set); mbio->bi_end_io = raid10_end_discard_request; mbio->bi_private = r10_bio; r10_bio->devs[disk].bio = mbio; @@ -1825,7 +1824,8 @@ retry_discard: } if (r10_bio->devs[disk].repl_bio) { struct md_rdev *rrdev = conf->mirrors[disk].replacement; - rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, + &mddev->bio_set); rbio->bi_end_io = raid10_end_discard_request; rbio->bi_private = r10_bio; r10_bio->devs[disk].repl_bio = rbio; @@ -2892,12 +2892,12 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, + &mddev->bio_set); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + choose_data_offset(r10_bio, rdev); - bio_set_dev(wbio, rdev->bdev); bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (submit_bio_wait(wbio) < 0) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7c119208a214..8891aaba6596 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5438,14 +5438,14 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } - align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set); + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, + &mddev->io_acct_set); md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); raid_bio->bi_next = (void *)rdev; if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) md_io_acct->start_time = bio_start_io_acct(raid_bio); md_io_acct->orig_bio = raid_bio; - bio_set_dev(align_bio, rdev->bdev); align_bio->bi_end_io = raid5_align_endio; 
align_bio->bi_private = md_io_acct; align_bio->bi_iter.bi_sector = sector; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 421d921a0571..dee86911a4be 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3154,7 +3154,7 @@ struct bio *btrfs_bio_clone(struct bio *bio) struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); + new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(new); btrfs_bio_init(bbio); bbio->iter = bio->bi_iter; @@ -3169,7 +3169,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) ASSERT(offset <= UINT_MAX && size <= UINT_MAX); /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); bbio = btrfs_bio(bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index b814361c957b..7523aba4ddf7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -413,8 +413,10 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); -int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp); -extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs); +int bio_init_clone(struct block_device *bdev, struct bio *bio, + struct bio *bio_src, gfp_t gfp); extern struct bio_set fs_bio_set; From 3301bc53358a0eb0a0db65fd7a513cd4cb50c83a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 10 Jan 2022 15:29:45 +0800 Subject: [PATCH 047/127] lib/sbitmap: kill 'depth' from sbitmap_word Only the last sbitmap_word can have a different depth; all the others must have the same depth of 1U << sb->shift, so it is not necessary to store it in sbitmap_word, and it can be retrieved easily and efficiently with one internal helper, __map_depth(sb, index). Remove the 'depth' field from sbitmap_word; the ____cacheline_aligned_in_smp annotation for 'word' then isn't needed any more. No performance effect was seen when running a highly parallel IOPS test on null_blk. This saves one cacheline (usually 64 bytes) per sbitmap_word. Cc: Martin Wilck Signed-off-by: Ming Lei Reviewed-by: Martin Wilck Reviewed-by: John Garry Link: https://lore.kernel.org/r/20220110072945.347535-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 17 ++++++++++------- lib/sbitmap.c | 34 ++++++++++++++-------------------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 95df357ec009..df3b584b0f0c 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -27,15 +27,10 @@ struct seq_file; * struct sbitmap_word - Word in a &struct sbitmap.
*/ struct sbitmap_word { - /** - * @depth: Number of bits being used in @word/@cleared - */ - unsigned long depth; - /** * @word: word holding free bits */ - unsigned long word ____cacheline_aligned_in_smp; + unsigned long word; /** * @cleared: word holding cleared bits @@ -164,6 +159,14 @@ struct sbitmap_queue { int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint); +/* sbitmap internal helper */ +static inline unsigned int __map_depth(const struct sbitmap *sb, int index) +{ + if (index == sb->map_nr - 1) + return sb->depth - (index << sb->shift); + return 1U << sb->shift; +} + /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. @@ -251,7 +254,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb, while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, - sb->map[index].depth - nr, + __map_depth(sb, index) - nr, sb->depth - scanned); scanned += depth; diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 09d293c30fd2..b7cb96ae4701 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -85,7 +85,6 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, bool alloc_hint) { unsigned int bits_per_word; - unsigned int i; if (shift < 0) shift = sbitmap_calculate_shift(depth); @@ -117,10 +116,6 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, return -ENOMEM; } - for (i = 0; i < sb->map_nr; i++) { - sb->map[i].depth = min(depth, bits_per_word); - depth -= sb->map[i].depth; - } return 0; } EXPORT_SYMBOL_GPL(sbitmap_init_node); @@ -135,11 +130,6 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth) sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); - - for (i = 0; i < sb->map_nr; i++) { - sb->map[i].depth = min(depth, bits_per_word); - depth -= sb->map[i].depth; - } } EXPORT_SYMBOL_GPL(sbitmap_resize); @@ -184,8 +174,8 @@ static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index, int nr; do { - nr = __sbitmap_get_word(&map->word, map->depth, alloc_hint, - !sb->round_robin); + nr = __sbitmap_get_word(&map->word, __map_depth(sb, index), + alloc_hint, !sb->round_robin); if (nr != -1) break; if (!sbitmap_deferred_clear(map)) @@ -257,7 +247,9 @@ static int __sbitmap_get_shallow(struct sbitmap *sb, for (i = 0; i < sb->map_nr; i++) { again: nr = __sbitmap_get_word(&sb->map[index].word, - min(sb->map[index].depth, shallow_depth), + min_t(unsigned int, + __map_depth(sb, index), + shallow_depth), SB_NR_TO_BIT(sb, alloc_hint), true); if (nr != -1) { nr += index << sb->shift; @@ -315,11 +307,12 @@ static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; + unsigned int word_depth = __map_depth(sb, i); if (set) - weight += bitmap_weight(&word->word, word->depth); + weight += bitmap_weight(&word->word, word_depth); else - weight += bitmap_weight(&word->cleared, word->depth); + weight += bitmap_weight(&word->cleared, word_depth); } return weight; } @@ -367,7 +360,7 @@ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) for (i = 0; i < sb->map_nr; i++) { unsigned long word = READ_ONCE(sb->map[i].word); unsigned long cleared = READ_ONCE(sb->map[i].cleared); - unsigned int word_bits = READ_ONCE(sb->map[i].depth); + unsigned int word_bits = __map_depth(sb, i); word &= ~cleared; @@ -531,15 +524,16 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, for 
(i = 0; i < sb->map_nr; i++) { struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; + unsigned int map_depth = __map_depth(sb, index); sbitmap_deferred_clear(map); - if (map->word == (1UL << (map->depth - 1)) - 1) + if (map->word == (1UL << (map_depth - 1)) - 1) continue; - nr = find_first_zero_bit(&map->word, map->depth); - if (nr + nr_tags <= map->depth) { + nr = find_first_zero_bit(&map->word, map_depth); + if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; - int map_tags = min_t(int, nr_tags, map->depth); + int map_tags = min_t(int, nr_tags, map_depth); unsigned long val, ret; get_mask = ((1UL << map_tags) - 1) << nr; From 3f607293b74d6acb06571a774a500143c1f0ed2c Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 8 Feb 2022 20:07:04 +0800 Subject: [PATCH 048/127] sbitmap: Delete old sbitmap_queue_get_shallow() The sbitmap_queue_get_shallow() wrapper has not been used since it was introduced in commit c05e66733788 ("sbitmap: add sbitmap_get_shallow() operation"). Delete it and rename __sbitmap_queue_get_shallow() -> sbitmap_queue_get_shallow(), as it is odd to have a public __foo but no foo at all. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1644322024-105340-1-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- include/linux/sbitmap.h | 34 ++++------------------------------ lib/sbitmap.c | 6 +++--- 3 files changed, 8 insertions(+), 34 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 845f74e8dd7b..0fd409b8e86e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -107,7 +107,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, return BLK_MQ_NO_TAG; if (data->shallow_depth) - return __sbitmap_queue_get_shallow(bt, data->shallow_depth); + return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index df3b584b0f0c..dffeb8281c2d 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -135,7 +135,7 @@ struct sbitmap_queue { /** * @min_shallow_depth: The minimum shallow depth which may be passed to - * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). + * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; }; @@ -463,7 +463,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset); /** - * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from.
- * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to - * sbitmap_queue_clear()). - * @shallow_depth: The maximum number of bits to allocate from a single word. - * See sbitmap_get_shallow(). - * - * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after - * initializing @sbq. - * - * Return: Non-negative allocated bit number if successful, -1 otherwise. - */ -static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, - unsigned int *cpu, - unsigned int shallow_depth) -{ - int nr; - - *cpu = get_cpu(); - nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); - put_cpu(); - return nr; -} - /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. diff --git a/lib/sbitmap.c b/lib/sbitmap.c index b7cb96ae4701..2eb3de18ded3 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -557,14 +557,14 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, return 0; } -int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, - unsigned int shallow_depth) +int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth) { WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); return sbitmap_get_shallow(&sbq->sb, shallow_depth); } -EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow); +EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow); void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth) From d5869fdc189f0f12a954a48d58a48104a2f5d044 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 10 Feb 2022 14:52:22 -0800 Subject: [PATCH 049/127] block: introduce block_rq_error tracepoint Currently, rasdaemon uses the existing tracepoint block_rq_complete and filters out non-error cases in order to capture block disk errors. But there are a few problems with this approach: 1. Even though the kernel trace filter can do the filtering work, there is still some overhead once this tracepoint is enabled. 2. The filter is merely based on errno, which does not align with the kernel logic that checks errors for print_req_error(). 3. block_rq_complete only provides the dev major and minor to identify the block device, which is not convenient to use in user-space. So introduce a new tracepoint, block_rq_error, just for the error case. With this patch, rasdaemon could switch to block_rq_error. Since the new tracepoint has a similar implementation to block_rq_complete, move the existing code from TRACE_EVENT block_rq_complete() into the new event class block_rq_completion(), then define events for block_rq_complete and block_rq_error from the newly created event class, per the suggestion from Chaitanya Kulkarni.
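To make the overhead point concrete: a dedicated error event costs nothing on the common successful-completion path, whereas an enabled block_rq_complete still fires and evaluates its filter predicate on every completion. Condensed from the blk_update_request() hunk below, the new tracepoint is only reachable from the branch that already handles errors:

	if (unlikely(error && !blk_rq_is_passthrough(req) &&
		     !(req->rq_flags & RQF_QUIET))) {
		blk_print_req_error(req, error);
		trace_block_rq_error(req, error, nr_bytes);
	}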
Cc: Jens Axboe Cc: Christoph Hellwig Reviewed-by: Steven Rostedt Signed-off-by: Cong Wang Signed-off-by: Chaitanya Kulkarni Signed-off-by: Yang Shi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220210225222.260069-1-shy828301@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++- include/trace/events/block.h | 49 ++++++++++++++++++++++++++---------- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4b868e792ba4..6c59ffe765fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -789,8 +789,10 @@ bool blk_update_request(struct request *req, blk_status_t error, #endif if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) + !(req->rq_flags & RQF_QUIET))) { blk_print_req_error(req, error); + trace_block_rq_error(req, error, nr_bytes); + } blk_account_io_completion(req, nr_bytes); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 27170e40e8c9..7f4dfbdf12a6 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -100,19 +100,7 @@ TRACE_EVENT(block_rq_requeue, __entry->nr_sector, 0) ); -/** - * block_rq_complete - block IO operation completed by device driver - * @rq: block operations request - * @error: status code - * @nr_bytes: number of completed bytes - * - * The block_rq_complete tracepoint event indicates that some portion - * of operation request has been completed by the device driver. If - * the @rq->bio is %NULL, then there is absolutely no additional work to - * do for the request. If @rq->bio is non-NULL then there is - * additional work required to complete the request. - */ -TRACE_EVENT(block_rq_complete, +DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), @@ -144,6 +132,41 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector, __entry->error) ); +/** + * block_rq_complete - block IO operation completed by device driver + * @rq: block operations request + * @error: status code + * @nr_bytes: number of completed bytes + * + * The block_rq_complete tracepoint event indicates that some portion + * of operation request has been completed by the device driver. If + * the @rq->bio is %NULL, then there is absolutely no additional work to + * do for the request. If @rq->bio is non-NULL then there is + * additional work required to complete the request. + */ +DEFINE_EVENT(block_rq_completion, block_rq_complete, + + TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), + + TP_ARGS(rq, error, nr_bytes) +); + +/** + * block_rq_error - block IO operation error reported by device driver + * @rq: block operations request + * @error: status code + * @nr_bytes: number of completed bytes + * + * The block_rq_error tracepoint event indicates that some portion + * of operation request has failed as reported by the device driver. + */ +DEFINE_EVENT(block_rq_completion, block_rq_error, + + TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), + + TP_ARGS(rq, error, nr_bytes) +); + DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), From 0e51e2ab49a99bc5077760aa083dfa1c3bf9899b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Feb 2022 18:11:47 +0800 Subject: [PATCH 050/127] block: remove THROTL_IOPS_MAX No one uses THROTL_IOPS_MAX any more, so remove it. 
Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220211101149.2368042-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b4de2010fba5..bdc49bd4eef0 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -28,8 +28,6 @@ /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX #define FC_APPID_LEN 129 From 472e4314c039d6cf36e28783b1c63f87b5b394c2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Feb 2022 18:11:48 +0800 Subject: [PATCH 051/127] block: move initialization of q->blkg_list into blkcg_init_queue q->blkg_list is only used by blkcg code, so move it into blkcg_init_queue. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220211101149.2368042-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 ++ block/blk-core.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 671debbae941..35deaceba1f0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1175,6 +1175,8 @@ int blkcg_init_queue(struct request_queue *q) bool preloaded; int ret; + INIT_LIST_HEAD(&q->blkg_list); + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); if (!new_blkg) return -ENOMEM; diff --git a/block/blk-core.c b/block/blk-core.c index be8812f5489d..ff972b968f25 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -476,9 +476,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); -#ifdef CONFIG_BLK_CGROUP - INIT_LIST_HEAD(&q->blkg_list); -#endif kobject_init(&q->kobj, &blk_queue_ktype); From 672fdcf0e7de3b1e39416ac85abf178f023271f1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Feb 2022 18:11:49 +0800 Subject: [PATCH 052/127] block: partition include/linux/blk-cgroup.h Partition include/linux/blk-cgroup.h into two parts: one is public part, the other is block layer private part. Suggested by Christoph Hellwig. 
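As a sketch of the include pattern this split is aiming for (the file names here are illustrative; the actual conversions follow in the diff below): block-layer internals that need blkcg machinery such as struct blkcg_policy or blkg_lookup() switch to the new private header, while everything outside block/ keeps the slimmed-down public one:

	/* e.g. block/blk-iolatency.c: block-layer internal code */
	#include "blk-cgroup.h"		/* private: blkcg_policy, blkg_lookup(), ... */

	/* e.g. drivers or file systems outside block/ */
	#include <linux/blk-cgroup.h>	/* public: struct blkcg, blkcg_root_css, ... */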
Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220211101149.2368042-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 1 - block/bio.c | 2 +- block/blk-cgroup-rwstat.h | 2 +- block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 477 ++++++++++++++++++++++++++++++++++++ block/blk-core.c | 2 +- block/blk-crypto-fallback.c | 2 +- block/blk-iocost.c | 2 +- block/blk-iolatency.c | 2 +- block/blk-ioprio.c | 2 +- block/blk-sysfs.c | 2 +- block/blk-throttle.c | 1 - block/bounce.c | 2 +- block/elevator.c | 2 +- include/linux/blk-cgroup.h | 459 +--------------------------------- 15 files changed, 493 insertions(+), 467 deletions(-) create mode 100644 block/blk-cgroup.h diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 07288b9da389..72255ec44f8f 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -8,7 +8,6 @@ #include #include -#include #include "blk-cgroup-rwstat.h" diff --git a/block/bio.c b/block/bio.c index 18d34b33351b..b15f5466ce08 100644 --- a/block/bio.c +++ b/block/bio.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -24,6 +23,7 @@ #include #include "blk.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" struct bio_alloc_cache { struct bio *free_list; diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index ee746919c41f..9f2723b34b75 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -6,7 +6,7 @@ #ifndef _BLK_CGROUP_RWSTAT_H #define _BLK_CGROUP_RWSTAT_H -#include +#include "blk-cgroup.h" enum blkg_rwstat_type { BLKG_RWSTAT_READ, diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 35deaceba1f0..4108d445c73a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -26,11 +26,11 @@ #include #include #include -#include #include #include #include #include "blk.h" +#include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 000000000000..3e91803c4a55 --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,477 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BLK_CGROUP_PRIVATE_H +#define _BLK_CGROUP_PRIVATE_H +/* + * block cgroup private header + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include + +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + +#ifdef CONFIG_BLK_CGROUP + +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. + */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; +}; + +/* + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. 
+ * cpd_init() is invoked to let each policy handle per-blkcg data. + */ +struct blkcg_policy_data { + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; + int plid; +}; + +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, + struct request_queue *q, struct blkcg *blkcg); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); + +struct blkcg_policy { + int plid; + /* cgroup files for the policy */ + struct cftype *dfl_cftypes; + struct cftype *legacy_cftypes; + + /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; + blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; + + blkcg_pol_alloc_pd_fn *pd_alloc_fn; + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_free_pd_fn *pd_free_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + blkcg_pol_stat_pd_fn *pd_stat_fn; +}; + +extern struct blkcg blkcg_root; +extern bool blkcg_debug_stats; + +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); +int blkcg_init_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + +/* Blkio controller policy registration */ +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +const char *blkg_dev_name(struct blkcg_gq *blkg); +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); + +struct blkg_conf_ctx { + struct block_device *bdev; + struct blkcg_gq *blkg; + char *body; +}; + +struct block_device *blkcg_conf_open_bdev(char **inputp); +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static inline struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + +/** + * __bio_blkcg - internal, inconsistent version to get blkcg + * + * DO NOT USE. + * This function is inconsistent and consequently is dangerous to use. 
The + * first part of the function returns a blkcg where a reference is owned by the + * bio. This means it does not need to be rcu protected as it cannot go away + * with the bio owning a reference to it. However, the latter potentially gets + * it from task_css(). This can race against task migration and the cgroup + * dying. It is also semantically different as it must be called rcu protected + * and is susceptible to failure when trying to get a reference to it. + * Therefore, it is not ok to assume that *_get() will always succeed on the + * blkcg returned here. + */ +static inline struct blkcg *__bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; + return css_to_blkcg(blkcg_css()); +} + +/** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg + * @return: true if this bio needs to be submitted with the root blkg context. + * + * In order to avoid priority inversions we sometimes need to issue a bio as if + * it were attached to the root blkg, and then backcharge to the actual owning + * blkg. The idea is we do bio_blkcg() to look up the actual context for the + * bio and attach the appropriate blkg to the bio. Then we call this helper and + * if it is true run with the root blkg for that queue and then do any + * backcharging to the originating cgroup once the io is complete. + */ +static inline bool bio_issue_as_root_blkg(struct bio *bio) +{ + return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; +} + +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + if (blkcg == &blkcg_root) + return q->root_blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __blkg_lookup(blkcg, q, false); +} + +/** + * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair + * @q: request_queue of interest + * + * Lookup blkg for @q at the root level. See also blkg_lookup(). + */ +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ + return q->root_blkg; +} + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, + struct blkcg_policy *pol) +{ + return blkcg ? 
blkcg->cpd[pol->plid] : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? cpd->blkcg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + percpu_ref_get(&blkg->refcnt); +} + +/** + * blkg_tryget - try and get a blkg reference + * @blkg: blkg to get + * + * This is for use when doing an RCU lookup of the blkg. We may be in the midst + * of freeing this blkg, so we can only use it if the refcnt is not zero. + */ +static inline bool blkg_tryget(struct blkcg_gq *blkg) +{ + return blkg && percpu_ref_tryget(&blkg->refcnt); +} + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + percpu_ref_put(&blkg->refcnt); +} + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. 
+ */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +bool __blkcg_punt_bio_submit(struct bio *bio); + +static inline bool blkcg_punt_bio_submit(struct bio *bio) +{ + if (bio->bi_opf & REQ_CGROUP_PUNT) + return __blkcg_punt_bio_submit(bio); + else + return false; +} + +static inline void blkcg_bio_issue_init(struct bio *bio) +{ + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +} + +static inline void blkcg_use_delay(struct blkcg_gq *blkg) +{ + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; + if (atomic_add_return(1, &blkg->use_delay) == 1) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); +} + +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + if (WARN_ON_ONCE(old < 0)) + return 0; + if (old == 0) + return 0; + + /* + * We do this song and dance because we can race with somebody else + * adding or removing delay. If we just did an atomic_dec we'd end up + * negative and we'd already be in trouble. We need to subtract 1 and + * then check to see if we were the last delay so we can drop the + * congestion count on the cgroup. + */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); + if (cur == old) + break; + old = cur; + } + + if (old == 0) + return 0; + if (old == 1) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + return 1; +} + +/** + * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount + * @blkg: target blkg + * @delay: delay duration in nsecs + * + * When enabled with this function, the delay is not decayed and must be + * explicitly cleared with blkcg_clear_delay(). Must not be mixed with + * blkcg_[un]use_delay() and blkcg_add_delay() usages. + */ +static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person setting the congestion count for this blkg. */ + if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + + atomic64_set(&blkg->delay_nsec, delay); +} + +/** + * blkcg_clear_delay - Disable allocator delay mechanism + * @blkg: target blkg + * + * Disable use_delay mechanism. See blkcg_set_delay(). + */ +static inline void blkcg_clear_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person clearing the congestion count for this blkg. 
*/ + if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); +} + +void blk_cgroup_bio_start(struct bio *bio); +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); +#else /* CONFIG_BLK_CGROUP */ + +struct blkg_policy_data { +}; + +struct blkcg_policy_data { +}; + +struct blkcg_policy { +}; + +#ifdef CONFIG_BLOCK + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } +static inline void blkcg_bio_issue_init(struct bio *bio) { } +static inline void blk_cgroup_bio_start(struct bio *bio) { } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _BLK_CGROUP_PRIVATE_H */ diff --git a/block/blk-core.c b/block/blk-core.c index ff972b968f25..5a4a59041629 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -49,6 +48,7 @@ #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" +#include "blk-cgroup.h" #include "blk-throttle.h" struct dentry *blk_debugfs_root; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c87aba8584c6..18c8eafe20b9 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -10,7 +10,6 @@ #define pr_fmt(fmt) "blk-crypto-fallback: " fmt #include -#include #include #include #include @@ -20,6 +19,7 @@ #include #include +#include "blk-cgroup.h" #include "blk-crypto-internal.h" static unsigned int num_prealloc_bounce_pg = 32; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 769b64394298..70a0a3d680a3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -178,12 +178,12 @@ #include #include #include -#include #include #include #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" +#include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 6593c7123b97..010e658d44a8 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -74,9 +74,9 @@ #include #include #include -#include #include "blk-rq-qos.h" #include "blk-stat.h" +#include "blk-cgroup.h" #include "blk.h" #define DEFAULT_SCALE_COOKIE 1000000U diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 2e7f10e1c03f..79e797f5d194 100644 --- 
a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -12,11 +12,11 @@ * Documentation/admin-guide/cgroup-v2.rst. */ -#include <linux/blk-cgroup.h> #include #include #include #include +#include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9f32882ceb2f..4c6b7dff71e5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -10,7 +10,6 @@ #include #include #include -#include <linux/blk-cgroup.h> #include #include "blk.h" @@ -18,6 +17,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" +#include "blk-cgroup.h" #include "blk-throttle.h" struct queue_sysfs_entry { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7c462c006b26..73640d80e99e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,7 +10,6 @@ #include #include #include -#include <linux/blk-cgroup.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" diff --git a/block/bounce.c b/block/bounce.c index 3fd3bc6fd5db..3d50d19cde72 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -14,7 +14,6 @@ #include #include #include -#include <linux/blk-cgroup.h> #include #include #include @@ -24,6 +23,7 @@ #include #include "blk.h" +#include "blk-cgroup.h" #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 diff --git a/block/elevator.c b/block/elevator.c index ec98aed39c4f..6847ab6e7aa5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,7 +35,6 @@ #include #include #include -#include <linux/blk-cgroup.h> #include @@ -44,6 +43,7 @@ #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-wbt.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bdc49bd4eef0..f2ad8ed8f777 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -25,12 +25,8 @@ #include #include -/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ -#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) - #define FC_APPID_LEN 129 - #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { @@ -42,6 +38,7 @@ enum blkg_iostat_type { }; struct blkcg_gq; +struct blkg_policy_data; struct blkcg { struct cgroup_subsys_state css; @@ -74,36 +71,6 @@ struct blkg_iostat_set { struct blkg_iostat last; }; -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each blkg:policy pair is - * represented by a blkg_policy_data which is allocated and freed by each - * policy's pd_alloc/free_fn() methods. A policy can allocate private data - * area by allocating larger data structure which embeds blkg_policy_data - * at the beginning. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; -}; - -/* - * Policies that need to keep per-blkcg data which is independent from any - * request_queue associated to it should implement cpd_alloc/free_fn() - * methods. A policy can allocate private data area by allocating larger - * data structure which embeds blkcg_policy_data at the beginning. - * cpd_init() is invoked to let each policy handle per-blkcg data.
- */ -struct blkcg_policy_data { - /* the blkcg and policy id this per-policy data belongs to */ - struct blkcg *blkcg; - int plid; -}; - /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ @@ -139,120 +106,17 @@ struct blkcg_gq { struct rcu_head rcu_head; }; -typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); -typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); -typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); -typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); -typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, - struct request_queue *q, struct blkcg *blkcg); -typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, - struct seq_file *s); - -struct blkcg_policy { - int plid; - /* cgroup files for the policy */ - struct cftype *dfl_cftypes; - struct cftype *legacy_cftypes; - - /* operations */ - blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; - blkcg_pol_init_cpd_fn *cpd_init_fn; - blkcg_pol_free_cpd_fn *cpd_free_fn; - blkcg_pol_bind_cpd_fn *cpd_bind_fn; - - blkcg_pol_alloc_pd_fn *pd_alloc_fn; - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_free_pd_fn *pd_free_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; - blkcg_pol_stat_pd_fn *pd_stat_fn; -}; - -extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -extern bool blkcg_debug_stats; -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); -int blkcg_init_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -const char *blkg_dev_name(struct blkcg_gq *blkg); -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); - -struct blkg_conf_ctx { - struct block_device *bdev; - struct blkcg_gq *blkg; - char *body; -}; - -struct block_device *blkcg_conf_open_bdev(char **inputp); -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. 
- */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} +void blkcg_destroy_blkgs(struct blkcg *blkcg); +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_maybe_throttle_current(void); static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -/** - * __bio_blkcg - internal, inconsistent version to get blkcg - * - * DO NOT USE. - * This function is inconsistent and consequently is dangerous to use. The - * first part of the function returns a blkcg where a reference is owned by the - * bio. This means it does not need to be rcu protected as it cannot go away - * with the bio owning a reference to it. However, the latter potentially gets - * it from task_css(). This can race against task migration and the cgroup - * dying. It is also semantically different as it must be called rcu protected - * and is susceptible to failure when trying to get a reference to it. - * Therefore, it is not ok to assume that *_get() will always succeed on the - * blkcg returned here. - */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - /** * bio_blkcg - grab the blkcg associated with a bio * @bio: target bio @@ -288,22 +152,6 @@ static inline bool blk_cgroup_congested(void) return ret; } -/** - * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg - * @return: true if this bio needs to be submitted with the root blkg context. - * - * In order to avoid priority inversions we sometimes need to issue a bio as if - * it were attached to the root blkg, and then backcharge to the actual owning - * blkg. The idea is we do bio_blkcg() to look up the actual context for the - * bio and attach the appropriate blkg to the bio. Then we call this helper and - * if it is true run with the root blkg for that queue and then do any - * backcharging to the originating cgroup once the io is complete. - */ -static inline bool bio_issue_as_root_blkg(struct bio *bio) -{ - return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -315,96 +163,6 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) return css_to_blkcg(blkcg->css.parent); } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, - bool update_hint) -{ - struct blkcg_gq *blkg; - - if (blkcg == &blkcg_root) - return q->root_blkg; - - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - - return blkg_lookup_slowpath(blkcg, q, update_hint); -} - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. 
This function should be called - * under RCU read lock. - */ -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __blkg_lookup(blkcg, q, false); -} - -/** - * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair - * @q: request_queue of interest - * - * Lookup blkg for @q at the root level. See also blkg_lookup(). - */ -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ - return q->root_blkg; -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, - struct blkcg_policy *pol) -{ - return blkcg ? blkcg->cpd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) -{ - return cpd ? cpd->blkcg : NULL; -} - -extern void blkcg_destroy_blkgs(struct blkcg *blkcg); - /** * blkcg_pin_online - pin online state * @blkcg: blkcg of interest @@ -437,231 +195,24 @@ static inline void blkcg_unpin_online(struct blkcg *blkcg) } while (blkcg); } -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - percpu_ref_get(&blkg->refcnt); -} - -/** - * blkg_tryget - try and get a blkg reference - * @blkg: blkg to get - * - * This is for use when doing an RCU lookup of the blkg. We may be in the midst - * of freeing this blkg, so we can only use it if the refcnt is not zero. - */ -static inline bool blkg_tryget(struct blkcg_gq *blkg) -{ - return blkg && percpu_ref_tryget(&blkg->refcnt); -} - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - percpu_ref_put(&blkg->refcnt); -} - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. 
- */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -bool __blkcg_punt_bio_submit(struct bio *bio); - -static inline bool blkcg_punt_bio_submit(struct bio *bio) -{ - if (bio->bi_opf & REQ_CGROUP_PUNT) - return __blkcg_punt_bio_submit(bio); - else - return false; -} - -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - -static inline void blkcg_use_delay(struct blkcg_gq *blkg) -{ - if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) - return; - if (atomic_add_return(1, &blkg->use_delay) == 1) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); -} - -static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) -{ - int old = atomic_read(&blkg->use_delay); - - if (WARN_ON_ONCE(old < 0)) - return 0; - if (old == 0) - return 0; - - /* - * We do this song and dance because we can race with somebody else - * adding or removing delay. If we just did an atomic_dec we'd end up - * negative and we'd already be in trouble. We need to subtract 1 and - * then check to see if we were the last delay so we can drop the - * congestion count on the cgroup. - */ - while (old) { - int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); - if (cur == old) - break; - old = cur; - } - - if (old == 0) - return 0; - if (old == 1) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); - return 1; -} - -/** - * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount - * @blkg: target blkg - * @delay: delay duration in nsecs - * - * When enabled with this function, the delay is not decayed and must be - * explicitly cleared with blkcg_clear_delay(). Must not be mixed with - * blkcg_[un]use_delay() and blkcg_add_delay() usages. - */ -static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) -{ - int old = atomic_read(&blkg->use_delay); - - /* We only want 1 person setting the congestion count for this blkg. */ - if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); - - atomic64_set(&blkg->delay_nsec, delay); -} - -/** - * blkcg_clear_delay - Disable allocator delay mechanism - * @blkg: target blkg - * - * Disable use_delay mechanism. See blkcg_set_delay(). - */ -static inline void blkcg_clear_delay(struct blkcg_gq *blkg) -{ - int old = atomic_read(&blkg->use_delay); - - /* We only want 1 person clearing the congestion count for this blkg. 
*/ - if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); -} - -void blk_cgroup_bio_start(struct bio *bio); -void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); -void blkcg_maybe_throttle_current(void); #else /* CONFIG_BLK_CGROUP */ struct blkcg { }; -struct blkg_policy_data { -}; - -struct blkcg_policy_data { -}; - struct blkcg_gq { }; -struct blkcg_policy { -}; - #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } #ifdef CONFIG_BLOCK - static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } - -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } +#endif /* CONFIG_BLOCK */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } -static inline void blkcg_bio_issue_init(struct bio *bio) { } -static inline void blk_cgroup_bio_start(struct bio *bio) { } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_BLK_CGROUP_FC_APPID From 2e2f0199a20780463945e1cbffb3a191fa84bd9d Mon Sep 17 00:00:00 2001 From: Barry Song <21cnbao@gmail.com> Date: Mon, 7 Feb 2022 15:49:31 +0800 Subject: [PATCH 053/127] docs: block: biodoc.rst: Drop the obsolete and incorrect content Since commit 7eaceaccab5f ("block: remove per-queue plugging"), the kernel has removed blk_run_address_space(), blk_unplug() and sync_buffer(), and moved to on-stack plugging. The document has been obsolete for years. Given that there are no obvious counterparts in the new mechanism to replace the old APIs, this patch drops the content directly.
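For readers unfamiliar with it, "on-stack plugging" refers to the blk_plug interface that replaced the old per-queue unplug calls: a submitter declares a plug on its own stack, batches bios behind it, and flushes them when done. A minimal editor's sketch (the submit_batch() helper is illustrative, not part of this patch)::

    /*
     * Illustrative only: batching bios behind an on-stack plug.
     * Requests queued while the plug is held may be merged before
     * blk_finish_plug() hands them to the driver.
     */
    #include <linux/bio.h>
    #include <linux/blkdev.h>

    static void submit_batch(struct bio *bios[], int nr)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);
            for (i = 0; i < nr; i++)
                    submit_bio(bios[i]);
            blk_finish_plug(&plug);
    }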
Signed-off-by: Barry Song Link: https://lore.kernel.org/r/20220207074931.20067-1-song.bao.hua@hisilicon.com Signed-off-by: Jens Axboe --- Documentation/block/biodoc.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst index 4fbc367e62f9..fdebc0fd8468 100644 --- a/Documentation/block/biodoc.rst +++ b/Documentation/block/biodoc.rst @@ -965,11 +965,7 @@ till it fills up with a few more requests, before starting to service the requests. This provides an opportunity to merge/sort the requests before passing them down to the device. There are various conditions when the queue is unplugged (to open up the flow again), either through a scheduled task or -could be on demand. For example wait_on_buffer sets the unplugging going -through sync_buffer() running blk_run_address_space(mapping). Or the caller -can do it explicitly through blk_unplug(bdev). So in the read case, -the queue gets explicitly unplugged as part of waiting for completion on that -buffer. +could be on demand. Aside: This is kind of controversial territory, as it's not clear if plugging is From 69591a402d18fd60c8c95b061279f17bf0b78129 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 09:10:47 +0100 Subject: [PATCH 054/127] block: remove biodoc.rst This document is completely out of date and extremely misleading. In general the existing kerneldoc comments serve as much better documentation of the still existing functionality, while the history blurbs are pretty much irrelevant today. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220215081047.3693582-1-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/biodoc.rst | 1155 -------------------------------- Documentation/block/index.rst | 1 - 2 files changed, 1156 deletions(-) delete mode 100644 Documentation/block/biodoc.rst diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst deleted file mode 100644 index fdebc0fd8468..000000000000 --- a/Documentation/block/biodoc.rst +++ /dev/null @@ -1,1155 +0,0 @@ -===================================================== -Notes on the Generic Block Layer Rewrite in Linux 2.5 -===================================================== - -.. note:: - - It seems that there is a lot of outdated stuff here. This seems - to be written somewhat as a task list. Yet, eventually, something - here might still be useful. - -Notes Written on Jan 15, 2002: - - Jens Axboe - - Suparna Bhattacharya - -Last Updated May 2, 2002 - -September 2003: Updated I/O Scheduler portions - - Nick Piggin - -Introduction -============ - -These are some notes describing some aspects of the 2.5 block layer in the -context of the bio rewrite. The idea is to bring out some of the key -changes and a glimpse of the rationale behind those changes. - -Please mail corrections & suggestions to suparna@in.ibm.com. - -Credits -======= - -2.5 bio rewrite: - - Jens Axboe - -Many aspects of the generic block layer redesign were driven by and evolved -over discussions, prior patches and the collective experience of several -people. See sections 8 and 9 for a list of some related references. - -The following people helped with review comments and inputs for this -document: - - - Christoph Hellwig - - Arjan van de Ven - - Randy Dunlap - - Andre Hedrick - -The following people helped with fixes/contributions to the bio patches -while it was still work-in-progress: - - - David S. Miller - - -..
Description of Contents: - - 1. Scope for tuning of logic to various needs - 1.1 Tuning based on device or low level driver capabilities - - Per-queue parameters - - Highmem I/O support - - I/O scheduler modularization - 1.2 Tuning based on high level requirements/capabilities - 1.2.1 Request Priority/Latency - 1.3 Direct access/bypass to lower layers for diagnostics and special - device operations - 1.3.1 Pre-built commands - 2. New flexible and generic but minimalist i/o structure or descriptor - (instead of using buffer heads at the i/o layer) - 2.1 Requirements/Goals addressed - 2.2 The bio struct in detail (multi-page io unit) - 2.3 Changes in the request structure - 3. Using bios - 3.1 Setup/teardown (allocation, splitting) - 3.2 Generic bio helper routines - 3.2.1 Traversing segments and completion units in a request - 3.2.2 Setting up DMA scatterlists - 3.2.3 I/O completion - 3.2.4 Implications for drivers that do not interpret bios (don't handle - multiple segments) - 3.3 I/O submission - 4. The I/O scheduler - 5. Scalability related changes - 5.1 Granular locking: Removal of io_request_lock - 5.2 Prepare for transition to 64 bit sector_t - 6. Other Changes/Implications - 6.1 Partition re-mapping handled by the generic block layer - 7. A few tips on migration of older drivers - 8. A list of prior/related/impacted patches/ideas - 9. Other References/Discussion Threads - - -Bio Notes -========= - -Let us discuss the changes in the context of how some overall goals for the -block layer are addressed. - -1. Scope for tuning the generic logic to satisfy various requirements -===================================================================== - -The block layer design supports adaptable abstractions to handle common -processing with the ability to tune the logic to an appropriate extent -depending on the nature of the device and the requirements of the caller. -One of the objectives of the rewrite was to increase the degree of tunability -and to enable higher level code to utilize underlying device/driver -capabilities to the maximum extent for better i/o performance. This is -important especially in the light of ever improving hardware capabilities -and application/middleware software designed to take advantage of these -capabilities. - -1.1 Tuning based on low level device / driver capabilities ----------------------------------------------------------- - -Sophisticated devices with large built-in caches, intelligent i/o scheduling -optimizations, high memory DMA support, etc may find some of the -generic processing an overhead, while for less capable devices the -generic functionality is essential for performance or correctness reasons. -Knowledge of some of the capabilities or parameters of the device should be -used at the generic block layer to take the right decisions on -behalf of the driver. - -How is this achieved ? - -Tuning at a per-queue level: - -i. Per-queue limits/values exported to the generic layer by the driver - -Various parameters that the generic i/o scheduler logic uses are set at -a per-queue level (e.g maximum request size, maximum number of segments in -a scatter-gather list, logical block size) - -Some parameters that were earlier available as global arrays indexed by -major/minor are now directly associated with the queue. Some of these may -move into the block device structure in the future. Some characteristics -have been incorporated into a queue flags field rather than separate fields -in themselves. 
There are blk_queue_xxx functions to set the parameters, -rather than update the fields directly - -Some new queue property settings: - - blk_queue_bounce_limit(q, u64 dma_address) - Enable I/O to highmem pages, dma_address being the - limit. No highmem default. - - blk_queue_max_sectors(q, max_sectors) - Sets two variables that limit the size of the request. - - - The request queue's max_sectors, which is a soft size in - units of 512 byte sectors, and could be dynamically varied - by the core kernel. - - - The request queue's max_hw_sectors, which is a hard limit - and reflects the maximum size request a driver can handle - in units of 512 byte sectors. - - The default for both max_sectors and max_hw_sectors is - 255. The upper limit of max_sectors is 1024. - - blk_queue_max_phys_segments(q, max_segments) - Maximum physical segments you can handle in a request. 128 - default (driver limit). (See 3.2.2) - - blk_queue_max_hw_segments(q, max_segments) - Maximum dma segments the hardware can handle in a request. 128 - default (host adapter limit, after dma remapping). - (See 3.2.2) - - blk_queue_max_segment_size(q, max_seg_size) - Maximum size of a clustered segment, 64kB default. - - blk_queue_logical_block_size(q, logical_block_size) - Lowest possible sector size that the hardware can operate - on, 512 bytes default. - -New queue flags: - - - QUEUE_FLAG_CLUSTER (see 3.2.2) - - QUEUE_FLAG_QUEUED (see 3.2.4) - - -ii. High-mem i/o capabilities are now considered the default - -The generic bounce buffer logic, present in 2.4, where the block layer would -by default copyin/out i/o requests on high-memory buffers to low-memory buffers -assuming that the driver wouldn't be able to handle it directly, has been -changed in 2.5. The bounce logic is now applied only for memory ranges -for which the device cannot handle i/o. A driver can specify this by -setting the queue bounce limit for the request queue for the device -(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out -where a device is capable of handling high memory i/o. - -In order to enable high-memory i/o where the device is capable of supporting -it, the pci dma mapping routines and associated data structures have now been -modified to accomplish a direct page -> bus translation, without requiring -a virtual address mapping (unlike the earlier scheme of virtual address --> bus translation). So this works uniformly for high-memory pages (which -do not have a corresponding kernel virtual address space mapping) and -low-memory pages. - -Note: Please refer to Documentation/core-api/dma-api-howto.rst for a discussion -on PCI high mem DMA aspects and mapping of scatter gather lists, and support -for 64 bit PCI. - -Special handling is required only for cases where i/o needs to happen on -pages at physical memory addresses beyond what the device can support. In these -cases, a bounce bio representing a buffer from the supported memory range -is used for performing the i/o with copyin/copyout as needed depending on -the type of the operation. For example, in case of a read operation, the -data read has to be copied to the original buffer on i/o completion, so a -callback routine is set up to do this, while for write, the data is copied -from the original buffer to the bounce buffer prior to issuing the -operation. 
Since an original buffer may be in a high memory area that's not -mapped in kernel virtual addr, a kmap operation may be required for -performing the copy, and special care may be needed in the completion path -as it may not be in irq context. Special care is also required (by way of -GFP flags) when allocating bounce buffers, to avoid certain highmem -deadlock possibilities. - -It is also possible that a bounce buffer may be allocated from high-memory -area that's not mapped in kernel virtual addr, but within the range that the -device can use directly; so the bounce page may need to be kmapped during -copy operations. [Note: This does not hold in the current implementation, -though] - -There are some situations when pages from high memory may need to -be kmapped, even if bounce buffers are not necessary. For example a device -may need to abort DMA operations and revert to PIO for the transfer, in -which case a virtual mapping of the page is required. For SCSI it is also -done in some scenarios where the low level driver cannot be trusted to -handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions as appropriate. A driver could also use -the blk_queue_bounce() routine on its own to bounce highmem i/o to low -memory for specific requests if so desired. - -iii. The i/o scheduler algorithm itself can be replaced/set as appropriate - -As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular -queue or pick from (copy) existing generic schedulers and replace/override -certain portions of it. The 2.5 rewrite provides improved modularization -of the i/o scheduler. There are more pluggable callbacks, e.g for init, -add request, extract request, which makes it possible to abstract specific -i/o scheduling algorithm aspects and details outside of the generic loop. -It also makes it possible to completely hide the implementation details of -the i/o scheduler from block drivers. - -I/O scheduler wrappers are to be used instead of accessing the queue directly. -See section 4. The I/O scheduler for details. - -1.2 Tuning Based on High level code capabilities ------------------------------------------------- - -i. Application capabilities for raw i/o - -This comes from some of the high-performance database/middleware -requirements where an application prefers to make its own i/o scheduling -decisions based on an understanding of the access patterns and i/o -characteristics - -ii. High performance filesystems or other higher level kernel code's -capabilities - -Kernel components like filesystems could also take their own i/o scheduling -decisions for optimizing performance. Journalling filesystems may need -some control over i/o ordering. - -What kind of support exists at the generic block layer for this ? - -The flags and rw fields in the bio structure can be used for some tuning -from above e.g indicating that an i/o is just a readahead request, or priority -settings (currently unused). As far as user applications are concerned they -would need an additional mechanism either via open flags or ioctls, or some -other upper level mechanism to communicate such settings to block. - -1.2.1 Request Priority/Latency -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Todo/Under discussion:: - - Arjan's proposed request priority scheme allows higher levels some broad - control (high/med/low) over the priority of an i/o request vs other pending - requests in the queue. 
For example it allows reads for bringing in an - executable page on demand to be given a higher priority over pending write - requests which haven't aged too much on the queue. Potentially this priority - could even be exposed to applications in some manner, providing higher level - tunability. Time based aging avoids starvation of lower priority - requests. Some bits in the bi_opf flags field in the bio structure are - intended to be used for this priority information. - - -1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode) ------------------------------------------------------------------------ - -(e.g Diagnostics, Systems Management) - -There are situations where high-level code needs to have direct access to -the low level device capabilities or requires the ability to issue commands -to the device bypassing some of the intermediate i/o layers. -These could, for example, be special control commands issued through ioctl -interfaces, or could be raw read/write commands that stress the drive's -capabilities for certain kinds of fitness tests. Having direct interfaces at -multiple levels without having to pass through upper layers makes -it possible to perform bottom up validation of the i/o path, layer by -layer, starting from the media. - -The normal i/o submission interfaces, e.g submit_bio, could be bypassed -for specially crafted requests which such ioctl or diagnostics -interfaces would typically use, and the elevator add_request routine -can instead be used to directly insert such requests in the queue or preferably -the blk_do_rq routine can be used to place the request on the queue and -wait for completion. Alternatively, sometimes the caller might just -invoke a lower level driver specific interface with the request as a -parameter. - -If the request is a means for passing on special information associated with -the command, then such information is associated with the request->special -field (rather than misuse the request->buffer field which is meant for the -request data buffer's virtual mapping). - -For passing request data, the caller must build up a bio descriptor -representing the concerned memory buffer if the underlying driver interprets -bio segments or uses the block layer end*request* functions for i/o -completion. Alternatively one could directly use the request->buffer field to -specify the virtual address of the buffer, if the driver expects buffer -addresses passed in this way and ignores bio entries for the request type -involved. In the latter case, the driver would modify and manage the -request->buffer, request->sector and request->nr_sectors or -request->current_nr_sectors fields itself rather than using the block layer -end_request or end_that_request_first completion interfaces. -(See 2.3 or Documentation/block/request.rst for a brief explanation of -the request structure fields) - -:: - - [TBD: end_that_request_last should be usable even in this case; - Perhaps an end_that_direct_request_first routine could be implemented to make - handling direct requests easier for such drivers; Also for drivers that - expect bios, a helper function could be provided for setting up a bio - corresponding to a data buffer] - - - - - -1.3.1 Pre-built Commands -^^^^^^^^^^^^^^^^^^^^^^^^ - -A request can be created with a pre-built custom command to be sent directly -to the device. The cmd block in the request structure has room for filling -in the command bytes. 
(i.e rq->cmd is now 16 bytes in size, and meant for -command pre-building, and the type of the request is now indicated -through rq->flags instead of via rq->cmd) - -The request structure flags can be set up to indicate the type of request -in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC: -packet command issued via blk_do_rq, REQ_SPECIAL: special request). - -It can help to pre-build device commands for requests in advance. -Drivers can now specify a request prepare function (q->prep_rq_fn) that the -block layer would invoke to pre-build device commands for a given request, -or perform other preparatory processing for the request. This routine is -called by elv_next_request(), i.e. typically just before servicing a request. -(The prepare function would not be called for requests that have RQF_DONTPREP -enabled) - -Aside: - Pre-building could possibly even be done early, i.e before placing the - request on the queue, rather than construct the command on the fly in the - driver while servicing the request queue when it may affect latencies in - interrupt context or responsiveness in general. One way to add early - pre-building would be to do it whenever we fail to merge on a request. - Now REQ_NOMERGE is set in the request flags to skip this one in the future, - which means that it will not change before we feed it to the device. So - the pre-builder hook can be invoked there. - - -2. Flexible and generic but minimalist i/o structure/descriptor -=============================================================== - -2.1 Reason for a new structure and requirements addressed --------------------------------------------------------- - -Prior to 2.5, buffer heads were used as the unit of i/o at the generic block -layer, and the low level request structure was associated with a chain of -buffer heads for a contiguous i/o request. This led to certain inefficiencies -when it came to large i/o requests and readv/writev style operations, as it -forced such requests to be broken up into small chunks before being passed -on to the generic block layer, only to be merged by the i/o scheduler -when the underlying device was capable of handling the i/o in one shot. -Also, using the buffer head as an i/o structure for i/os that didn't originate -from the buffer cache unnecessarily added to the weight of the descriptors -which were generated for each such chunk. - -The following were some of the goals and expectations considered in the -redesign of the block i/o data structure in 2.5. - -1. Should be appropriate as a descriptor for both raw and buffered i/o - - avoid cache related fields which are irrelevant in the direct/page i/o path, - or filesystem block size alignment restrictions which may not be relevant - for raw i/o. -2. Ability to represent high-memory buffers (which do not have a virtual - address mapping in kernel address space). -3. Ability to represent large i/os w/o unnecessarily breaking them up (i.e - greater than PAGE_SIZE chunks in one shot) -4. At the same time, ability to retain independent identity of i/os from - different sources or i/o units requiring individual completion (e.g. for - latency reasons) -5. Ability to represent an i/o involving multiple physical memory segments - (including non-page aligned page fragments, as specified via readv/writev) - without unnecessarily breaking it up, if the underlying device is capable of - handling it. -6.
Preferably should be based on a memory descriptor structure that can be - passed around different types of subsystems or layers, maybe even - networking, without duplication or extra copies of data/descriptor fields - themselves in the process -7. Ability to handle the possibility of splits/merges as the structure passes - through layered drivers (lvm, md, evms), with minimal overhead. - -The solution was to define a new structure (bio) for the block layer, -instead of using the buffer head structure (bh) directly, the idea being -avoidance of some associated baggage and limitations. The bio structure -is uniformly used for all i/o at the block layer ; it forms a part of the -bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are -mapped to bio structures. - -2.2 The bio struct ------------------- - -The bio structure uses a vector representation pointing to an array of tuples -of <page, offset, len> to describe the i/o buffer, and has various other -fields describing i/o parameters and state that needs to be maintained for -performing the i/o. - -Notice that this representation means that a bio has no virtual address -mapping at all (unlike buffer heads). - -:: - - struct bio_vec { - struct page *bv_page; - unsigned short bv_len; - unsigned short bv_offset; - }; - - /* - * main unit of I/O for the block layer and lower layers (ie drivers) - */ - struct bio { - struct bio *bi_next; /* request queue link */ - struct block_device *bi_bdev; /* target device */ - unsigned long bi_flags; /* status, command, etc */ - unsigned long bi_opf; /* low bits: r/w, high: priority */ - - unsigned int bi_vcnt; /* how many bio_vec's */ - struct bvec_iter bi_iter; /* current index into bio_vec array */ - - unsigned int bi_size; /* total size in bytes */ - unsigned short bi_hw_segments; /* segments after DMA remapping */ - unsigned int bi_max; /* max bio_vecs we can hold - used as index into pool */ - struct bio_vec *bi_io_vec; /* the actual vec list */ - bio_end_io_t *bi_end_io; /* bi_end_io (bio) */ - atomic_t bi_cnt; /* pin count: free when it hits zero */ - void *bi_private; - }; - -With this multipage bio design: - -- Large i/os can be sent down in one go using a bio_vec list consisting - of an array of <page, offset, len> fragments (similar to the way fragments - are represented in the zero-copy network code) -- Splitting of an i/o request across multiple devices (as in the case of - lvm or raid) is achieved by cloning the bio (where the clone points to - the same bi_io_vec array, but with the index and size accordingly modified) -- A linked list of bios is used as before for unrelated merges [#]_ - this - avoids reallocs and makes independent completions easier to handle. -- Code that traverses the req list can find all the segments of a bio - by using rq_for_each_segment. This handles the fact that a request - has multiple bios, each of which can have multiple segments. -- Drivers which can't process a large bio in one shot can use the bi_iter - field to keep track of the next bio_vec entry to process. - (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE) - [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying - bi_offset and len fields] - -.. [#] - - unrelated merges -- a request ends up containing two or more bios that - didn't originate from the same place. - -bi_end_io() i/o callback gets called on i/o completion of the entire bio. - -At a lower level, drivers build a scatter gather list from the merged bios.
-The scatter gather list is in the form of an array of <page, offset, len> -entries with their corresponding dma address mappings filled in at the -appropriate time. As an optimization, contiguous physical pages can be -covered by a single entry where <page> refers to the first page and <len> -covers the range of pages (up to 16 contiguous pages could be covered this -way). There is a helper routine (blk_rq_map_sg) which drivers can use to build -the sg list. - -Note: Right now the only user of bios with more than one page is ll_rw_kio, -which in turn means that only raw I/O uses it (direct i/o may not work -right now). The intent however is to enable clustering of pages etc to -become possible. The pagebuf abstraction layer from SGI also uses multi-page -bios, but that is currently not included in the stock development kernels. -The same is true of Andrew Morton's work-in-progress multipage bio writeout -and readahead patches. - -2.3 Changes in the Request Structure ------------------------------------- - -The request structure is the structure that gets passed down to low level -drivers. The block layer make_request function builds up a request structure, -places it on the queue and invokes the drivers request_fn. The driver makes -use of block layer helper routine elv_next_request to pull the next request -off the queue. Control or diagnostic functions might bypass block and directly -invoke underlying driver entry points passing in a specially constructed -request structure. - -Only some relevant fields (mainly those which changed or may be referred -to in some of the discussion here) are listed below, not necessarily in -the order in which they occur in the structure (see include/linux/blkdev.h) -Refer to Documentation/block/request.rst for details about all the request -structure fields and a quick reference about the layers which are -supposed to use or modify those fields:: - - struct request { - struct list_head queuelist; /* Not meant to be directly accessed by - the driver. - Used by q->elv_next_request_fn - rq->queue is gone - */ - . - . - unsigned char cmd[16]; /* prebuilt command data block */ - unsigned long flags; /* also includes earlier rq->cmd settings */ - . - . - sector_t sector; /* this field is now of type sector_t instead of int - preparation for 64 bit sectors */ - . - . - - /* Number of scatter-gather DMA addr+len pairs after - * physical address coalescing is performed. - */ - unsigned short nr_phys_segments; - - /* Number of scatter-gather addr+len pairs after - * physical and DMA remapping hardware coalescing is performed. - * This is the number of scatter-gather entries the driver - * will actually have to deal with after DMA mapping is done. - */ - unsigned short nr_hw_segments; - - /* Various sector counts */ - unsigned long nr_sectors; /* no. of sectors left: driver modifiable */ - unsigned long hard_nr_sectors; /* block internal copy of above */ - unsigned int current_nr_sectors; /* no. of sectors left in the - current segment:driver modifiable */ - unsigned long hard_cur_sectors; /* block internal copy of the above */ - . - . - int tag; /* command tag associated with request */ - void *special; /* same as before */ - char *buffer; /* valid only for low memory buffers up to - current_nr_sectors */ - . - . - struct bio *bio, *biotail; /* bio list instead of bh */ - struct request_list *rl; - } - -See the req_ops and req_flag_bits definitions for an explanation of the various -flags available. Some bits are used by the block layer or i/o scheduler.
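As an editor's aside (not from the original file): the flow just described, with the driver pulling requests off the queue via elv_next_request() and completing them piecewise, is the long-removed pre-blk-mq interface. A rough sketch of such a legacy request_fn, where my_transfer() is a hypothetical hardware routine and locking/error handling are elided::

    /*
     * Editor's sketch of the legacy (2.5/early-2.6, since removed)
     * request_fn flow described above.  my_transfer() is hypothetical.
     */
    static void my_request_fn(struct request_queue *q)
    {
            struct request *req;

            while ((req = elv_next_request(q)) != NULL) {
                    /* rq->buffer maps the current segment (low memory only) */
                    int uptodate = my_transfer(req->buffer, req->sector,
                                               req->current_nr_sectors);

                    /*
                     * end_request() completes current_nr_sectors and either
                     * advances to the next segment or finishes the request.
                     */
                    end_request(req, uptodate);
            }
    }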
- -The behaviour of the various sector counts is almost the same as before, -except that since we have multi-segment bios, current_nr_sectors refers -to the numbers of sectors in the current segment being processed which could -be one of the many segments in the current bio (i.e i/o completion unit). -The nr_sectors value refers to the total number of sectors in the whole -request that remain to be transferred (no change). The purpose of the -hard_xxx values is for block to remember these counts every time it hands -over the request to the driver. These values are updated by block on -end_that_request_first, i.e. every time the driver completes a part of the -transfer and invokes block end*request helpers to mark this. The -driver should not modify these values. The block layer sets up the -nr_sectors and current_nr_sectors fields (based on the corresponding -hard_xxx values and the number of bytes transferred) and updates it on -every transfer that invokes end_that_request_first. It does the same for the -buffer, bio, bio->bi_iter fields too. - -The buffer field is just a virtual address mapping of the current segment -of the i/o buffer in cases where the buffer resides in low-memory. For high -memory i/o, this field is not valid and must not be used by drivers. - -Code that sets up its own request structures and passes them down to -a driver needs to be careful about interoperation with the block layer helper -functions which the driver uses. (Section 1.3) - -3. Using bios -============= - -3.1 Setup/Teardown ------------------- - -There are routines for managing the allocation, and reference counting, and -freeing of bios (bio_alloc, bio_get, bio_put). - -This makes use of Ingo Molnar's mempool implementation, which enables -subsystems like bio to maintain their own reserve memory pools for guaranteed -deadlock-free allocations during extreme VM load. For example, the VM -subsystem makes use of the block layer to writeout dirty pages in order to be -able to free up memory space, a case which needs careful handling. The -allocation logic draws from the preallocated emergency reserve in situations -where it cannot allocate through normal means. If the pool is empty and it -can wait, then it would trigger action that would help free up memory or -replenish the pool (without deadlocking) and wait for availability in the pool. -If it is in IRQ context, and hence not in a position to do this, allocation -could fail if the pool is empty. In general mempool always first tries to -perform allocation without having to wait, even if it means digging into the -pool as long as it is not less than 50% full. - -On a free, memory is released to the pool or directly freed depending on -the current availability in the pool. The mempool interface lets the -subsystem specify the routines to be used for normal alloc and free. In the -case of bio, these routines make use of the standard slab allocator. - -The caller of bio_alloc is expected to take certain steps to avoid -deadlocks, e.g. avoid trying to allocate more memory from the pool while -already holding memory obtained from the pool. - -:: - - [TBD: This is a potential issue, though a rare possibility - in the bounce bio allocation that happens in the current code, since - it ends up allocating a second bio from the same pool while - holding the original bio ] - -Memory allocated from the pool should be released back within a limited -amount of time (in the case of bio, that would be after the i/o is completed).
-This ensures that if part of the pool has been used up, some work (in this -case i/o) must already be in progress and memory would be available when it -is over. If allocating from multiple pools in the same code path, the order -or hierarchy of allocation needs to be consistent, just the way one deals -with multiple locks. - -The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc()) -for a non-clone bio. There are 6 pools set up for different size biovecs, -so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the -given size from these slabs. - -The bio_get() routine may be used to hold an extra reference on a bio prior -to i/o submission, if the bio fields are likely to be accessed after the -i/o is issued (since the bio may otherwise get freed in case i/o completion -happens in the meantime). - -3.2 Generic bio helper Routines -------------------------------- - -3.2.1 Traversing segments and completion units in a request -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The macro rq_for_each_segment() should be used for traversing the bios -in the request list (drivers should avoid directly trying to do it -themselves). Using these helpers should also make it easier to cope -with block changes in the future. - -:: - - struct req_iterator iter; - rq_for_each_segment(bio_vec, rq, iter) - /* bio_vec is now current segment */ - -I/O completion callbacks are per-bio rather than per-segment, so drivers -that traverse bio chains on completion need to keep that in mind. Drivers -which don't make a distinction between segments and completion units would -need to be reorganized to support multi-segment bios. - -3.2.2 Setting up DMA scatterlists -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The blk_rq_map_sg() helper routine would be used for setting up scatter -gather lists from a request, so a driver need not do it on its own. - - nr_segments = blk_rq_map_sg(q, rq, scatterlist); - -The helper routine provides a level of abstraction which makes it easier -to modify the internals of request to scatterlist conversion down the line -without breaking drivers. The blk_rq_map_sg routine takes care of several -things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER -is set) and correct segment accounting to avoid exceeding the limits which -the i/o hardware can handle, based on various queue properties. - -- Prevents a clustered segment from crossing a 4GB mem boundary -- Avoids building segments that would exceed the number of physical - memory segments that the driver can handle (phys_segments) and the - number that the underlying hardware can handle at once, accounting for - DMA remapping (hw_segments) (i.e. IOMMU aware limits). - -Routines which the low level driver can use to set up the segment limits: - -blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of -hw data segments in a request (i.e. the maximum number of address/length -pairs the host adapter can actually hand to the device at once) - -blk_queue_max_phys_segments() : Sets an upper limit on the maximum number -of physical data segments in a request (i.e. the largest sized scatter list -a driver could handle) - -3.2.3 I/O completion -^^^^^^^^^^^^^^^^^^^^ - -The existing generic block layer helper routines end_request, -end_that_request_first and end_that_request_last can be used for i/o -completion (and setting things up so the rest of the i/o or the next -request can be kicked off) as before.
With the introduction of multi-page -bio support, end_that_request_first requires an additional argument indicating -the number of sectors completed. - -3.2.4 Implications for drivers that do not interpret bios -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -(don't handle multiple segments) - -Drivers that do not interpret bios e.g those which do not handle multiple -segments and do not support i/o into high memory addresses (require bounce -buffers) and expect only virtually mapped buffers, can access the rq->buffer -field. As before the driver should use current_nr_sectors to determine the -size of remaining data in the current segment (that is the maximum it can -transfer in one go unless it interprets segments), and rely on the block layer -end_request, or end_that_request_first/last to take care of all accounting -and transparent mapping of the next bio segment when a segment boundary -is crossed on completion of a transfer. (The end*request* functions should -be used only if the request has come down from block/bio path, not for -direct access requests which only specify rq->buffer without a valid rq->bio) - -3.3 I/O Submission ------------------- - -The routine submit_bio() is used to submit a single io. Higher level i/o -routines make use of this: - -(a) Buffered i/o: - -The routine submit_bh() invokes submit_bio() on a bio corresponding to the -bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before. - -(b) Kiobuf i/o (for raw/direct i/o): - -The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and -maps the array to one or more multi-page bios, issuing submit_bio() to -perform the i/o on each of these. - -The embedded bh array in the kiobuf structure has been removed and no -preallocation of bios is done for kiobufs. [The intent is to remove the -blocks array as well, but it's currently in there to kludge around direct i/o.] -Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc. - -Todo/Observation: - - A single kiobuf structure is assumed to correspond to a contiguous range - of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec. - So right now it wouldn't work for direct i/o on non-contiguous blocks. - This is to be resolved. The eventual direction is to replace kiobuf - by kvec's. - - Badari Pulavarty has a patch to implement direct i/o correctly using - bio and kvec. - - -(c) Page i/o: - -Todo/Under discussion: - - Andrew Morton's multi-page bio patches attempt to issue multi-page - writeouts (and reads) from the page cache, by directly building up - large bios for submission completely bypassing the usage of buffer - heads. This work is still in progress. - - Christoph Hellwig had some code that uses bios for page-io (rather than - bh). This isn't included in bio as yet. Christoph was also working on a - design for representing virtual/real extents as an entity and modifying - some of the address space ops interfaces to utilize this abstraction rather - than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf - abstraction, but intended to be as lightweight as possible). - -(d) Direct access i/o: - -Direct access requests that do not contain bios would be submitted differently - as discussed earlier in section 1.3. - -Aside: - - Kvec i/o: - - Ben LaHaise's aio code uses a slightly different structure instead - of kiobufs, called a kvec_cb. This contains an array of - <page, offset, len> tuples (very much like the networking code), together with a callback function - and data pointer.
This is embedded into a brw_cb structure when passed - to brw_kvec_async(). - - Now it should be possible to directly map these kvecs to a bio. Just as while - cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec - array pointer to point to the veclet array in kvecs. - - TBD: In order for this to work, some changes are needed in the way multi-page - bios are handled today. The values of the tuples in such a vector passed in - from higher level code should not be modified by the block layer in the course - of its request processing, since that would make it hard for the higher layer - to continue to use the vector descriptor (kvec) after i/o completes. Instead, - all such transient state should either be maintained in the request structure, - and passed on in some way to the endio completion routine. - - -4. The I/O scheduler -==================== - -I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch -queue and specific I/O schedulers. Unless stated otherwise, elevator is used -to refer to both parts and I/O scheduler to specific I/O schedulers. - -Block layer implements generic dispatch queue in `block/*.c`. -The generic dispatch queue is responsible for requeueing, handling non-fs -requests and all other subtleties. - -Specific I/O schedulers are responsible for ordering normal filesystem -requests. They can also choose to delay certain requests to improve -throughput or whatever purpose. As the plural form indicates, there are -multiple I/O schedulers. They can be built as modules but at least one should -be built inside the kernel. Each queue can choose different one and can also -change to another one dynamically. - -A block layer call to the i/o scheduler follows the convention elv_xxx(). This -calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx -and xxx might not match exactly, but use your imagination. If an elevator -doesn't implement a function, the switch does nothing or some minimal house -keeping work. - -4.1. I/O scheduler API ----------------------- - -The functions an elevator may implement are: (* are mandatory) - -=============================== ================================================ -elevator_merge_fn called to query requests for merge with a bio - -elevator_merge_req_fn called when two requests get merged. the one - which gets merged into the other one will be - never seen by I/O scheduler again. IOW, after - being merged, the request is gone. - -elevator_merged_fn called when a request in the scheduler has been - involved in a merge. It is used in the deadline - scheduler for example, to reposition the request - if its sorting order has changed. - -elevator_allow_merge_fn called whenever the block layer determines - that a bio can be merged into an existing - request safely. The io scheduler may still - want to stop a merge at this point if it - results in some sort of conflict internally, - this hook allows it to do that. Note however - that two *requests* can still be merged at later - time. Currently the io scheduler has no way to - prevent that. It can only learn about the fact - from elevator_merge_req_fn callback. - -elevator_dispatch_fn* fills the dispatch queue with ready requests. - I/O schedulers are free to postpone requests by - not filling the dispatch queue unless @force - is non-zero. Once dispatched, I/O schedulers - are not allowed to manipulate the requests - - they belong to generic dispatch queue. 
- -elevator_add_req_fn* called to add a new request into the scheduler - -elevator_former_req_fn -elevator_latter_req_fn These return the request before or after the - one specified in disk sort order. Used by the - block layer to find merge possibilities. - -elevator_completed_req_fn called when a request is completed. - -elevator_set_req_fn -elevator_put_req_fn Must be used to allocate and free any elevator - specific storage for a request. - -elevator_activate_req_fn Called when device driver first sees a request. - I/O schedulers can use this callback to - determine when actual execution of a request - starts. -elevator_deactivate_req_fn Called when device driver decides to delay - a request by requeueing it. - -elevator_init_fn* -elevator_exit_fn Allocate and free any elevator specific storage - for a queue. -=============================== ================================================ - -4.2 Request flows seen by I/O schedulers ----------------------------------------- - -All requests seen by I/O schedulers strictly follow one of the following three -flows. - - set_req_fn -> - - i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn -> - (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn - ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn - iii. [none] - - -> put_req_fn - -4.3 I/O scheduler implementation --------------------------------- - -The generic i/o scheduler algorithm attempts to sort/merge/batch requests for -optimal disk scan and request servicing performance (based on generic -principles and device capabilities), optimized for: - -i. improved throughput -ii. improved latency -iii. better utilization of h/w & CPU time - -Characteristics: - -i. Binary tree -AS and deadline i/o schedulers use red black binary trees for disk position -sorting and searching, and a fifo linked list for time-based searching. This -gives good scalability and good availability of information. Requests are -almost always dispatched in disk sort order, so a cache is kept of the next -request in sort order to prevent binary tree lookups. - -This arrangement is not a generic block layer characteristic however, so -elevators may implement queues as they please. - -ii. Merge hash -AS and deadline use a hash table indexed by the last sector of a request. This -enables merging code to quickly look up "back merge" candidates, even when -multiple I/O streams are being performed at once on one disk. - -"Front merges", a new request being merged at the front of an existing request, -are far less common than "back merges" due to the nature of most I/O patterns. -Front merges are handled by the binary trees in AS and deadline schedulers. - -iii. Plugging the queue to batch requests in anticipation of opportunities for - merge/sort optimizations - -Plugging is an approach that the current i/o scheduling algorithm resorts to so -that it collects up enough requests in the queue to be able to take -advantage of the sorting/merging logic in the elevator. If the -queue is empty when a request comes in, then it plugs the request queue -(sort of like plugging the bath tub of a vessel to get fluid to build up) -till it fills up with a few more requests, before starting to service -the requests. This provides an opportunity to merge/sort the requests before -passing them down to the device. There are various conditions when the queue is -unplugged (to open up the flow again), either through a scheduled task or -could be on demand. 
- -Aside: - This is kind of controversial territory, as it's not clear if plugging is - always the right thing to do. Devices typically have their own queues, - and allowing a big queue to build up in software, while letting the device be - idle for a while may not always make sense. The trick is to handle the fine - balance between when to plug and when to open up. Also now that we have - multi-page bios being queued in one shot, we may not need to wait to merge - a big request from the broken up pieces coming by. - -4.4 I/O contexts ----------------- - -I/O contexts provide a dynamically allocated per process data area. They may -be used in I/O schedulers, and in the block layer (could be used for IO statis, -priorities for example). See `*io_context` in block/ll_rw_blk.c, and as-iosched.c -for an example of usage in an i/o scheduler. - - -5. Scalability related changes -============================== - -5.1 Granular Locking: io_request_lock replaced by a per-queue lock ------------------------------------------------------------------- - -The global io_request_lock has been removed as of 2.5, to avoid -the scalability bottleneck it was causing, and has been replaced by more -granular locking. The request queue structure has a pointer to the -lock to be used for that queue. As a result, locking can now be -per-queue, with a provision for sharing a lock across queues if -necessary (e.g the scsi layer sets the queue lock pointers to the -corresponding adapter lock, which results in a per host locking -granularity). The locking semantics are the same, i.e. locking is -still imposed by the block layer, grabbing the lock before -request_fn execution which it means that lots of older drivers -should still be SMP safe. Drivers are free to drop the queue -lock themselves, if required. Drivers that explicitly used the -io_request_lock for serialization need to be modified accordingly. -Usually it's as easy as adding a global lock:: - - static DEFINE_SPINLOCK(my_driver_lock); - -and passing the address to that lock to blk_init_queue(). - -5.2 64 bit sector numbers (sector_t prepares for 64 bit support) ----------------------------------------------------------------- - -The sector number used in the bio structure has been changed to sector_t, -which could be defined as 64 bit in preparation for 64 bit sector support. - -6. Other Changes/Implications -============================= - -6.1 Partition re-mapping handled by the generic block layer ------------------------------------------------------------ - -In 2.5 some of the gendisk/partition related code has been reorganized. -Now the generic block layer performs partition-remapping early and thus -provides drivers with a sector number relative to whole device, rather than -having to take partition number into account in order to arrive at the true -sector number. The routine blk_partition_remap() is invoked by -submit_bio_noacct even before invoking the queue specific ->submit_bio, -so the i/o scheduler also gets to operate on whole disk sector numbers. This -should typically not require changes to block drivers, it just never gets -to invoke its own partition sector offset calculations since all bios -sent are offset from the beginning of the device. - - -7. A Few Tips on Migration of older drivers -=========================================== - -Old-style drivers that just use CURRENT and ignores clustered requests, -may not need much change. The generic layer will automatically handle -clustered requests, multi-page bios, etc for the driver. 
- -For a low performance driver or hardware that is PIO driven or just doesn't -support scatter-gather changes should be minimal too. - -The following are some points to keep in mind when converting old drivers -to bio. - -Drivers should use elv_next_request to pick up requests and are no longer -supposed to handle looping directly over the request list. -(struct request->queue has been removed) - -Now end_that_request_first takes an additional number_of_sectors argument. -It used to handle always just the first buffer_head in a request, now -it will loop and handle as many sectors (on a bio-segment granularity) -as specified. - -Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the -right thing to use is bio_endio(bio) instead. - -If the driver is dropping the io_request_lock from its request_fn strategy, -then it just needs to replace that with q->queue_lock instead. - -As described in Sec 1.1, drivers can set max sector size, max segment size -etc per queue now. Drivers that used to define their own merge functions i -to handle things like this can now just use the blk_queue_* functions at -blk_init_queue time. - -Drivers no longer have to map a {partition, sector offset} into the -correct absolute location anymore, this is done by the block layer, so -where a driver received a request ala this before:: - - rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */ - rq->sector = 0; /* first sector on hda5 */ - -it will now see:: - - rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */ - rq->sector = 123128; /* offset from start of disk */ - -As mentioned, there is no virtual mapping of a bio. For DMA, this is -not a problem as the driver probably never will need a virtual mapping. -Instead it needs a bus mapping (dma_map_page for a single segment or -use dma_map_sg for scatter gather) to be able to ship it to the driver. For -PIO drivers (or drivers that need to revert to PIO transfer once in a -while (IDE for example)), where the CPU is doing the actual data -transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map -a bio into the virtual address space. - - -8. Prior/Related/Impacted patches -================================= - -8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp) ------------------------------------------------------ - -- orig kiobuf & raw i/o patches (now in 2.4 tree) -- direct kiobuf based i/o to devices (no intermediate bh's) -- page i/o using kiobuf -- kiobuf splitting for lvm (mkp) -- elevator support for kiobuf request merging (axboe) - -8.2. Zero-copy networking (Dave Miller) ---------------------------------------- - -8.3. SGI XFS - pagebuf patches - use of kiobufs ------------------------------------------------ -8.4. Multi-page pioent patch for bio (Christoph Hellwig) --------------------------------------------------------- -8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11 --------------------------------------------------------------------- -8.6. Async i/o implementation patch (Ben LaHaise) -------------------------------------------------- -8.7. EVMS layering design (IBM EVMS team) ------------------------------------------ -8.8. Larger page cache size patch (Ben LaHaise) and Large page size (Daniel Phillips) -------------------------------------------------------------------------------------- - - => larger contiguous physical memory buffers - -8.9. VM reservations patch (Ben LaHaise) ----------------------------------------- -8.10. 
Write clustering patches ? (Marcelo/Quintela/Riel ?)
-----------------------------------------------------------
-8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+
----------------------------------------------------------------------------
-8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar, Badari)
--------------------------------------------------------------------------------
-8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven)
-------------------------------------------------------------------
-8.14 IDE Taskfile i/o patch (Andre Hedrick)
---------------------------------------------
-8.15 Multi-page writeout and readahead patches (Andrew Morton)
----------------------------------------------------------------
-8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy)
------------------------------------------------------------------------
-
-9. Other References
-===================
-
-9.1 The Splice I/O Model
-------------------------
-
-Larry McVoy (and subsequent discussions on lkml, and Linus' comments - Jan 2001
-
-9.2 Discussions about kiobuf and bh design
-------------------------------------------
-
-On lkml between sct, linus, alan et al - Feb-March 2001 (many of the
-initial thoughts that led to bio were brought up in this discussion thread)
-
-9.3 Discussions on mempool on lkml - Dec 2001.
-----------------------------------------------
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst
index 3a41495dd77b..68f115f2b1c6 100644
--- a/Documentation/block/index.rst
+++ b/Documentation/block/index.rst
@@ -8,7 +8,6 @@ Block
    :maxdepth: 1
 
    bfq-iosched
-   biodoc
    biovecs
    blk-mq
    capability

From ec9fd2a13d743ae129bd1b5c6edea0d65260ce10 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Tue, 15 Feb 2022 03:52:47 -0800
Subject: [PATCH 055/127] blk-lib: remove redundant bdev_get_queue() NULL
 checks

Based on the comment present in bdev_get_queue(), bdev->bd_queue can
never be NULL. Remove the NULL checks on the local variable q that is
set from bdev_get_queue() for discard, write_same, and write_zeroes.
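
For illustration only (not part of this patch): a hypothetical caller
relying on the same invariant. The function my_issue_discard and its
error handling are made up for this sketch.

    /*
     * bdev->bd_queue is assigned before the device can be opened and
     * is not cleared while a reference to the block device is held,
     * so bdev_get_queue() cannot return NULL here.
     */
    static int my_issue_discard(struct block_device *bdev)
    {
            struct request_queue *q = bdev_get_queue(bdev);

            if (bdev_read_only(bdev))
                    return -EPERM;
            /* q is safe to use directly, no NULL check required */
            return blk_queue_discard(q) ? 0 : -EOPNOTSUPP;
    }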
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220215115247.11717-2-kch@nvidia.com Signed-off-by: Jens Axboe --- block/blk-lib.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 1b8ced45e4e5..fc6ea52e7482 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -19,9 +19,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, unsigned int op; sector_t bs_mask, part_offset = 0; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; @@ -156,9 +153,6 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, struct bio *bio = *biop; sector_t bs_mask; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; @@ -232,10 +226,6 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, { struct bio *bio = *biop; unsigned int max_write_zeroes_sectors; - struct request_queue *q = bdev_get_queue(bdev); - - if (!q) - return -ENXIO; if (bdev_read_only(bdev)) return -EPERM; @@ -284,14 +274,10 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; int bi_size = 0; unsigned int sz; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; From f122d103b564e5fb7c82de902c6f8f6cbdf50ec9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 13 Feb 2022 16:59:02 +0800 Subject: [PATCH 056/127] blk-cgroup: set blkg iostat after percpu stat aggregation Don't need to do blkg_iostat_set for top blkg iostat on each CPU, so move it after percpu stat aggregation. Fixes: ef45fe470e1e ("blk-cgroup: show global disk stats in root cgroup io.stat") Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220213085902.88884-1-zhouchengming@bytedance.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4108d445c73a..fa063c6c0338 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -856,11 +856,11 @@ static void blkcg_fill_root_iostats(void) blk_queue_root_blkg(bdev_get_queue(bdev)); struct blkg_iostat tmp; int cpu; + unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - unsigned long flags; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += @@ -876,11 +876,11 @@ static void blkcg_fill_root_iostats(void) cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; - - flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); - blkg_iostat_set(&blkg->iostat.cur, &tmp); - u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } + + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } } From 248c793359daacd826a7507a258ffe41653efef7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:36 +0100 Subject: [PATCH 057/127] blk-mq: make the blk-mq stacking code optional The code to stack blk-mq drivers is only used by dm-multipath, and will preferably stay that way. Make it optional and only selected by device mapper, so that the buildbots more easily catch abuses like the one that slipped in in the ufs driver in the last merged window. 
Another positive side effect is that kernel builds without device
mapper shrink a little bit as well.

Signed-off-by: Christoph Hellwig
Reviewed-by: Mike Snitzer
Link: https://lore.kernel.org/r/20220215100540.3892965-2-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/Kconfig      | 3 +++
 block/blk-mq.c     | 2 ++
 drivers/md/Kconfig | 1 +
 3 files changed, 6 insertions(+)

diff --git a/block/Kconfig b/block/Kconfig
index 205f8d01c695..168b873eb666 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -230,6 +230,9 @@ config BLK_PM
 config BLOCK_HOLDER_DEPRECATED
 	bool
 
+config BLK_MQ_STACKING
+	bool
+
 source "block/Kconfig.iosched"
 
 endif # BLOCK
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6c59ffe765fd..db62d34afb63 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2840,6 +2840,7 @@ void blk_mq_submit_bio(struct bio *bio)
 		blk_mq_try_issue_directly(rq->mq_hctx, rq));
 }
 
+#ifdef CONFIG_BLK_MQ_STACKING
 /**
  * blk_cloned_rq_check_limits - Helper function to check a cloned request
  * for the new queue limits
@@ -3017,6 +3018,7 @@ free_and_out:
 	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+#endif /* CONFIG_BLK_MQ_STACKING */
 
 /*
  * Steal bios from a request and add them to a bio list.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b5ea378e66cb..998a5cfdbc4e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -204,6 +204,7 @@ config BLK_DEV_DM
 	tristate "Device mapper support"
 	select BLOCK_HOLDER_DEPRECATED if SYSFS
 	select BLK_DEV_DM_BUILTIN
+	select BLK_MQ_STACKING
 	depends on DAX || DAX=n
 	help
 	  Device-mapper is a low level volume manager.  It works by allowing

From a5efda3c46a1db9a579d953667906933d5037bf9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Feb 2022 11:05:37 +0100
Subject: [PATCH 058/127] blk-mq: fold blk_cloned_rq_check_limits into
 blk_insert_cloned_request

Fold blk_cloned_rq_check_limits into its only caller.

Signed-off-by: Christoph Hellwig
Reviewed-by: Mike Snitzer
Link: https://lore.kernel.org/r/20220215100540.3892965-3-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 38 +++++---------------------------------
 1 file changed, 5 insertions(+), 33 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index db62d34afb63..fc132933397f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2842,26 +2842,14 @@ void blk_mq_submit_bio(struct bio *bio)
 
 #ifdef CONFIG_BLK_MQ_STACKING
 /**
- * blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for the new queue limits
- * @q: the queue
- * @rq: the request being checked
- *
- * Description:
- *    @rq may have been made based on weaker limitations of upper-level queues
- *    in request stacking drivers, and it may violate the limitation of @q.
- *    Since the block layer and the underlying device driver trust @rq
- *    after it is inserted to @q, it should be checked against @q before
- *    the insertion using this generic function.
- *
- *    Request stacking drivers like request-based dm may change the queue
- *    limits when retrying requests on other queues. Those requests need
- *    to be checked against the new queue limits again during dispatch.
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* @@ -2893,22 +2881,6 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, return BLK_STS_IOERR; } - return BLK_STS_OK; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - if (rq->q->disk && should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; From 28db4711bf48303814dcfd8d41a41106e90bc374 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:38 +0100 Subject: [PATCH 059/127] blk-mq: remove the request_queue argument to blk_insert_cloned_request The request must be submitted to the queue it was allocated for, so remove the extra request_queue argument. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++----- drivers/md/dm-rq.c | 2 +- include/linux/blk-mq.h | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fc132933397f..886836a54064 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2843,11 +2843,11 @@ void blk_mq_submit_bio(struct bio *bio) #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request * @rq: the request being queued */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +blk_status_t blk_insert_cloned_request(struct request *rq) { + struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); blk_status_t ret; @@ -2881,8 +2881,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return BLK_STS_IOERR; } - if (rq->q->disk && - should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) + if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -2895,7 +2894,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. 
 */
-	blk_mq_run_dispatch_ops(rq->q,
+	blk_mq_run_dispatch_ops(q,
 			ret = blk_mq_request_issue_directly(rq, true));
 	if (ret)
 		blk_account_io_done(rq, ktime_get_ns());
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 579ab6183d4d..2fcc9b7f391b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -311,7 +311,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
 	clone->rq_flags |= RQF_IO_STAT;
 
 	clone->start_time_ns = ktime_get_ns();
-	r = blk_insert_cloned_request(clone->q, clone);
+	r = blk_insert_cloned_request(clone);
 	if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
 		/* must complete clone in terms of original request */
 		dm_complete_request(rq, r);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d319ffa59354..3a41d50b85d3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -952,8 +952,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 		struct bio_set *bs, gfp_t gfp_mask,
 		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
 void blk_rq_unprep_clone(struct request *rq);
-blk_status_t blk_insert_cloned_request(struct request_queue *q,
-		struct request *rq);
+blk_status_t blk_insert_cloned_request(struct request *rq);
 
 struct rq_map_data {
 	struct page **pages;

From 8803c89f365b344859decd5b3074e9bb3b65caa1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Feb 2022 11:05:39 +0100
Subject: [PATCH 060/127] dm: remove useless code from dm_dispatch_clone_request

Both ->start_time_ns and the RQF_IO_STAT flag are already set in
blk_mq_rq_ctx_init when the request is allocated by dm-mpath using
blk_mq_alloc_request. The block layer also ensures ->start_time_ns is
only set when actually needed.

Signed-off-by: Christoph Hellwig
Reviewed-by: Mike Snitzer
Link: https://lore.kernel.org/r/20220215100540.3892965-5-hch@lst.de
Signed-off-by: Jens Axboe
---
 drivers/md/dm-rq.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 2fcc9b7f391b..8f6117342d32 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -307,10 +307,6 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
 {
 	blk_status_t r;
 
-	if (blk_queue_io_stat(clone->q))
-		clone->rq_flags |= RQF_IO_STAT;
-
-	clone->start_time_ns = ktime_get_ns();
 	r = blk_insert_cloned_request(clone);
 	if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
 		/* must complete clone in terms of original request */
 		dm_complete_request(rq, r);

From 9f9adea7187ec1978bd3863f59b7fe27ccf33519 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Feb 2022 11:05:40 +0100
Subject: [PATCH 061/127] dm: remove dm_dispatch_clone_request

Fold dm_dispatch_clone_request into its only caller, and use a switch
statement to dispatch the handling of the different return values from
blk_insert_cloned_request.
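
As a hedged summary of the return-value contract that the new switch
statement encodes (the helper name is illustrative, nothing like it is
added by this patch):

    /*
     * BLK_STS_RESOURCE and BLK_STS_DEV_RESOURCE mean the clone may be
     * requeued and retried later; any other non-OK status is a hard
     * error and the original request has to be completed with it.
     */
    static bool my_status_needs_requeue(blk_status_t ret)
    {
            return ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE;
    }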
Signed-off-by: Christoph Hellwig
Reviewed-by: Mike Snitzer
Link: https://lore.kernel.org/r/20220215100540.3892965-6-hch@lst.de
Signed-off-by: Jens Axboe
---
 drivers/md/dm-rq.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 8f6117342d32..6948d5db9092 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -303,17 +303,6 @@ static void end_clone_request(struct request *clone, blk_status_t error)
 	dm_complete_request(tio->orig, error);
 }
 
-static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
-{
-	blk_status_t r;
-
-	r = blk_insert_cloned_request(clone);
-	if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
-		/* must complete clone in terms of original request */
-		dm_complete_request(rq, r);
-	return r;
-}
-
 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 				 void *data)
 {
@@ -394,13 +383,20 @@ static int map_request(struct dm_rq_target_io *tio)
 	/* The target has remapped the I/O so dispatch it */
 	trace_block_rq_remap(clone, disk_devt(dm_disk(md)),
 			     blk_rq_pos(rq));
-	ret = dm_dispatch_clone_request(clone, rq);
-	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
+	ret = blk_insert_cloned_request(clone);
+	switch (ret) {
+	case BLK_STS_OK:
+		break;
+	case BLK_STS_RESOURCE:
+	case BLK_STS_DEV_RESOURCE:
 		blk_rq_unprep_clone(clone);
 		blk_mq_cleanup_rq(clone);
 		tio->ti->type->release_clone_rq(clone, &tio->info);
 		tio->clone = NULL;
 		return DM_MAPIO_REQUEUE;
+	default:
+		/* must complete clone in terms of original request */
+		dm_complete_request(rq, ret);
 	}
 	break;
 	case DM_MAPIO_REQUEUE:

From a650628bde77f6ac5b1d532092346feff7b58c52 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 16 Feb 2022 12:45:07 +0800
Subject: [PATCH 062/127] block: move submit_bio_checks() into submit_bio_noacct

It is cleaner and more readable to check a bio once, when it is first
submitted, instead of just before calling ->submit_bio() or
blk_mq_submit_bio(). It also gives us a chance to optimize bio
submission without re-checking the bio.

Reviewed-by: Christoph Hellwig
Signed-off-by: Ming Lei
Reviewed-by: Chaitanya Kulkarni
Link: https://lore.kernel.org/r/20220216044514.2903784-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5a4a59041629..d4a023667ac1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -797,9 +797,6 @@ static void __submit_bio(struct bio *bio)
 {
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
 
-	if (unlikely(!submit_bio_checks(bio)))
-		return;
-
 	if (!disk->fops->submit_bio)
 		blk_mq_submit_bio(bio);
 	else
@@ -893,6 +890,9 @@ static void __submit_bio_noacct_mq(struct bio *bio)
  */
 void submit_bio_noacct(struct bio *bio)
 {
+	if (unlikely(!submit_bio_checks(bio)))
+		return;
+
 	/*
 	 * We only want one ->submit_bio to be active at a time, else stack
 	 * usage with stacked devices could be a problem.  Use current->bio_list

From 7f36b7d02a287ed18d02ae821868aa07b0235521 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 16 Feb 2022 12:45:08 +0800
Subject: [PATCH 063/127] block: move blk_crypto_bio_prep() out of blk-mq.c

blk_crypto_bio_prep() is called for both bio based and blk-mq drivers,
so move it out of blk-mq.c so that this kind of handling can be
unified.
Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 21 ++++++++------------- block/blk-mq.c | 3 --- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d4a023667ac1..f03fff1fa391 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -783,24 +783,19 @@ end_io: return false; } -static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) -{ - if (blk_crypto_bio_prep(&bio)) { - if (likely(bio_queue_enter(bio) == 0)) { - disk->fops->submit_bio(bio); - blk_queue_exit(disk->queue); - } - } -} - static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - if (!disk->fops->submit_bio) + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; + + if (!disk->fops->submit_bio) { blk_mq_submit_bio(bio); - else - __submit_bio_fops(disk, bio); + } else if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 886836a54064..7ca0b47246a6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2788,9 +2788,6 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs = 1; blk_status_t ret; - if (unlikely(!blk_crypto_bio_prep(&bio))) - return; - blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); From 29ff23624e21c89d3321d6429dec8ad3847b534a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:09 +0800 Subject: [PATCH 064/127] block: don't declare submit_bio_checks in local header submit_bio_checks() won't be called outside of block/blk-core.c any more since commit 9d497e2941c3 ("block: don't protect submit_bio_checks by q_usage_counter"), so mark it as one local helper. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f03fff1fa391..5248b94d276b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -676,7 +676,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -noinline_for_stack bool submit_bio_checks(struct bio *bio) +static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); diff --git a/block/blk.h b/block/blk.h index abb663a2a147..b2516cb4f98e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -46,7 +46,6 @@ void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); -bool submit_bio_checks(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { From 3f98c753717c600eb5708e9b78b3eba6664bddf1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:10 +0800 Subject: [PATCH 065/127] block: don't check bio in blk_throtl_dispatch_work_fn The bio has been checked already before throttling, so no need to check it again before dispatching it from throttle queue. Add a helper of submit_bio_noacct_nocheck() for this purpose. 
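
For illustration, a sketch of how already-checked bios can be re-driven
with the new helper; resubmit_checked_bios is hypothetical, the pattern
mirrors what blk_throtl_dispatch_work_fn does below.

    /* re-submit bios that already passed submit_bio_checks() once */
    static void resubmit_checked_bios(struct bio_list *list)
    {
            struct blk_plug plug;
            struct bio *bio;

            blk_start_plug(&plug);
            while ((bio = bio_list_pop(list)))
                    submit_bio_noacct_nocheck(bio);
            blk_finish_plug(&plug);
    }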
Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220216044514.2903784-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 30 +++++++++++++++++------------- block/blk-throttle.c | 2 +- block/blk.h | 1 + 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5248b94d276b..72b7b2214c70 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -874,6 +874,22 @@ static void __submit_bio_noacct_mq(struct bio *bio) current->bio_list = NULL; } +void submit_bio_noacct_nocheck(struct bio *bio) +{ + /* + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. Use current->bio_list + * to collect a list of requests submited by a ->submit_bio method while + * it is active, and then process them after it returned. + */ + if (current->bio_list) + bio_list_add(¤t->bio_list[0], bio); + else if (!bio->bi_bdev->bd_disk->fops->submit_bio) + __submit_bio_noacct_mq(bio); + else + __submit_bio_noacct(bio); +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. @@ -887,19 +903,7 @@ void submit_bio_noacct(struct bio *bio) { if (unlikely(!submit_bio_checks(bio))) return; - - /* - * We only want one ->submit_bio to be active at a time, else stack - * usage with stacked devices could be a problem. Use current->bio_list - * to collect a list of requests submited by a ->submit_bio method while - * it is active, and then process them after it returned. - */ - if (current->bio_list) - bio_list_add(¤t->bio_list[0], bio); - else if (!bio->bi_bdev->bd_disk->fops->submit_bio) - __submit_bio_noacct_mq(bio); - else - __submit_bio_noacct(bio); + submit_bio_noacct_nocheck(bio); } EXPORT_SYMBOL(submit_bio_noacct); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 73640d80e99e..8770768f1000 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1218,7 +1218,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct(bio); + submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } diff --git a/block/blk.h b/block/blk.h index b2516cb4f98e..ebaa59ca46ca 100644 --- a/block/blk.h +++ b/block/blk.h @@ -46,6 +46,7 @@ void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); +void submit_bio_noacct_nocheck(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { From d24c670ec1f9f1dc320e59004e61f3491ae24546 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:11 +0800 Subject: [PATCH 066/127] block: merge submit_bio_checks() into submit_bio_noacct Now submit_bio_checks() is only called by submit_bio_noacct(), so merge it into submit_bio_noacct(). 
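
From a stacking driver's point of view nothing changes; a sketch of a
hypothetical bio-based remapping driver shows where the merged checks
now run (struct my_dev and my_submit_bio are illustrative only):

    struct my_dev {
            struct block_device *lower_bdev;
            sector_t data_offset;
    };

    static void my_submit_bio(struct bio *bio)
    {
            struct my_dev *dev = bio->bi_bdev->bd_disk->private_data;

            /* remap and resubmit; the checks folded into
             * submit_bio_noacct() run again for the remapped bio */
            bio_set_dev(bio, dev->lower_bdev);
            bio->bi_iter.bi_sector += dev->data_offset;
            submit_bio_noacct(bio);
    }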
Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 209 +++++++++++++++++++++++------------------------ 1 file changed, 101 insertions(+), 108 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 72b7b2214c70..94bf37f8e61d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -676,113 +676,6 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool submit_bio_checks(struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - struct request_queue *q = bdev_get_queue(bdev); - blk_status_t status = BLK_STS_IOERR; - struct blk_plug *plug; - - might_sleep(); - - plug = blk_mq_plug(q, bio); - if (plug && plug->nowait) - bio->bi_opf |= REQ_NOWAIT; - - /* - * For a REQ_NOWAIT based request, return -EOPNOTSUPP - * if queue does not support NOWAIT. - */ - if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) - goto not_supported; - - if (should_fail_bio(bio)) - goto end_io; - if (unlikely(bio_check_ro(bio))) - goto end_io; - if (!bio_flagged(bio, BIO_REMAPPED)) { - if (unlikely(bio_check_eod(bio))) - goto end_io; - if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) - goto end_io; - } - - /* - * Filter flush bio's early so that bio based drivers without flush - * support don't have to worry about them. - */ - if (op_is_flush(bio->bi_opf) && - !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!bio_sectors(bio)) { - status = BLK_STS_OK; - goto end_io; - } - } - - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio_clear_polled(bio); - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - if (!blk_queue_discard(q)) - goto not_supported; - break; - case REQ_OP_SECURE_ERASE: - if (!blk_queue_secure_erase(q)) - goto not_supported; - break; - case REQ_OP_WRITE_SAME: - if (!q->limits.max_write_same_sectors) - goto not_supported; - break; - case REQ_OP_ZONE_APPEND: - status = blk_check_zone_append(q, bio); - if (status != BLK_STS_OK) - goto end_io; - break; - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_OPEN: - case REQ_OP_ZONE_CLOSE: - case REQ_OP_ZONE_FINISH: - if (!blk_queue_is_zoned(q)) - goto not_supported; - break; - case REQ_OP_ZONE_RESET_ALL: - if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) - goto not_supported; - break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) - goto not_supported; - break; - default: - break; - } - - if (blk_throtl_bio(bio)) - return false; - - blk_cgroup_bio_start(bio); - blkcg_bio_issue_init(bio); - - if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(bio); - /* Now that enqueuing has been traced, we need to trace - * completion as well. 
- */ - bio_set_flag(bio, BIO_TRACE_COMPLETION); - } - return true; - -not_supported: - status = BLK_STS_NOTSUPP; -end_io: - bio->bi_status = status; - bio_endio(bio); - return false; -} - static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -901,9 +794,109 @@ void submit_bio_noacct_nocheck(struct bio *bio) */ void submit_bio_noacct(struct bio *bio) { - if (unlikely(!submit_bio_checks(bio))) + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev_get_queue(bdev); + blk_status_t status = BLK_STS_IOERR; + struct blk_plug *plug; + + might_sleep(); + + plug = blk_mq_plug(q, bio); + if (plug && plug->nowait) + bio->bi_opf |= REQ_NOWAIT; + + /* + * For a REQ_NOWAIT based request, return -EOPNOTSUPP + * if queue does not support NOWAIT. + */ + if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) + goto not_supported; + + if (should_fail_bio(bio)) + goto end_io; + if (unlikely(bio_check_ro(bio))) + goto end_io; + if (!bio_flagged(bio, BIO_REMAPPED)) { + if (unlikely(bio_check_eod(bio))) + goto end_io; + if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) + goto end_io; + } + + /* + * Filter flush bio's early so that bio based drivers without flush + * support don't have to worry about them. + */ + if (op_is_flush(bio->bi_opf) && + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); + if (!bio_sectors(bio)) { + status = BLK_STS_OK; + goto end_io; + } + } + + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + bio_clear_polled(bio); + + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + if (!blk_queue_discard(q)) + goto not_supported; + break; + case REQ_OP_SECURE_ERASE: + if (!blk_queue_secure_erase(q)) + goto not_supported; + break; + case REQ_OP_WRITE_SAME: + if (!q->limits.max_write_same_sectors) + goto not_supported; + break; + case REQ_OP_ZONE_APPEND: + status = blk_check_zone_append(q, bio); + if (status != BLK_STS_OK) + goto end_io; + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + if (!blk_queue_is_zoned(q)) + goto not_supported; + break; + case REQ_OP_ZONE_RESET_ALL: + if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) + goto not_supported; + break; + case REQ_OP_WRITE_ZEROES: + if (!q->limits.max_write_zeroes_sectors) + goto not_supported; + break; + default: + break; + } + + if (blk_throtl_bio(bio)) return; + + blk_cgroup_bio_start(bio); + blkcg_bio_issue_init(bio); + + if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_queue(bio); + /* Now that enqueuing has been traced, we need to trace + * completion as well. + */ + bio_set_flag(bio, BIO_TRACE_COMPLETION); + } submit_bio_noacct_nocheck(bio); + return; + +not_supported: + status = BLK_STS_NOTSUPP; +end_io: + bio->bi_status = status; + bio_endio(bio); } EXPORT_SYMBOL(submit_bio_noacct); From 9f5ede3c01f9951b0ae7d68b28762ad51d9bacc8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:12 +0800 Subject: [PATCH 067/127] block: throttle split bio in case of iops limit Commit 111be8839817 ("block-throttle: avoid double charge") marks bio as BIO_THROTTLED unconditionally if __blk_throtl_bio() is called on this bio, then this bio won't be called into __blk_throtl_bio() any more. This way is to avoid double charge in case of bio splitting. It is reasonable for read/write throughput limit, but not reasonable for IOPS limit because block layer provides io accounting against split bio. 
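
As a worked example (hypothetical numbers): a 1 MiB bio split into four
256 KiB bios completes as four I/Os in the io accounting, so an iops
limit has to charge all four bios, while a bps limit must charge the
1 MiB of data only once. A simplified sketch of the resulting charging
rule (condensed from throtl_charge_bio, not the literal kernel code):

    static void my_charge_bio(struct throtl_grp *tg, struct bio *bio)
    {
            bool rw = bio_data_dir(bio);

            /* bytes: a re-seen (split or cloned) bio was charged before */
            if (!bio_flagged(bio, BIO_THROTTLED))
                    tg->bytes_disp[rw] += throtl_bio_data_size(bio);
            /* iops: every bio counts, split ones included */
            tg->io_disp[rw]++;
    }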
Chunguang Xu has already observed this issue and fixed it in commit 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios"). However, that patch only covers bio splitting in __blk_queue_split(), and we have other kind of bio splitting, such as bio_split() & submit_bio_noacct() and other ways. This patch tries to fix the issue in one generic way by always charging the bio for iops limit in blk_throtl_bio(). This way is reasonable: re-submission & fast-cloned bio is charged if it is submitted to same disk/queue, and BIO_THROTTLED will be cleared if bio->bi_bdev is changed. This new approach can get much more smooth/stable iops limit compared with commit 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios") since that commit can't throttle current split bios actually. Also this way won't cause new double bio iops charge in blk_throtl_dispatch_work_fn() in which blk_throtl_bio() won't be called any more. Reported-by: Ning Li Acked-by: Tejun Heo Cc: Chunguang Xu Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220216044514.2903784-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 -- block/blk-throttle.c | 10 +++++++--- block/blk-throttle.h | 2 -- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 4de34a332c9f..f5255991b773 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -368,8 +368,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; - - blk_throtl_charge_bio_split(*bio); } } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8770768f1000..c7aa26d52e84 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -807,7 +807,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); - if (bps_limit == U64_MAX) { + /* no need to throttle if this bio's bytes have been accounted */ + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { if (wait) *wait = 0; return true; @@ -919,9 +920,12 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio_size; + if (!bio_flagged(bio, BIO_THROTTLED)) { + tg->bytes_disp[rw] += bio_size; + tg->last_bytes_disp[rw] += bio_size; + } + tg->io_disp[rw]++; - tg->last_bytes_disp[rw] += bio_size; tg->last_io_disp[rw]++; /* diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 175f03abd9e4..cb43f4417d6e 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -170,8 +170,6 @@ static inline bool blk_throtl_bio(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); - if (bio_flagged(bio, BIO_THROTTLED)) - return false; if (!tg->has_rules[bio_data_dir(bio)]) return false; From 5a93b6027eb4ef5db60a4bc5bdbeba5fb9f29384 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:13 +0800 Subject: [PATCH 068/127] block: don't try to throttle split bio if iops limit isn't set We need to throttle split bio in case of IOPS limit even though the split bio has been marked as BIO_THROTTLED since block layer accounts split bio actually. If only throughput throttle is setup, no need to throttle any more if BIO_THROTTLED is set since we have accounted & considered the whole bio bytes already. 
Add one flag of THROTL_TG_HAS_IOPS_LIMIT for serving this purpose. Acked-by: Tejun Heo Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220216044514.2903784-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 21 ++++++++++++++------- block/blk-throttle.h | 11 +++++++++++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c7aa26d52e84..ec72eced24d2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -41,11 +41,6 @@ /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -enum tg_state_flags { - THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ - THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ -}; - #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) /* We measure latency for request size from <= 4k to >= 1M */ @@ -425,12 +420,24 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_data *td = tg->td; int rw; + int has_iops_limit = 0; + + for (rw = READ; rw <= WRITE; rw++) { + unsigned int iops_limit = tg_iops_limit(tg, rw); - for (rw = READ; rw <= WRITE; rw++) tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || (td->limit_valid[td->limit_index] && (tg_bps_limit(tg, rw) != U64_MAX || - tg_iops_limit(tg, rw) != UINT_MAX)); + iops_limit != UINT_MAX)); + + if (iops_limit != UINT_MAX) + has_iops_limit = 1; + } + + if (has_iops_limit) + tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; + else + tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; } static void throtl_pd_online(struct blkg_policy_data *pd) diff --git a/block/blk-throttle.h b/block/blk-throttle.h index cb43f4417d6e..c996a15f290e 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -52,6 +52,12 @@ struct throtl_service_queue { struct timer_list pending_timer; /* fires on first_pending_disptime */ }; +enum tg_state_flags { + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ + THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ +}; + enum { LIMIT_LOW, LIMIT_MAX, @@ -170,6 +176,11 @@ static inline bool blk_throtl_bio(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + /* no need to throttle bps any more if the bio has been throttled */ + if (bio_flagged(bio, BIO_THROTTLED) && + !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) + return false; + if (!tg->has_rules[bio_data_dir(bio)]) return false; From 34841e6fb125aa3f0e33e4eaac9f5eb86b2bb34b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:14 +0800 Subject: [PATCH 069/127] block: revert 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios") Revert commit 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios") since we have another easier way to address this issue and get better iops throttling result. 
Acked-by: Tejun Heo Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220216044514.2903784-9-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 28 ---------------------------- block/blk-throttle.h | 5 ----- 2 files changed, 33 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index ec72eced24d2..a3b3ebc72dd4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -640,8 +640,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; - atomic_set(&tg->io_split_cnt[rw], 0); - /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -665,8 +663,6 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; - atomic_set(&tg->io_split_cnt[rw], 0); - throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -900,9 +896,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (iops_limit != UINT_MAX) - tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); - if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -1927,14 +1920,12 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { - tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { - tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2053,25 +2044,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -void blk_throtl_charge_bio_split(struct bio *bio) -{ - struct blkcg_gq *blkg = bio->bi_blkg; - struct throtl_grp *parent = blkg_to_tg(blkg); - struct throtl_service_queue *parent_sq; - bool rw = bio_data_dir(bio); - - do { - if (!parent->has_rules[rw]) - break; - - atomic_inc(&parent->io_split_cnt[rw]); - atomic_inc(&parent->last_io_split_cnt[rw]); - - parent_sq = parent->service_queue.parent_sq; - parent = sq_to_tg(parent_sq); - } while (parent); -} - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index c996a15f290e..b23a9f3abb82 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -138,9 +138,6 @@ struct throtl_grp { unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; - atomic_t io_split_cnt[2]; - atomic_t last_io_split_cnt[2]; - struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -164,13 +161,11 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } -static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } #else /* CONFIG_BLK_DEV_THROTTLING */ int 
blk_throtl_init(struct request_queue *q); void blk_throtl_exit(struct request_queue *q); void blk_throtl_register_queue(struct request_queue *q); -void blk_throtl_charge_bio_split(struct bio *bio); bool __blk_throtl_bio(struct bio *bio); static inline bool blk_throtl_bio(struct bio *bio) { From 76792055c4c8b2472ca1ae48e0ddaf8497529f08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 10:45:10 +0100 Subject: [PATCH 070/127] block: add a ->free_disk method Add a method to notify the driver that the gendisk is about to be freed. This allows drivers to tie the lifetime of their private data to that of the gendisk and thus deal with device removal races without expensive synchronization and boilerplate code. A new flag is added so that ->free_disk is only called after a successful call to add_disk, which significantly simplifies the error handling path during probing. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220215094514.3828912-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++++ include/linux/blkdev.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index 9589d1d59afa..e351fac41bf2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -526,6 +526,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk_update_readahead(disk); disk_add_events(disk); + set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: @@ -1119,6 +1120,10 @@ static void disk_release(struct device *dev) xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; blk_put_queue(disk->queue); + + if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) + disk->fops->free_disk(disk); + iput(disk->part0->bd_inode); /* frees the disk */ } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3bfc75a2a450..f757f9c2871f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -146,6 +146,7 @@ struct gendisk { #define GD_READ_ONLY 1 #define GD_DEAD 2 #define GD_NATIVE_CAPACITY 3 +#define GD_ADDED 4 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ @@ -1464,6 +1465,7 @@ struct block_device_operations { void (*unlock_native_capacity) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); + void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, From e2efa0796607efe60c708271be483c3a2b0128de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 10:45:11 +0100 Subject: [PATCH 071/127] memstick/ms_block: simplify refcounting Implement the ->free_disk method to free the msb_data structure only once the last gendisk reference goes away instead of keeping a local refcount. 
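
As a usage sketch (hypothetical driver, not taken from this series),
tying driver private data to the gendisk lifetime with the new method:

    struct my_disk_data {
            int some_state;         /* lives as long as the gendisk */
    };

    static void my_free_disk(struct gendisk *disk)
    {
            /* called once from disk_release(), only after a successful
             * add_disk() (GD_ADDED), when the last reference is gone */
            kfree(disk->private_data);
    }

    static const struct block_device_operations my_bdops = {
            .owner     = THIS_MODULE,
            .free_disk = my_free_disk,
    };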
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220215094514.3828912-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/memstick/core/ms_block.c | 64 ++++++++------------------------ drivers/memstick/core/ms_block.h | 1 - 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 0cda6c6baefc..3993bdd4b519 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1943,22 +1943,6 @@ static void msb_io_work(struct work_struct *work) static DEFINE_IDR(msb_disk_idr); /*set of used disk numbers */ static DEFINE_MUTEX(msb_disk_lock); /* protects against races in open/release */ -static int msb_bd_open(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - struct msb_data *msb = disk->private_data; - - dbg_verbose("block device open"); - - mutex_lock(&msb_disk_lock); - - if (msb && msb->card) - msb->usage_count++; - - mutex_unlock(&msb_disk_lock); - return 0; -} - static void msb_data_clear(struct msb_data *msb) { kfree(msb->boot_page); @@ -1968,33 +1952,6 @@ static void msb_data_clear(struct msb_data *msb) msb->card = NULL; } -static int msb_disk_release(struct gendisk *disk) -{ - struct msb_data *msb = disk->private_data; - - dbg_verbose("block device release"); - mutex_lock(&msb_disk_lock); - - if (msb) { - if (msb->usage_count) - msb->usage_count--; - - if (!msb->usage_count) { - disk->private_data = NULL; - idr_remove(&msb_disk_idr, msb->disk_id); - put_disk(disk); - kfree(msb); - } - } - mutex_unlock(&msb_disk_lock); - return 0; -} - -static void msb_bd_release(struct gendisk *disk, fmode_t mode) -{ - msb_disk_release(disk); -} - static int msb_bd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { @@ -2003,6 +1960,17 @@ static int msb_bd_getgeo(struct block_device *bdev, return 0; } +static void msb_bd_free_disk(struct gendisk *disk) +{ + struct msb_data *msb = disk->private_data; + + mutex_lock(&msb_disk_lock); + idr_remove(&msb_disk_idr, msb->disk_id); + mutex_unlock(&msb_disk_lock); + + kfree(msb); +} + static blk_status_t msb_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2096,10 +2064,9 @@ static void msb_start(struct memstick_dev *card) } static const struct block_device_operations msb_bdops = { - .open = msb_bd_open, - .release = msb_bd_release, - .getgeo = msb_bd_getgeo, - .owner = THIS_MODULE + .owner = THIS_MODULE, + .getgeo = msb_bd_getgeo, + .free_disk = msb_bd_free_disk, }; static const struct blk_mq_ops msb_mq_ops = { @@ -2147,7 +2114,6 @@ static int msb_init_disk(struct memstick_dev *card) set_capacity(msb->disk, capacity); dbg("Set total disk size to %lu sectors", capacity); - msb->usage_count = 1; msb->io_queue = alloc_ordered_workqueue("ms_block", WQ_MEM_RECLAIM); INIT_WORK(&msb->io_work, msb_io_work); sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1); @@ -2229,7 +2195,7 @@ static void msb_remove(struct memstick_dev *card) msb_data_clear(msb); mutex_unlock(&msb_disk_lock); - msb_disk_release(msb->disk); + put_disk(msb->disk); memstick_set_drvdata(card, NULL); } diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h index 122e1a8a8bd5..7058f9aefeb9 100644 --- a/drivers/memstick/core/ms_block.h +++ b/drivers/memstick/core/ms_block.h @@ -143,7 +143,6 @@ struct ms_boot_page { } __packed; struct msb_data { - unsigned int usage_count; struct memstick_dev *card; struct gendisk *disk; struct request_queue *queue; From 
6dab421bfe06a59bf8f212a72e34673e8acf2018 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 10:45:12 +0100 Subject: [PATCH 072/127] memstick/mspro_block: fix handling of read-only devices Use set_disk_ro to propagate the read-only state to the block layer instead of checking for it in ->open and leaking a reference in case of a read-only device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220215094514.3828912-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/memstick/core/mspro_block.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c0450397b673..7ea312f0840e 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -186,13 +186,8 @@ static int mspro_block_bd_open(struct block_device *bdev, fmode_t mode) mutex_lock(&mspro_block_disk_lock); - if (msb && msb->card) { + if (msb && msb->card) msb->usage_count++; - if ((mode & FMODE_WRITE) && msb->read_only) - rc = -EROFS; - else - rc = 0; - } mutex_unlock(&mspro_block_disk_lock); @@ -1239,6 +1234,9 @@ static int mspro_block_init_disk(struct memstick_dev *card) set_capacity(msb->disk, capacity); dev_dbg(&card->dev, "capacity set %ld\n", capacity); + if (msb->read_only) + set_disk_ro(msb->disk, true); + rc = device_add_disk(&card->dev, msb->disk, NULL); if (rc) goto out_cleanup_disk; From 185ed423d1898ead071c18f6161959cd3cab2dde Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 10:45:13 +0100 Subject: [PATCH 073/127] memstick/mspro_block: simplify refcounting Implement the ->free_disk method to free the msb_data structure only once the last gendisk reference goes away instead of keeping a local refcount. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220215094514.3828912-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/memstick/core/mspro_block.c | 49 +++++------------------------ 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 7ea312f0840e..725ba74ded30 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -133,7 +133,6 @@ struct mspro_devinfo { struct mspro_block_data { struct memstick_dev *card; - unsigned int usage_count; unsigned int caps; struct gendisk *disk; struct request_queue *queue; @@ -178,48 +177,16 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error); /*** Block device ***/ -static int mspro_block_bd_open(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - struct mspro_block_data *msb = disk->private_data; - int rc = -ENXIO; - - mutex_lock(&mspro_block_disk_lock); - - if (msb && msb->card) - msb->usage_count++; - - mutex_unlock(&mspro_block_disk_lock); - - return rc; -} - - -static void mspro_block_disk_release(struct gendisk *disk) +static void mspro_block_bd_free_disk(struct gendisk *disk) { struct mspro_block_data *msb = disk->private_data; int disk_id = MINOR(disk_devt(disk)) >> MSPRO_BLOCK_PART_SHIFT; mutex_lock(&mspro_block_disk_lock); - - if (msb) { - if (msb->usage_count) - msb->usage_count--; - - if (!msb->usage_count) { - kfree(msb); - disk->private_data = NULL; - idr_remove(&mspro_block_disk_idr, disk_id); - put_disk(disk); - } - } - + idr_remove(&mspro_block_disk_idr, disk_id); mutex_unlock(&mspro_block_disk_lock); -} -static void mspro_block_bd_release(struct gendisk *disk, fmode_t mode) -{ - mspro_block_disk_release(disk); + kfree(msb); } static int mspro_block_bd_getgeo(struct block_device *bdev, @@ -235,10 +202,9 @@ static int mspro_block_bd_getgeo(struct block_device *bdev, } static const struct block_device_operations ms_block_bdops = { - .open = mspro_block_bd_open, - .release = mspro_block_bd_release, - .getgeo = mspro_block_bd_getgeo, - .owner = THIS_MODULE + .owner = THIS_MODULE, + .getgeo = mspro_block_bd_getgeo, + .free_disk = mspro_block_bd_free_disk, }; /*** Information ***/ @@ -1221,7 +1187,6 @@ static int mspro_block_init_disk(struct memstick_dev *card) msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT; msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT; msb->disk->fops = &ms_block_bdops; - msb->usage_count = 1; msb->disk->private_data = msb; sprintf(msb->disk->disk_name, "mspblk%d", disk_id); @@ -1339,7 +1304,7 @@ static void mspro_block_remove(struct memstick_dev *card) mspro_block_data_clear(msb); mutex_unlock(&mspro_block_disk_lock); - mspro_block_disk_release(msb->disk); + put_disk(msb->disk); memstick_set_drvdata(card, NULL); } From 24b45e6c25173abcf8d5e82285212b47f2b0f86b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 10:45:14 +0100 Subject: [PATCH 074/127] virtio_blk: simplify refcounting Implement the ->free_disk method to free the virtio_blk structure only once the last gendisk reference goes away instead of keeping a local refcount. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Hajnoczi Acked-by: Michael S. 
Tsirkin Link: https://lore.kernel.org/r/20220215094514.3828912-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 66 ++++++++------------------------------ 1 file changed, 14 insertions(+), 52 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c443cd64fc9b..5c636ca7f1a7 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -69,13 +69,6 @@ struct virtio_blk { /* Process context for config space updates */ struct work_struct config_work; - /* - * Tracks references from block_device_operations open/release and - * virtio_driver probe/remove so this object can be freed once no - * longer in use. - */ - refcount_t refs; - /* What host tells us, plus 2 for header & tailer. */ unsigned int sg_elems; @@ -391,43 +384,6 @@ out: return err; } -static void virtblk_get(struct virtio_blk *vblk) -{ - refcount_inc(&vblk->refs); -} - -static void virtblk_put(struct virtio_blk *vblk) -{ - if (refcount_dec_and_test(&vblk->refs)) { - ida_simple_remove(&vd_index_ida, vblk->index); - mutex_destroy(&vblk->vdev_mutex); - kfree(vblk); - } -} - -static int virtblk_open(struct block_device *bd, fmode_t mode) -{ - struct virtio_blk *vblk = bd->bd_disk->private_data; - int ret = 0; - - mutex_lock(&vblk->vdev_mutex); - - if (vblk->vdev) - virtblk_get(vblk); - else - ret = -ENXIO; - - mutex_unlock(&vblk->vdev_mutex); - return ret; -} - -static void virtblk_release(struct gendisk *disk, fmode_t mode) -{ - struct virtio_blk *vblk = disk->private_data; - - virtblk_put(vblk); -} - /* We provide getgeo only to please some old bootloader/partitioning tools */ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) { @@ -460,11 +416,19 @@ out: return ret; } +static void virtblk_free_disk(struct gendisk *disk) +{ + struct virtio_blk *vblk = disk->private_data; + + ida_simple_remove(&vd_index_ida, vblk->index); + mutex_destroy(&vblk->vdev_mutex); + kfree(vblk); +} + static const struct block_device_operations virtblk_fops = { - .owner = THIS_MODULE, - .open = virtblk_open, - .release = virtblk_release, - .getgeo = virtblk_getgeo, + .owner = THIS_MODULE, + .getgeo = virtblk_getgeo, + .free_disk = virtblk_free_disk, }; static int index_to_minor(int index) @@ -791,8 +755,6 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_free_index; } - /* This reference is dropped in virtblk_remove(). */ - refcount_set(&vblk->refs, 1); mutex_init(&vblk->vdev_mutex); vblk->vdev = vdev; @@ -970,7 +932,7 @@ static void virtblk_remove(struct virtio_device *vdev) flush_work(&vblk->config_work); del_gendisk(vblk->disk); - blk_cleanup_disk(vblk->disk); + blk_cleanup_queue(vblk->disk->queue); blk_mq_free_tag_set(&vblk->tag_set); mutex_lock(&vblk->vdev_mutex); @@ -986,7 +948,7 @@ static void virtblk_remove(struct virtio_device *vdev) mutex_unlock(&vblk->vdev_mutex); - virtblk_put(vblk); + put_disk(vblk->disk); } #ifdef CONFIG_PM_SLEEP From 8f5fea65b06de1cc51d4fc23fb4d378d1abd6ed7 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 31 Jan 2022 15:33:37 -0500 Subject: [PATCH 075/127] blk-mq: avoid extending delays of active hctx from blk_mq_delay_run_hw_queues When blk_mq_delay_run_hw_queues sets an hctx to run in the future, it can reset the delay length for an already pending delayed work run_work. This creates a scenario where multiple hctx may have their queues set to run, but if one runs first and finds nothing to do, it can reset the delay of another hctx and stall the other hctx's ability to run requests. 
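The workqueue mechanics behind the stall can be sketched outside of blk-mq (a hypothetical, self-contained example; rerun_work and queue_rerun are made-up names, not the blk-mq code itself). Re-arming a pending delayed work resets its timer and pushes the deadline out, so the guard is to leave an already armed timer alone:

#include <linux/workqueue.h>

static void rerun_fn(struct work_struct *work)
{
	/* dispatch whatever is queued by now */
}
static DECLARE_DELAYED_WORK(rerun_work, rerun_fn);

static void queue_rerun(unsigned long msecs)
{
	/* mod_delayed_work() on a pending item resets its timer... */
	if (delayed_work_pending(&rerun_work))
		return;		/* ...so keep the earlier deadline instead */
	mod_delayed_work(system_wq, &rerun_work, msecs_to_jiffies(msecs));
}

This mirrors the fix below: blk_mq_delay_run_hw_queues() now skips any hctx whose run_work is already pending instead of re-delaying it.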
To avoid this I/O stall when an hctx's run_work is already pending, leave it untouched to run at its current designated time rather than extending its delay. The work will still run, which keeps closed the race that the call to blk_mq_delay_run_hw_queues is needed for, while also avoiding the I/O stall. Signed-off-by: David Jeffery Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220131203337.GA17666@redhat Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7ca0b47246a6..a05ce7725031 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2179,6 +2179,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; + /* + * If there is already a run_work pending, leave the + * pending delay untouched. Otherwise, a hctx can stall + * if another hctx is re-delaying the other's work + * before the work executes. + */ + if (delayed_work_pending(&hctx->run_work)) + continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the From bcd2be763252f3a4d5fc4d6008d4d96c601ee74b Mon Sep 17 00:00:00 2001 From: Yahu Gao Date: Fri, 7 Jan 2022 14:58:59 +0800 Subject: [PATCH 076/127] block/bfq_wf2q: correct weight to ioprio The return value is ioprio * BFQ_WEIGHT_CONVERSION_COEFF or 0. What we want is ioprio or 0. Correct this by changing the calculation. Signed-off-by: Yahu Gao Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220107065859.25689-1-gaoyahu19@gmail.com Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index b74cc0da118e..709b901de3ca 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -519,7 +519,7 @@ unsigned short bfq_ioprio_to_weight(int ioprio) static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS - weight / BFQ_WEIGHT_CONVERSION_COEFF); } static void bfq_get_entity(struct bfq_entity *entity) From 43a4b1fee098bd38eed9c334d0e0df221ecdf719 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 29 Jan 2022 09:59:22 +0800 Subject: [PATCH 077/127] block, bfq: cleanup bfq_bfqq_to_bfqg() Use bfq_group() instead, which does the same thing.
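As a quick check of the bfq_weight_to_ioprio() correction in the previous patch, assume IOPRIO_NR_LEVELS == 8 and BFQ_WEIGHT_CONVERSION_COEFF == 10 (the values in the ioprio/bfq headers; treat them here as stated assumptions for illustration):

/*
 * bfq_ioprio_to_weight(ioprio) = (IOPRIO_NR_LEVELS - ioprio) *
 *				  BFQ_WEIGHT_CONVERSION_COEFF
 * so ioprio 4 maps to weight (8 - 4) * 10 = 40.
 *
 * old inverse: max(0, 8 * 10 - 40) = 40  -- still a weight, not an ioprio
 * new inverse: max(0, 8 - 40 / 10) = 4   -- round-trips back to the ioprio
 */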
Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220129015924.3958918-2-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- block/bfq-iosched.h | 1 - block/bfq-wf2q.c | 15 --------------- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0c612a911696..2f2b97cad980 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (!bfqq->next_rq) return; - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); if (!__bfqq) { @@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, struct bfq_queue *bfqq, sector_t sector) { - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree; struct rb_node *parent, *node; struct bfq_queue *__bfqq; diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 72255ec44f8f..3b83e3d1c2e5 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1050,7 +1050,6 @@ extern struct blkcg_policy blkcg_policy_bfq; for (parent = NULL; entity ; entity = parent) #endif /* CONFIG_BFQ_GROUP_IOSCHED */ -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 709b901de3ca..f8eb340381cf 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, #ifdef CONFIG_BFQ_GROUP_IOSCHED -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - /* * Returns true if this budget changes may let next_in_service->parent * become the next_in_service entity for its parent entity. @@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) #else /* CONFIG_BFQ_GROUP_IOSCHED */ -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { return false; From c5e4cb0fcbbaa5ad853818c4a2383e9bd147fad6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 29 Jan 2022 09:59:23 +0800 Subject: [PATCH 078/127] block, bfq: avoid moving bfqq to its parent bfqg Moving bfqq to its parent bfqg is pointless; this can happen when the root group is offlined, so bail out early in that case.
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20220129015924.3958918-3-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 24a5c5329bcd..9783c1156159 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -645,6 +645,14 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; + struct bfq_group *old_parent = bfqq_group(bfqq); + + /* + * No point to move bfqq to the same group, which can happen when + * root group is offlined + */ + if (old_parent == bfqg) + return; /* * Get extra reference to prevent bfqq from being freed in @@ -666,7 +674,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_and_blkg_put(bfqq_group(bfqq)); + bfqg_and_blkg_put(old_parent); if (entity->parent && entity->parent->last_bfqq_created == bfqq) From 8410f70977734f21b8ed45c37e925d311dfda2e7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 29 Jan 2022 09:59:24 +0800 Subject: [PATCH 079/127] block, bfq: don't move oom_bfqq Our test reports a UAF: [ 2073.019181] ================================================================== [ 2073.019188] BUG: KASAN: use-after-free in __bfq_put_async_bfqq+0xa0/0x168 [ 2073.019191] Write of size 8 at addr ffff8000ccf64128 by task rmmod/72584 [ 2073.019192] [ 2073.019196] CPU: 0 PID: 72584 Comm: rmmod Kdump: loaded Not tainted 4.19.90-yk #5 [ 2073.019198] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 2073.019200] Call trace: [ 2073.019203] dump_backtrace+0x0/0x310 [ 2073.019206] show_stack+0x28/0x38 [ 2073.019210] dump_stack+0xec/0x15c [ 2073.019216] print_address_description+0x68/0x2d0 [ 2073.019220] kasan_report+0x238/0x2f0 [ 2073.019224] __asan_store8+0x88/0xb0 [ 2073.019229] __bfq_put_async_bfqq+0xa0/0x168 [ 2073.019233] bfq_put_async_queues+0xbc/0x208 [ 2073.019236] bfq_pd_offline+0x178/0x238 [ 2073.019240] blkcg_deactivate_policy+0x1f0/0x420 [ 2073.019244] bfq_exit_queue+0x128/0x178 [ 2073.019249] blk_mq_exit_sched+0x12c/0x160 [ 2073.019252] elevator_exit+0xc8/0xd0 [ 2073.019256] blk_exit_queue+0x50/0x88 [ 2073.019259] blk_cleanup_queue+0x228/0x3d8 [ 2073.019267] null_del_dev+0xfc/0x1e0 [null_blk] [ 2073.019274] null_exit+0x90/0x114 [null_blk] [ 2073.019278] __arm64_sys_delete_module+0x358/0x5a0 [ 2073.019282] el0_svc_common+0xc8/0x320 [ 2073.019287] el0_svc_handler+0xf8/0x160 [ 2073.019290] el0_svc+0x10/0x218 [ 2073.019291] [ 2073.019294] Allocated by task 14163: [ 2073.019301] kasan_kmalloc+0xe0/0x190 [ 2073.019305] kmem_cache_alloc_node_trace+0x1cc/0x418 [ 2073.019308] bfq_pd_alloc+0x54/0x118 [ 2073.019313] blkcg_activate_policy+0x250/0x460 [ 2073.019317] bfq_create_group_hierarchy+0x38/0x110 [ 2073.019321] bfq_init_queue+0x6d0/0x948 [ 2073.019325] blk_mq_init_sched+0x1d8/0x390 [ 2073.019330] elevator_switch_mq+0x88/0x170 [ 2073.019334] elevator_switch+0x140/0x270 [ 2073.019338] elv_iosched_store+0x1a4/0x2a0 [ 2073.019342] queue_attr_store+0x90/0xe0 [ 2073.019348] sysfs_kf_write+0xa8/0xe8 [ 2073.019351] kernfs_fop_write+0x1f8/0x378 [ 2073.019359] __vfs_write+0xe0/0x360 [ 2073.019363] vfs_write+0xf0/0x270 [ 2073.019367] ksys_write+0xdc/0x1b8 [ 2073.019371] __arm64_sys_write+0x50/0x60 [ 2073.019375] el0_svc_common+0xc8/0x320 [ 2073.019380]
el0_svc_handler+0xf8/0x160 [ 2073.019383] el0_svc+0x10/0x218 [ 2073.019385] [ 2073.019387] Freed by task 72584: [ 2073.019391] __kasan_slab_free+0x120/0x228 [ 2073.019394] kasan_slab_free+0x10/0x18 [ 2073.019397] kfree+0x94/0x368 [ 2073.019400] bfqg_put+0x64/0xb0 [ 2073.019404] bfqg_and_blkg_put+0x90/0xb0 [ 2073.019408] bfq_put_queue+0x220/0x228 [ 2073.019413] __bfq_put_async_bfqq+0x98/0x168 [ 2073.019416] bfq_put_async_queues+0xbc/0x208 [ 2073.019420] bfq_pd_offline+0x178/0x238 [ 2073.019424] blkcg_deactivate_policy+0x1f0/0x420 [ 2073.019429] bfq_exit_queue+0x128/0x178 [ 2073.019433] blk_mq_exit_sched+0x12c/0x160 [ 2073.019437] elevator_exit+0xc8/0xd0 [ 2073.019440] blk_exit_queue+0x50/0x88 [ 2073.019443] blk_cleanup_queue+0x228/0x3d8 [ 2073.019451] null_del_dev+0xfc/0x1e0 [null_blk] [ 2073.019459] null_exit+0x90/0x114 [null_blk] [ 2073.019462] __arm64_sys_delete_module+0x358/0x5a0 [ 2073.019467] el0_svc_common+0xc8/0x320 [ 2073.019471] el0_svc_handler+0xf8/0x160 [ 2073.019474] el0_svc+0x10/0x218 [ 2073.019475] [ 2073.019479] The buggy address belongs to the object at ffff8000ccf63f00 which belongs to the cache kmalloc-1024 of size 1024 [ 2073.019484] The buggy address is located 552 bytes inside of 1024-byte region [ffff8000ccf63f00, ffff8000ccf64300) [ 2073.019486] The buggy address belongs to the page: [ 2073.019492] page:ffff7e000333d800 count:1 mapcount:0 mapping:ffff8000c0003a00 index:0x0 compound_mapcount: 0 [ 2073.020123] flags: 0x7ffff0000008100(slab|head) [ 2073.020403] raw: 07ffff0000008100 ffff7e0003334c08 ffff7e00001f5a08 ffff8000c0003a00 [ 2073.020409] raw: 0000000000000000 00000000001c001c 00000001ffffffff 0000000000000000 [ 2073.020411] page dumped because: kasan: bad access detected [ 2073.020412] [ 2073.020414] Memory state around the buggy address: [ 2073.020420] ffff8000ccf64000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2073.020424] ffff8000ccf64080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2073.020428] >ffff8000ccf64100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2073.020430] ^ [ 2073.020434] ffff8000ccf64180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2073.020438] ffff8000ccf64200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2073.020439] ================================================================== The same problem exists in mainline as well. This is because oom_bfqq is moved to a non-root group, and thus root_group is freed earlier. Fix the problem by not moving oom_bfqq. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220129015924.3958918-4-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9783c1156159..420eda2589c0 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -654,6 +654,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (old_parent == bfqg) return; + /* + * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group + * until elevator exit. + */ + if (bfqq == &bfqd->oom_bfqq) + return; /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. From 86d7331299fda7634b11c1b7c911432679d525a5 Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Thu, 17 Feb 2022 14:42:47 +0800 Subject: [PATCH 080/127] block: update io_ticks when io hang When the inflight IOs are slow and no new IOs are issued, we expect iostat to manifest the IO hang problem.
However, after commit 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting"), io_ticks and time_in_queue are not updated until the end of the IO, so the avgqu-sz and %util columns of iostat read zero while the IO hangs. time_in_queue is derived from the stat.nsecs accumulation, which is not suitable to change, and %util expresses the hung state better anyway, so only io_ticks is fixed: use update_io_ticks and the inflight count to bring io_ticks up to date whenever diskstats_show or part_stat_show is called. Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Zhang Wensheng Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220217064247.4041435-1-zhangwensheng5@huawei.com Signed-off-by: Jens Axboe --- block/genhd.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index e351fac41bf2..54f60ded2ee6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -919,12 +919,17 @@ ssize_t part_stat_show(struct device *dev, struct disk_stats stat; unsigned int inflight; - part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) inflight = blk_mq_in_flight(q, bdev); else inflight = part_in_flight(bdev); + if (inflight) { + part_stat_lock(); + update_io_ticks(bdev, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(bdev, &stat); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1184,12 +1189,17 @@ static int diskstats_show(struct seq_file *seqf, void *v) xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else inflight = part_in_flight(hd); + if (inflight) { + part_stat_lock(); + update_io_ticks(hd, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(hd, &stat); seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " From 451f0b6f4c44d7b649ae609157b114b71f6d7875 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Feb 2022 19:14:40 +0100 Subject: [PATCH 081/127] block: default BLOCK_LEGACY_AUTOLOAD to y As Luis reported, losetup currently doesn't properly create the loop device without this if the device node already exists because old scripts created it manually. So default to y for now and remove the aggressive removal schedule. Reported-by: Luis Chamberlain Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220225181440.1351591-1-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 8 +++----- block/bdev.c | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 168b873eb666..7eb5d6d53b3f 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -28,15 +28,13 @@ if BLOCK config BLOCK_LEGACY_AUTOLOAD bool "Legacy autoloading support" + default y help Enable loading modules and creating block device instances based on accesses through their device special file. This is a historic Linux feature and makes no sense in a udev world where device files are - created on demand. - - Say N here unless booting or other functionality broke without it, in - which case you should also send a report to your distribution and - linux-block@vger.kernel.org. + created on demand, but scripts that manually create device nodes and + then call losetup might rely on this behavior.
config BLK_RQ_ALLOC_TIME bool diff --git a/block/bdev.c b/block/bdev.c index c68772644566..a3632317c8aa 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -738,7 +738,7 @@ struct block_device *blkdev_get_no_open(dev_t dev) inode = ilookup(blockdev_superblock, dev); if (inode) pr_warn_ratelimited( -"block device autoloading is deprecated. It will be removed in Linux 5.19\n"); +"block device autoloading is deprecated and will be removed.\n"); } if (!inode) return NULL; From 483546c11d702fd6f74c8c3f8123b7672def10b2 Mon Sep 17 00:00:00 2001 From: Nian Yanchuan Date: Mon, 28 Feb 2022 01:01:24 +0800 Subject: [PATCH 082/127] block: remove redundant semicolon Remove redundant semicolon from block/bdev.c Signed-off-by: Nian Yanchuan Link: https://lore.kernel.org/r/20220227170124.GA14658@localhost.localdomain Signed-off-by: Jens Axboe --- block/bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index a3632317c8aa..ce8de42a89be 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -678,7 +678,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; - return 0;; + return 0; } static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) From f5ec592dd3bcf7c91f7c262a7f5011e001d269cd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Jan 2022 13:59:36 -0800 Subject: [PATCH 083/127] block: simplify calling convention of elv_unregister_queue() Make elv_unregister_queue() a no-op if q->elevator is NULL or is not registered. This simplifies the existing callers, as well as the future caller in the error path of blk_register_queue(). Also don't bother checking whether q is NULL, since it never is. Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124215938.2769-2-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +-- block/elevator.c | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4c6b7dff71e5..49854c7ea29b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -960,8 +960,7 @@ void blk_unregister_queue(struct gendisk *disk) blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); - if (q->elevator) - elv_unregister_queue(q); + elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/elevator.c b/block/elevator.c index 6847ab6e7aa5..a842e4b8ebc6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -516,9 +516,11 @@ int elv_register_queue(struct request_queue *q, bool uevent) void elv_unregister_queue(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + lockdep_assert_held(&q->sysfs_lock); - if (q) { + if (e && e->registered) { struct elevator_queue *e = q->elevator; kobject_uevent(&e->kobj, KOBJ_REMOVE); @@ -593,9 +595,7 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock); if (q->elevator) { - if (q->elevator->registered) - elv_unregister_queue(q); - + elv_unregister_queue(q); ioc_clear_queue(q); blk_mq_sched_free_rqs(q); elevator_exit(q); From 0f69288253e9fc7c495047720e523b9f1aba5712 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Jan 2022 13:59:37 -0800 Subject: [PATCH 084/127] block: don't delete queue kobject before its children kobjects aren't supposed to be 
deleted before their child kobjects are deleted. Apparently this is usually benign; however, a WARN will be triggered if one of the child kobjects has a named attribute group: sysfs group 'modes' not found for kobject 'crypto' WARNING: CPU: 0 PID: 1 at fs/sysfs/group.c:278 sysfs_remove_group+0x72/0x80 ... Call Trace: sysfs_remove_groups+0x29/0x40 fs/sysfs/group.c:312 __kobject_del+0x20/0x80 lib/kobject.c:611 kobject_cleanup+0xa4/0x140 lib/kobject.c:696 kobject_release lib/kobject.c:736 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x53/0x70 lib/kobject.c:753 blk_crypto_sysfs_unregister+0x10/0x20 block/blk-crypto-sysfs.c:159 blk_unregister_queue+0xb0/0x110 block/blk-sysfs.c:962 del_gendisk+0x117/0x250 block/genhd.c:610 Fix this by moving the kobject_del() and the corresponding kobject_uevent() to the correct place. Fixes: 2c2086afc2b8 ("block: Protect less code with sysfs_lock in blk_{un,}register_queue()") Reviewed-by: Hannes Reinecke Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124215938.2769-3-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 49854c7ea29b..b38ef1bbf389 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -954,15 +954,17 @@ void blk_unregister_queue(struct gendisk *disk) */ if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); - - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); + + /* Now that we've deleted all child objects, we can delete the queue. */ + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); + mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); From 20f01f163203666010ee1560852590a0c0572726 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Jan 2022 13:59:38 -0800 Subject: [PATCH 085/127] blk-crypto: show crypto capabilities in sysfs Add sysfs files that expose the inline encryption capabilities of request queues: /sys/block/$disk/queue/crypto/max_dun_bits /sys/block/$disk/queue/crypto/modes/$mode /sys/block/$disk/queue/crypto/num_keyslots Userspace can use these new files to decide what encryption settings to use, or whether to use inline encryption at all. This also brings the crypto capabilities in line with the other queue properties, which are already discoverable via the queue directory in sysfs. Design notes: - Place the new files in a new subdirectory "crypto" to group them together and to avoid complicating the main "queue" directory. This also makes it possible to replace "crypto" with a symlink later if we ever make the blk_crypto_profiles into real kobjects (see below). - It was necessary to define a new kobject that corresponds to the crypto subdirectory. For now, this kobject just contains a pointer to the blk_crypto_profile. Note that multiple queues (and hence multiple such kobjects) may refer to the same blk_crypto_profile. An alternative design would more closely match the current kernel data structures: the blk_crypto_profile could be a kobject itself, located directly under the host controller device's kobject, while /sys/block/$disk/queue/crypto would be a symlink to it. 
I decided not to do that for now because it would require a lot more changes, such as no longer embedding blk_crypto_profile in other structures, and also because I'm not sure we can rule out moving the crypto capabilities into 'struct queue_limits' in the future. (Even if multiple queues share the same crypto engine, maybe the supported data unit sizes could differ due to other queue properties.) It would also still be possible to switch to that design later without breaking userspace, by replacing the directory with a symlink. - Use "max_dun_bits" instead of "max_dun_bytes". Currently, the kernel internally stores this value in bytes, but that's an implementation detail. It probably makes more sense to talk about this value in bits, and choosing bits is more future-proof. - "modes" is a sub-subdirectory, since there may be multiple supported crypto modes, sysfs is supposed to have one value per file, and it makes sense to group all the mode files together. - Each mode had to be named. The crypto API names like "xts(aes)" are not appropriate because they don't specify the key size. Therefore, I assigned new names. The exact names chosen are arbitrary, but they happen to match the names used in log messages in fs/crypto/. - The "num_keyslots" file is a bit different from the others in that it is only useful to know for performance reasons. However, it's included as it can still be useful. For example, a user might not want to use inline encryption if there aren't very many keyslots. Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220124215938.2769-4-ebiggers@kernel.org Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 49 ++++++++ block/Makefile | 3 +- block/blk-crypto-internal.h | 12 ++ block/blk-crypto-sysfs.c | 172 +++++++++++++++++++++++++++ block/blk-crypto.c | 3 + block/blk-sysfs.c | 6 + include/linux/blkdev.h | 1 + 7 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 block/blk-crypto-sysfs.c diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 8dd3e84a8aad..e8797cd09aff 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -155,6 +155,55 @@ Description: last zone of the device which may be smaller. +What: /sys/block//queue/crypto/ +Date: February 2022 +Contact: linux-block@vger.kernel.org +Description: + The presence of this subdirectory of /sys/block//queue/ + indicates that the device supports inline encryption. This + subdirectory contains files which describe the inline encryption + capabilities of the device. For more information about inline + encryption, refer to Documentation/block/inline-encryption.rst. + + +What: /sys/block//queue/crypto/max_dun_bits +Date: February 2022 +Contact: linux-block@vger.kernel.org +Description: + [RO] This file shows the maximum length, in bits, of data unit + numbers accepted by the device in inline encryption requests. + + +What: /sys/block//queue/crypto/modes/ +Date: February 2022 +Contact: linux-block@vger.kernel.org +Description: + [RO] For each crypto mode (i.e., encryption/decryption + algorithm) the device supports with inline encryption, a file + will exist at this location. It will contain a hexadecimal + number that is a bitmask of the supported data unit sizes, in + bytes, for that crypto mode. 
+ + Currently, the crypto modes that may be supported are: + + * AES-256-XTS + * AES-128-CBC-ESSIV + * Adiantum + + For example, if a device supports AES-256-XTS inline encryption + with data unit sizes of 512 and 4096 bytes, the file + /sys/block//queue/crypto/modes/AES-256-XTS will exist and + will contain "0x1200". + + +What: /sys/block//queue/crypto/num_keyslots +Date: February 2022 +Contact: linux-block@vger.kernel.org +Description: + [RO] This file shows the number of keyslots the device has for + use with inline encryption. + + What: /sys/block//queue/dax Date: June 2016 Contact: linux-block@vger.kernel.org diff --git a/block/Makefile b/block/Makefile index f38eaa612929..3950ecbc5c26 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ + blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 2fb0d65a464c..e6818ffaddbf 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -11,6 +11,7 @@ /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { + const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ @@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION +int blk_crypto_sysfs_register(struct request_queue *q); + +void blk_crypto_sysfs_unregister(struct request_queue *q); + void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -62,6 +67,13 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) #else /* CONFIG_BLK_INLINE_ENCRYPTION */ +static inline int blk_crypto_sysfs_register(struct request_queue *q) +{ + return 0; +} + +static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { } + static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c new file mode 100644 index 000000000000..fd93bd2f33b7 --- /dev/null +++ b/block/blk-crypto-sysfs.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + * + * sysfs support for blk-crypto. This file contains the code which exports the + * crypto capabilities of devices via /sys/block/$disk/queue/crypto/. 
+ */ + +#include + +#include "blk-crypto-internal.h" + +struct blk_crypto_kobj { + struct kobject kobj; + struct blk_crypto_profile *profile; +}; + +struct blk_crypto_attr { + struct attribute attr; + ssize_t (*show)(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page); +}; + +static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) +{ + return container_of(kobj, struct blk_crypto_kobj, kobj)->profile; +} + +static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) +{ + return container_of(attr, struct blk_crypto_attr, attr); +} + +static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported); +} + +static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", profile->num_slots); +} + +#define BLK_CRYPTO_RO_ATTR(_name) \ + static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) + +BLK_CRYPTO_RO_ATTR(max_dun_bits); +BLK_CRYPTO_RO_ATTR(num_keyslots); + +static struct attribute *blk_crypto_attrs[] = { + &max_dun_bits_attr.attr, + &num_keyslots_attr.attr, + NULL, +}; + +static const struct attribute_group blk_crypto_attr_group = { + .attrs = blk_crypto_attrs, +}; + +/* + * The encryption mode attributes. To avoid hard-coding the list of encryption + * modes, these are initialized at boot time by blk_crypto_sysfs_init(). + */ +static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; +static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; + +static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + int mode_num = a - __blk_crypto_mode_attrs; + + if (profile->modes_supported[mode_num]) + return 0444; + return 0; +} + +static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + int mode_num = attr - __blk_crypto_mode_attrs; + + return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]); +} + +static const struct attribute_group blk_crypto_modes_attr_group = { + .name = "modes", + .attrs = blk_crypto_mode_attrs, + .is_visible = blk_crypto_mode_is_visible, +}; + +static const struct attribute_group *blk_crypto_attr_groups[] = { + &blk_crypto_attr_group, + &blk_crypto_modes_attr_group, + NULL, +}; + +static ssize_t blk_crypto_attr_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + return a->show(profile, a, page); +} + +static const struct sysfs_ops blk_crypto_attr_ops = { + .show = blk_crypto_attr_show, +}; + +static void blk_crypto_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct blk_crypto_kobj, kobj)); +} + +static struct kobj_type blk_crypto_ktype = { + .default_groups = blk_crypto_attr_groups, + .sysfs_ops = &blk_crypto_attr_ops, + .release = blk_crypto_release, +}; + +/* + * If the request_queue has a blk_crypto_profile, create the "crypto" + * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). 
+ */ +int blk_crypto_sysfs_register(struct request_queue *q) +{ + struct blk_crypto_kobj *obj; + int err; + + if (!q->crypto_profile) + return 0; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + obj->profile = q->crypto_profile; + + err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj, + "crypto"); + if (err) { + kobject_put(&obj->kobj); + return err; + } + q->crypto_kobject = &obj->kobj; + return 0; +} + +void blk_crypto_sysfs_unregister(struct request_queue *q) +{ + kobject_put(q->crypto_kobject); +} + +static int __init blk_crypto_sysfs_init(void) +{ + int i; + + BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); + for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) { + struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i]; + + attr->attr.name = blk_crypto_modes[i].name; + attr->attr.mode = 0444; + attr->show = blk_crypto_mode_show; + blk_crypto_mode_attrs[i - 1] = &attr->attr; + } + return 0; +} +subsys_initcall(blk_crypto_sysfs_init); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 773dae4c329b..a496aaef85ba 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -19,16 +19,19 @@ const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b38ef1bbf389..241ded62f458 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -880,6 +880,10 @@ int blk_register_queue(struct gendisk *disk) goto put_dev; } + ret = blk_crypto_sysfs_register(q); + if (ret) + goto put_dev; + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); @@ -910,6 +914,7 @@ unlock: return ret; put_dev: + elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); @@ -954,6 +959,7 @@ void blk_unregister_queue(struct gendisk *disk) */ if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); + blk_crypto_sysfs_unregister(q); blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f757f9c2871f..e19947d84f12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -413,6 +413,7 @@ struct request_queue { #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; + struct kobject *crypto_kobject; #endif unsigned int rq_timeout; From ab552fcb17cc9e4afe0e4ac4df95fc7b30e8490a Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Thu, 3 Mar 2022 15:03:34 +0800 Subject: [PATCH 086/127] bfq: fix use-after-free in bfq_dispatch_request KASAN reports a use-after-free report when doing normal scsi-mq test [69832.239032] ================================================================== [69832.241810] BUG: KASAN: use-after-free in bfq_dispatch_request+0x1045/0x44b0 [69832.243267] Read of size 8 at addr ffff88802622ba88 by task kworker/3:1H/155 [69832.244656] [69832.245007] CPU: 3 PID: 155 Comm: kworker/3:1H Not tainted 5.10.0-10295-g576c6382529e #8 [69832.246626] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [69832.249069] Workqueue: kblockd 
blk_mq_run_work_fn [69832.250022] Call Trace: [69832.250541] dump_stack+0x9b/0xce [69832.251232] ? bfq_dispatch_request+0x1045/0x44b0 [69832.252243] print_address_description.constprop.6+0x3e/0x60 [69832.253381] ? __cpuidle_text_end+0x5/0x5 [69832.254211] ? vprintk_func+0x6b/0x120 [69832.254994] ? bfq_dispatch_request+0x1045/0x44b0 [69832.255952] ? bfq_dispatch_request+0x1045/0x44b0 [69832.256914] kasan_report.cold.9+0x22/0x3a [69832.257753] ? bfq_dispatch_request+0x1045/0x44b0 [69832.258755] check_memory_region+0x1c1/0x1e0 [69832.260248] bfq_dispatch_request+0x1045/0x44b0 [69832.261181] ? bfq_bfqq_expire+0x2440/0x2440 [69832.262032] ? blk_mq_delay_run_hw_queues+0xf9/0x170 [69832.263022] __blk_mq_do_dispatch_sched+0x52f/0x830 [69832.264011] ? blk_mq_sched_request_inserted+0x100/0x100 [69832.265101] __blk_mq_sched_dispatch_requests+0x398/0x4f0 [69832.266206] ? blk_mq_do_dispatch_ctx+0x570/0x570 [69832.267147] ? __switch_to+0x5f4/0xee0 [69832.267898] blk_mq_sched_dispatch_requests+0xdf/0x140 [69832.268946] __blk_mq_run_hw_queue+0xc0/0x270 [69832.269840] blk_mq_run_work_fn+0x51/0x60 [69832.278170] process_one_work+0x6d4/0xfe0 [69832.278984] worker_thread+0x91/0xc80 [69832.279726] ? __kthread_parkme+0xb0/0x110 [69832.280554] ? process_one_work+0xfe0/0xfe0 [69832.281414] kthread+0x32d/0x3f0 [69832.282082] ? kthread_park+0x170/0x170 [69832.282849] ret_from_fork+0x1f/0x30 [69832.283573] [69832.283886] Allocated by task 7725: [69832.284599] kasan_save_stack+0x19/0x40 [69832.285385] __kasan_kmalloc.constprop.2+0xc1/0xd0 [69832.286350] kmem_cache_alloc_node+0x13f/0x460 [69832.287237] bfq_get_queue+0x3d4/0x1140 [69832.287993] bfq_get_bfqq_handle_split+0x103/0x510 [69832.289015] bfq_init_rq+0x337/0x2d50 [69832.289749] bfq_insert_requests+0x304/0x4e10 [69832.290634] blk_mq_sched_insert_requests+0x13e/0x390 [69832.291629] blk_mq_flush_plug_list+0x4b4/0x760 [69832.292538] blk_flush_plug_list+0x2c5/0x480 [69832.293392] io_schedule_prepare+0xb2/0xd0 [69832.294209] io_schedule_timeout+0x13/0x80 [69832.295014] wait_for_common_io.constprop.1+0x13c/0x270 [69832.296137] submit_bio_wait+0x103/0x1a0 [69832.296932] blkdev_issue_discard+0xe6/0x160 [69832.297794] blk_ioctl_discard+0x219/0x290 [69832.298614] blkdev_common_ioctl+0x50a/0x1750 [69832.304715] blkdev_ioctl+0x470/0x600 [69832.305474] block_ioctl+0xde/0x120 [69832.306232] vfs_ioctl+0x6c/0xc0 [69832.306877] __se_sys_ioctl+0x90/0xa0 [69832.307629] do_syscall_64+0x2d/0x40 [69832.308362] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [69832.309382] [69832.309701] Freed by task 155: [69832.310328] kasan_save_stack+0x19/0x40 [69832.311121] kasan_set_track+0x1c/0x30 [69832.311868] kasan_set_free_info+0x1b/0x30 [69832.312699] __kasan_slab_free+0x111/0x160 [69832.313524] kmem_cache_free+0x94/0x460 [69832.314367] bfq_put_queue+0x582/0x940 [69832.315112] __bfq_bfqd_reset_in_service+0x166/0x1d0 [69832.317275] bfq_bfqq_expire+0xb27/0x2440 [69832.318084] bfq_dispatch_request+0x697/0x44b0 [69832.318991] __blk_mq_do_dispatch_sched+0x52f/0x830 [69832.319984] __blk_mq_sched_dispatch_requests+0x398/0x4f0 [69832.321087] blk_mq_sched_dispatch_requests+0xdf/0x140 [69832.322225] __blk_mq_run_hw_queue+0xc0/0x270 [69832.323114] blk_mq_run_work_fn+0x51/0x60 [69832.323942] process_one_work+0x6d4/0xfe0 [69832.324772] worker_thread+0x91/0xc80 [69832.325518] kthread+0x32d/0x3f0 [69832.326205] ret_from_fork+0x1f/0x30 [69832.326932] [69832.338297] The buggy address belongs to the object at ffff88802622b968 [69832.338297] which belongs to the cache bfq_queue of size 512 [69832.340766] The buggy 
address is located 288 bytes inside of [69832.340766] 512-byte region [ffff88802622b968, ffff88802622bb68) [69832.343091] The buggy address belongs to the page: [69832.344097] page:ffffea0000988a00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88802622a528 pfn:0x26228 [69832.346214] head:ffffea0000988a00 order:2 compound_mapcount:0 compound_pincount:0 [69832.347719] flags: 0x1fffff80010200(slab|head) [69832.348625] raw: 001fffff80010200 ffffea0000dbac08 ffff888017a57650 ffff8880179fe840 [69832.354972] raw: ffff88802622a528 0000000000120008 00000001ffffffff 0000000000000000 [69832.356547] page dumped because: kasan: bad access detected [69832.357652] [69832.357970] Memory state around the buggy address: [69832.358926] ffff88802622b980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [69832.360358] ffff88802622ba00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [69832.361810] >ffff88802622ba80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [69832.363273] ^ [69832.363975] ffff88802622bb00: fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc [69832.375960] ffff88802622bb80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [69832.377405] ================================================================== In the bfq_dispatch_request function, the following call chain is possible: bfq_dispatch_request __bfq_dispatch_request bfq_select_queue bfq_bfqq_expire __bfq_bfqd_reset_in_service bfq_put_queue kmem_cache_free Along this chain, in_serv_queue has been expired and meets the conditions to be freed, so the memory that in_serv_queue points to has already been released by the time bfq_dispatch_request evaluates idle_timer_disabled; reading the queue flags through that stale pointer is the use-after-free. Fix the problem by checking in_serv_queue == bfqd->in_service_queue: only read idle_timer_disabled if in_serv_queue is still equal to bfqd->in_service_queue. If the memory in_serv_queue points to has been released, this check avoids the use-after-free. And if in_serv_queue has been expired or finished, idle_timer_disabled stays false, which has no effect on bfq_update_dispatch_stats. Reported-by: Hulk Robot Signed-off-by: Zhang Wensheng Link: https://lore.kernel.org/r/20220303070334.3020168-1-zhangwensheng5@huawei.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2f2b97cad980..e234eb408ed0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5181,7 +5181,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; struct bfq_queue *in_serv_queue; - bool waiting_rq, idle_timer_disabled; + bool waiting_rq, idle_timer_disabled = false; spin_lock_irq(&bfqd->lock); @@ -5189,14 +5189,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + if (in_serv_queue == bfqd->in_service_queue) { + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } spin_unlock_irq(&bfqd->lock); - - bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, - idle_timer_disabled); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ?
in_serv_queue : NULL, + idle_timer_disabled); return rq; } From 57e95e4670d1126c103305bcf34a9442f49f6d6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:00:56 +0100 Subject: [PATCH 087/127] block: fix and cleanup bio_check_ro Don't use a WARN_ON when printing a potentially user-triggered condition. Also don't print the partno when the block device name already includes it, and use the %pg specifier to simplify printing the block device name. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 94bf37f8e61d..34e1b7fdb7c8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -580,14 +580,10 @@ late_initcall(fail_make_request_debugfs); static inline bool bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { - char b[BDEVNAME_SIZE]; - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return false; - - WARN_ONCE(1, - "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), bio->bi_bdev->bd_partno); + pr_warn("Trying to write to read-only block-device %pg\n", + bio->bi_bdev); /* Older lvm-tools actually trigger this */ return false; } From ad740780bbc2fe37856f944dbbaff07aac9db9e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:00:57 +0100 Subject: [PATCH 088/127] block: remove handle_bad_sector Use the %pg format specifier instead of the stack-hungry bdevname function, and remove handle_bad_sector given that it is now pointless. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 34e1b7fdb7c8..4d858fc08f8b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -540,17 +540,6 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static void handle_bad_sector(struct bio *bio, sector_t maxsector) -{ - char b[BDEVNAME_SIZE]; - - pr_info_ratelimited("%s: attempt to access beyond end of device\n" - "%s: rw=%d, want=%llu, limit=%llu\n", - current->comm, - bio_devname(bio, b), bio->bi_opf, - bio_end_sector(bio), maxsector); -} - #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); @@ -612,7 +601,11 @@ static inline int bio_check_eod(struct bio *bio) if (nr_sectors && maxsector && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { - handle_bad_sector(bio, maxsector); + pr_info_ratelimited("%s: attempt to access beyond end of device\n" + "%pg: rw=%d, want=%llu, limit=%llu\n", + current->comm, + bio->bi_bdev, bio->bi_opf, + bio_end_sector(bio), maxsector); return -EIO; } return 0; From 47c426d5241795cfcd9be748c44d1b2e2987ce70 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:00:58 +0100 Subject: [PATCH 089/127] pktcdvd: remove a pointless debug check in pkt_submit_bio ->queuedata is set up in pkt_init_queue, so it can't be NULL here.
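The bio_devname() removals that follow, like the two blk-core.c patches above, all lean on the kernel's %pg printk format specifier, which prints a block device name straight from a struct block_device (or gendisk) pointer. The shape of each conversion is roughly as below (an illustrative before/after, not taken verbatim from any one of the patches):

/* before: every caller needs a BDEVNAME_SIZE (32-byte) stack buffer */
char b[BDEVNAME_SIZE];
pr_err("I/O error on %s, sector %llu\n",
       bio_devname(bio, b),
       (unsigned long long)bio->bi_iter.bi_sector);

/* after: vsprintf formats the name from the pointer itself */
pr_err("I/O error on %pg, sector %llu\n",
       bio->bi_bdev,
       (unsigned long long)bio->bi_iter.bi_sector);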
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index be749c686feb..e745fc29e55d 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2400,18 +2400,11 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) static void pkt_submit_bio(struct bio *bio) { - struct pktcdvd_device *pd; - char b[BDEVNAME_SIZE]; + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; struct bio *split; blk_queue_split(&bio); - pd = bio->bi_bdev->bd_disk->queue->queuedata; - if (!pd) { - pr_err("%s incorrect request queue\n", bio_devname(bio, b)); - goto end_io; - } - pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_iter.bi_sector, (unsigned long long)bio_end_sector(bio)); From 66671719650085f92fd460d2a356c33f9198dd35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:00:59 +0100 Subject: [PATCH 090/127] dm-crypt: stop using bio_devname Use the %pg format specifier to save on stack consumption and code size. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a5006cb6ee8a..e2b0af4a2ee8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1364,11 +1364,10 @@ static int crypt_convert_block_aead(struct crypt_config *cc, } if (r == -EBADMSG) { - char b[BDEVNAME_SIZE]; sector_t s = le64_to_cpu(*sector); - DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", - bio_devname(ctx->bio_in, b), s); + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", ctx->bio_in, s, 0); } @@ -2169,11 +2168,10 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { - char b[BDEVNAME_SIZE]; sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); - DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", - bio_devname(ctx->bio_in, b), s); + DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu", + ctx->bio_in->bi_bdev, s); dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead", ctx->bio_in, s, 0); io->error = BLK_STS_PROTECTION; From 0a806cfde82fcd1fb856864e33d17c68d1b51dee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:01:00 +0100 Subject: [PATCH 091/127] dm-integrity: stop using bio_devname Use the %pg format specifier to save on stack consumption and code size. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-integrity.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index eb4b5e52bd6f..c58a5111cb57 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1788,12 +1788,11 @@ again: checksums_ptr - checksums, dio->op == REQ_OP_READ ? 
TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { - char b[BDEVNAME_SIZE]; sector_t s; s = sector - ((r + ic->tag_size - 1) / ic->tag_size); - DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", - bio_devname(bio, b), s); + DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", + bio->bi_bdev, s); r = -EILSEQ; atomic64_inc(&ic->number_of_mismatches); dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", From ee1925bd834418218c782c94e889f826d40b14d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:01:01 +0100 Subject: [PATCH 092/127] md-multipath: stop using bio_devname Use the %pg format specifier to save on stack consumption and code size. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/md-multipath.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 97fb948e3e74..3081a936350d 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -297,7 +297,6 @@ static void multipathd(struct md_thread *thread) md_check_recovery(mddev); for (;;) { - char b[BDEVNAME_SIZE]; spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) break; @@ -309,13 +308,13 @@ static void multipathd(struct md_thread *thread) bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; if ((mp_bh->path = multipath_map (conf))<0) { - pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", - bio_devname(bio, b), + pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n", + bio->bi_bdev, (unsigned long long)bio->bi_iter.bi_sector); multipath_end_bh_io(mp_bh, BLK_STS_IOERR); } else { - pr_err("multipath: %s: redirecting sector %llu to another IO path\n", - bio_devname(bio, b), + pr_err("multipath: %pg: redirecting sector %llu to another IO path\n", + bio->bi_bdev, (unsigned long long)bio->bi_iter.bi_sector); *bio = *(mp_bh->master_bio); bio->bi_iter.bi_sector += From ac483eb375fa4a815a515945a5456086c197430e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:01:02 +0100 Subject: [PATCH 093/127] raid1: stop using bio_devname Use the %pg format specifier to save on stack consumption and code size. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c3288d46948d..03477e971699 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2072,15 +2072,14 @@ static int fix_sync_read_error(struct r1bio *r1_bio) } while (!success && d != r1_bio->read_disk); if (!success) { - char b[BDEVNAME_SIZE]; int abort = 0; /* Cannot read from anywhere, this block is lost. * Record a bad block on each device. If that doesn't * work just disable and interrupt the recovery. * Don't fail devices as that won't really help. 
*/ - pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), bio_devname(bio, b), + pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n", + mdname(mddev), bio->bi_bdev, (unsigned long long)r1_bio->sector); for (d = 0; d < conf->raid_disks * 2; d++) { rdev = conf->mirrors[d].rdev; From c7dec4623c9cde20dad8de319d177ed6aa382aaa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:01:03 +0100 Subject: [PATCH 094/127] raid5-ppl: stop using bio_devname Use the %pg format specifier to save on stack consumption and code size. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/raid5-ppl.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 3446797fa0ac..bbb5673104ec 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -416,12 +416,10 @@ static void ppl_log_endio(struct bio *bio) static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) { - char b[BDEVNAME_SIZE]; - - pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", + pr_debug("%s: seq: %llu size: %u sector: %llu dev: %pg\n", __func__, io->seq, bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector, - bio_devname(bio, b)); + bio->bi_bdev); submit_bio(bio); } @@ -589,9 +587,8 @@ static void ppl_flush_endio(struct bio *bio) struct ppl_log *log = io->log; struct ppl_conf *ppl_conf = log->ppl_conf; struct r5conf *conf = ppl_conf->mddev->private; - char b[BDEVNAME_SIZE]; - pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b)); + pr_debug("%s: dev: %pg\n", __func__, bio->bi_bdev); if (bio->bi_status) { struct md_rdev *rdev; @@ -634,7 +631,6 @@ static void ppl_do_flush(struct ppl_io_unit *io) if (bdev) { struct bio *bio; - char b[BDEVNAME_SIZE]; bio = bio_alloc_bioset(bdev, 0, GFP_NOIO, REQ_OP_WRITE | REQ_PREFLUSH, @@ -642,8 +638,7 @@ static void ppl_do_flush(struct ppl_io_unit *io) bio->bi_private = io; bio->bi_end_io = ppl_flush_endio; - pr_debug("%s: dev: %s\n", - bio_devname(bio, b)); + pr_debug("%s: dev: %pg\n", __func__, bio->bi_bdev); submit_bio(bio); flushed_disks++; From 734294e47a2ec48fd25dcf2d96cdf2c6c6740c00 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:01:04 +0100 Subject: [PATCH 095/127] ext4: stop using bio_devname Use the %pg format specifier to save on stack consumption and code size. 
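To make the pattern behind all of these conversions concrete, a hedged before/after sketch (illustrative only; bio_devname() still exists at this point in the series and is removed a few patches later):

static void report_bdev(struct bio *bio)
{
	char b[BDEVNAME_SIZE];		/* the stack cost the conversions remove */

	pr_info("old style: %s\n", bio_devname(bio, b));
	pr_info("new style: %pg\n", bio->bi_bdev);	/* no buffer needed */
}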
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-10-hch@lst.de Signed-off-by: Jens Axboe --- fs/ext4/page-io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 125398226873..18373fa52922 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -323,10 +323,9 @@ static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; - char b[BDEVNAME_SIZE]; - if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", - bio_devname(bio, b), + if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n", + bio->bi_bdev, (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { From 97939610b893de068c82c347d06319cd231a4602 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:01:05 +0100 Subject: [PATCH 096/127] block: remove bio_devname All callers are gone, so remove this wrapper. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-11-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 6 ------ include/linux/bio.h | 2 -- 2 files changed, 8 deletions(-) diff --git a/block/bio.c b/block/bio.c index b15f5466ce08..151cace2dbe1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -807,12 +807,6 @@ int bio_init_clone(struct block_device *bdev, struct bio *bio, } EXPORT_SYMBOL(bio_init_clone); -const char *bio_devname(struct bio *bio, char *buf) -{ - return bdevname(bio->bi_bdev, buf); -} -EXPORT_SYMBOL(bio_devname); - /** * bio_full - check if the bio is full * @bio: bio to check diff --git a/include/linux/bio.h b/include/linux/bio.h index 7523aba4ddf7..4c21f6e69e18 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -491,8 +491,6 @@ static inline void bio_release_pages(struct bio *bio, bool mark_dirty) __bio_release_pages(bio, mark_dirty); } -extern const char *bio_devname(struct bio *bio, char *buffer); - #define bio_dev(bio) \ disk_devt((bio)->bi_bdev->bd_disk) From 15729ff8143f8135b03988a100a19e66d7cb7ecd Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 25 Nov 2021 19:15:10 +0100 Subject: [PATCH 097/127] Revert "Revert "block, bfq: honor already-setup queue merges"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A crash [1] happened to be triggered in conjunction with commit 2d52c58b9c9b ("block, bfq: honor already-setup queue merges"). The latter was then reverted by commit ebc69e897e17 ("Revert "block, bfq: honor already-setup queue merges""). Yet, the reverted commit was not the one introducing the bug. In fact, it actually triggered a UAF introduced by a different commit, and now fixed by commit d29bd41428cf ("block, bfq: reset last_bfqq_created on group change"). So, there is no point in keeping commit 2d52c58b9c9b ("block, bfq: honor already-setup queue merges") out. This commit restores it. 
[1] https://bugzilla.kernel.org/show_bug.cgi?id=214503 Reported-by: Holger Hoffstätte Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20211125181510.15004-1-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e234eb408ed0..975fd30a07c9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2782,6 +2782,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increase new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2844,6 +2853,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + /* * Check delayed stable merge for rotational or non-queueing * devs. For this branch to be executed, bfqq must not be @@ -2945,9 +2958,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; From 4d805131abf219e0019715f1cf29763c613aae07 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:14 +0800 Subject: [PATCH 098/127] blk-mq: figure out correct numa node for hw queue The current code always uses the default queue map and the hw queue index to figure out the numa node for a hw queue. This isn't correct, because blk-mq supports three queue maps, and the queue map that actually contains the given hw queue must be used. 
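To make the mapping concrete, a hypothetical layout (the numbers are illustrative, not taken from the patch):

/*
 * Example tag_set with nr_maps == 3; hw queue indices are contiguous
 * per map:
 *   map[HCTX_TYPE_DEFAULT]: queue_offset = 0,  nr_queues = 8  -> hctx 0..7
 *   map[HCTX_TYPE_READ]:    queue_offset = 8,  nr_queues = 8  -> hctx 8..15
 *   map[HCTX_TYPE_POLL]:    queue_offset = 16, nr_queues = 4  -> hctx 16..19
 *
 * hctx_idx 18 lies in the poll map, so its numa node must be derived
 * from map[HCTX_TYPE_POLL], not from map[HCTX_TYPE_DEFAULT] as before.
 */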
Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a05ce7725031..5d25abf9e551 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3107,15 +3107,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags) blk_mq_free_tags(tags); } +static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + int i; + + for (i = 0; i < set->nr_maps; i++) { + unsigned int start = set->map[i].queue_offset; + unsigned int end = start + set->map[i].nr_queues; + + if (hctx_idx >= start && hctx_idx < end) + break; + } + + if (i >= set->nr_maps) + i = HCTX_TYPE_DEFAULT; + + return i; +} + +static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + enum hctx_type type = hctx_idx_to_type(set, hctx_idx); + + return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); +} + static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { + int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3164,10 +3190,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; + int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3941,10 +3966,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { - int node; + int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *hctx; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); /* * If the hw queue has been mapped to another numa node, * we need to realloc the hctx. If allocation fails, fallback From 306f13ee16424805d602d427a66ea38078d473b0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:15 +0800 Subject: [PATCH 099/127] blk-mq: simplify reallocation of hw ctxs a bit blk_mq_alloc_and_init_hctx() has already taken reuse into account, so there is no need to do it outside, and blk_mq_realloc_hw_ctxs() can be simplified. Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5d25abf9e551..64fddb36c93c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3966,29 +3966,24 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { + int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *old_hctx = hctxs[i]; - /* - * If the hw queue has been mapped to another numa node, - * we need to realloc the hctx. 
If allocation fails, fallback - to use the previous one. - */ - if (hctxs[i] && (hctxs[i]->numa_node == node)) - continue; + if (old_hctx) { + old_node = old_hctx->numa_node; + blk_mq_exit_hctx(q, set, old_hctx, i); + } - hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (hctx) { - if (hctxs[i]) - blk_mq_exit_hctx(q, set, hctxs[i], i); - hctxs[i] = hctx; - } else { - if (hctxs[i]) - pr_warn("Allocate new hctx on node %d fails,\ - fallback to previous one on node %d\n", - node, hctxs[i]->numa_node); - else + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (!hctxs[i]) { + if (!old_hctx) break; + pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", + node, old_node); + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, + old_node); + WARN_ON_ONCE(!hctxs[i]); } } /* From 42ee3061293e206bc0f3a084feea1304c61c137a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:16 +0800 Subject: [PATCH 100/127] blk-mq: reconfigure poll after queue map is changed The queue map can change when nr_hw_queues is updated, so the queue's poll capability needs to be reconfigured. Add a helper for this job. Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 64fddb36c93c..57ae9df0f4dc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4010,6 +4010,17 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } +static void blk_mq_update_poll_flag(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); +} + int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -4044,9 +4055,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); + blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); @@ -4512,6 +4521,7 @@ fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + blk_mq_update_poll_flag(q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; From de0328d3a253a339be14a80fe2a0256ec26867da Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:17 +0800 Subject: [PATCH 101/127] block: mtip32xx: don't touch q->queue_hw_ctx q->queue_hw_ctx is a blk-mq internal structure for retrieving a hctx via its index and is not supposed to be used by drivers. Drivers can get the tags structure easily from the tagset instead. 
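As a hedged illustration of that rule (hypothetical helper, not part of the patch), a driver should resolve a tag through the tag_set it owns rather than through the queue's internal hw ctx array:

static struct request *sketch_tag_to_rq(struct blk_mq_tag_set *set,
					unsigned int tag)
{
	/* hw queue 0 assumed, as in single hw queue drivers like mtip32xx */
	return blk_mq_tag_to_rq(set->tags[0], tag);
}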
Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index cba956881d55..2d43ab5b5cc0 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -160,9 +160,7 @@ static bool mtip_check_surprise_removal(struct driver_data *dd) static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, unsigned int tag) { - struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; - - return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(hctx->tags, tag)); + return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(dd->tags.tags[0], tag)); } /* From 4f481208749a22d3570073e629dbc27d7d27c8da Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:18 +0800 Subject: [PATCH 102/127] blk-mq: prepare for implementing hctx table via xarray A use-after-free on q->queue_hw_ctx is unavoidable when queue_for_each_hw_ctx() races with blk_mq_update_nr_hw_queues(), and converting to an xarray can fix the use-after-free while also making the code cleaner. Prepare for converting q->queue_hw_ctx into an xarray: since xa_for_each() only accepts 'unsigned long' as the index, change the type of the hctx index used by queue_for_each_hw_ctx() to 'unsigned long'. Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 6 +++--- block/blk-mq-sched.c | 9 +++++---- block/blk-mq-sysfs.c | 16 ++++++++++------ block/blk-mq-tag.c | 2 +- block/blk-mq.c | 30 ++++++++++++++++-------------- drivers/block/rnbd/rnbd-clt.c | 2 +- 6 files changed, 36 insertions(+), 29 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3a790eb4995c..e2880f6deb34 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -707,7 +707,7 @@ static void debugfs_create_files(struct dentry *parent, void *data, void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); @@ -780,7 +780,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); @@ -789,7 +789,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q) void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55488ba97823..e6ad8f761474 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -515,7 +515,7 @@ static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { @@ -550,9 +550,10 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { - unsigned int i, flags = q->tag_set->flags; + 
unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; + unsigned long i; int ret; if (!e) { @@ -618,7 +619,7 @@ err_free_map_and_rqs: void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; if (blk_mq_is_shared_tags(q->tag_set->flags)) { blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, @@ -635,7 +636,7 @@ void blk_mq_sched_free_rqs(struct request_queue *q) void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 674786574075..c08426975856 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -206,7 +206,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; lockdep_assert_held(&q->sysfs_dir_lock); @@ -255,7 +255,8 @@ void blk_mq_sysfs_init(struct request_queue *q) int __blk_mq_register_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int ret, i; + unsigned long i, j; + int ret; WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); @@ -278,8 +279,10 @@ out: return ret; unreg: - while (--i >= 0) - blk_mq_unregister_hctx(q->queue_hw_ctx[i]); + queue_for_each_hw_ctx(q, hctx, j) { + if (j < i) + blk_mq_unregister_hctx(hctx); + } kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); @@ -290,7 +293,7 @@ unreg: void blk_mq_sysfs_unregister(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) @@ -306,7 +309,8 @@ unlock: int blk_mq_sysfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 0fd409b8e86e..1850a4225e12 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -515,7 +515,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index 57ae9df0f4dc..bffdd71c670d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -312,7 +312,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) @@ -1442,7 +1442,7 @@ static void blk_mq_timeout_work(struct work_struct *work) container_of(work, struct request_queue, timeout_work); unsigned long next = 0; struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting @@ -2143,7 +2143,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2171,7 +2171,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned 
long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2209,7 +2209,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); bool blk_mq_queue_stopped(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hctx_stopped(hctx)) @@ -2248,7 +2248,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); @@ -2266,7 +2266,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); @@ -2286,7 +2286,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async); @@ -3446,7 +3446,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) @@ -3637,7 +3637,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, j, hctx_idx; + unsigned int j, hctx_idx; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -3744,7 +3745,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { @@ -3844,7 +3845,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q) void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); @@ -4362,7 +4363,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; - int i, ret; + int ret; + unsigned long i; if (!set) return -EINVAL; @@ -4738,7 +4740,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q) { if (queue_is_mq(q)) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; cancel_delayed_work_sync(&q->requeue_work); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index c08971de369f..58304f978e10 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1343,7 +1343,7 @@ static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev, static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) { - int i; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct rnbd_queue *q; From 4e5cc99e1e485954a9c09872e0eeea570fb2b5a5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:19 +0800 Subject: [PATCH 103/127] blk-mq: manage hctx map via xarray First, the code becomes cleaner by switching from a plain array to an xarray. Second, the use-after-free on q->queue_hw_ctx can be fixed, because queue_for_each_hw_ctx() may run while an update of nr_hw_queues is in progress. 
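As an aside, a minimal sketch of the iteration idiom this enables (illustrative only, assuming a struct request_queue *q whose hctx map lives in an xarray):

	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	xa_for_each(&q->hctx_table, i, hctx)
		pr_debug("hctx %lu on node %u\n", i, hctx->numa_node);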
With this patch, q->hctx_table is defined as an xarray that shares its lifetime with the request queue, so queue_for_each_hw_ctx() can use q->hctx_table to look up a hctx reliably. Reported-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220308073219.91173-7-ming.lei@redhat.com [axboe: fix blk_mq_hw_ctx forward declaration] Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.h | 2 ++ block/blk-mq-tag.c | 2 +- block/blk-mq.c | 62 ++++++++++++++++-------------------------- block/blk-mq.h | 2 +- include/linux/blk-mq.h | 3 +- include/linux/blkdev.h | 2 +- 6 files changed, 30 insertions(+), 43 deletions(-) diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a68aa6041a10..69918f4170d6 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -6,6 +6,8 @@ #include +struct blk_mq_hw_ctx; + struct blk_mq_debugfs_attr { const char *name; umode_t mode; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 1850a4225e12..68ac23d0b640 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { /* - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ diff --git a/block/blk-mq.c b/block/blk-mq.c index bffdd71c670d..e8d6d7a05a43 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; + return xa_load(&q->hctx_table, + (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); } static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, @@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. 
*/ ret = -EXDEV; - data.hctx = q->queue_hw_ctx[hctx_idx]; + data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -3437,6 +3438,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_remove_cpuhp(hctx); + xa_erase(&q->hctx_table, hctx_idx); + spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3476,8 +3479,15 @@ static int blk_mq_init_hctx(struct request_queue *q, if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; + + if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) + goto exit_flush_rq; + return 0; + exit_flush_rq: + if (set->ops->exit_request) + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -3856,7 +3866,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because @@ -3945,46 +3955,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j, end; - struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - - if (q->nr_hw_queues < set->nr_hw_queues) { - struct blk_mq_hw_ctx **new_hctxs; - - new_hctxs = kcalloc_node(set->nr_hw_queues, - sizeof(*new_hctxs), GFP_KERNEL, - set->numa_node); - if (!new_hctxs) - return; - if (hctxs) - memcpy(new_hctxs, hctxs, q->nr_hw_queues * - sizeof(*hctxs)); - q->queue_hw_ctx = new_hctxs; - kfree(hctxs); - hctxs = new_hctxs; - } + struct blk_mq_hw_ctx *hctx; + unsigned long i, j; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *old_hctx = hctxs[i]; + struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } - hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (!hctxs[i]) { + if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); - hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, - old_node); - WARN_ON_ONCE(!hctxs[i]); + hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); + WARN_ON_ONCE(!hctx); } } /* @@ -3993,21 +3985,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; - end = i; } else { j = i; - end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - for (; j < end; j++) { - struct blk_mq_hw_ctx *hctx = hctxs[j]; - - if (hctx) { - blk_mq_exit_hctx(q, set, hctx, j); - hctxs[j] = NULL; - } - } + xa_for_each_start(&q->hctx_table, j, hctx, j) + blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); } @@ -4046,6 +4030,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); + xa_init(&q->hctx_table); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4075,7 +4061,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); q->nr_hw_queues = 0; blk_mq_sysfs_deinit(q); 
err_poll: diff --git a/block/blk-mq.h b/block/blk-mq.h index 948791ea2a3e..2615bd58bad3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -83,7 +83,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; + return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3a41d50b85d3..7aa5c54901a9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -917,8 +917,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) } #define queue_for_each_hw_ctx(q, hctx, i) \ - for ((i) = 0; (i) < (q)->nr_hw_queues && \ - ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) + xa_for_each(&(q)->hctx_table, (i), (hctx)) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e19947d84f12..d982171469bc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -355,7 +355,7 @@ struct request_queue { unsigned int queue_depth; /* hw dispatch queues */ - struct blk_mq_hw_ctx **queue_hw_ctx; + struct xarray hctx_table; unsigned int nr_hw_queues; /* From 41fa722239b46f0c033da94746212344175cdaa0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:47 +0100 Subject: [PATCH 104/127] blk-mq: do not include passthrough requests in I/O accounting I/O accounting buckets I/O into the read/write/discard categories into which passthrough I/O does not fit at all. It also accounts to the block_device, which may not even exist for passthrough I/O. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220308055200.735835-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++--- block/blk.h | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e8d6d7a05a43..40f634ab7026 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -884,10 +884,15 @@ static inline void blk_account_io_done(struct request *req, u64 now) static void __blk_account_io_start(struct request *rq) { - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. 
+ */ + if (rq->bio) rq->part = rq->bio->bi_bdev; - else if (rq->q->disk) + else rq->part = rq->q->disk->part0; part_stat_lock(); diff --git a/block/blk.h b/block/blk.h index ebaa59ca46ca..6f21859c7f0f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -325,7 +325,7 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; + return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); } void update_io_ticks(struct block_device *part, unsigned long now, bool end); From e02657ea7b86d1c41e095f49e4f7c7bb24bbee64 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 06:51:48 +0100 Subject: [PATCH 105/127] blk-mq: handle already freed tags gracefully in blk_mq_free_rqs To simplify further changes, allow blk_mq_free_rqs to be called twice on a queue. Signed-off-by: Ming Lei [hch: split out from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220308055200.735835-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 40f634ab7026..f7ef8e2ab935 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3071,6 +3071,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct blk_mq_tags *drv_tags; struct page *page; + if (list_empty(&tags->page_list)) + return; + if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else From e7f76552277ca79d9dfe4c59f7b3620a98467f9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:49 +0100 Subject: [PATCH 106/127] scsi: don't use disk->private_data to find the scsi_driver Requiring every ULP to have the scsi_driver as first member of the private data is rather fragile and not necessary anyway. Just use the driver hanging off the SCSI device instead. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20220308055200.735835-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 3 +-- drivers/scsi/sd.h | 3 +-- drivers/scsi/sr.c | 5 ++--- drivers/scsi/sr.h | 1 - drivers/scsi/st.c | 1 - drivers/scsi/st.h | 1 - include/scsi/scsi_cmnd.h | 9 --------- include/scsi/scsi_driver.h | 9 +++++++-- 8 files changed, 11 insertions(+), 21 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 2d648d27bfd7..2a1e19e871d3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3515,7 +3515,6 @@ static int sd_probe(struct device *dev) } sdkp->device = sdp; - sdkp->driver = &sd_template; sdkp->disk = gd; sdkp->index = index; sdkp->max_retries = SD_MAX_RETRIES; @@ -3548,7 +3547,7 @@ static int sd_probe(struct device *dev) gd->minors = SD_MINORS; gd->fops = &sd_fops; - gd->private_data = &sdkp->driver; + gd->private_data = sdkp; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 2e5932bde43d..303aa1c23aef 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -68,7 +68,6 @@ enum { }; struct scsi_disk { - struct scsi_driver *driver; /* always &sd_template */ struct scsi_device *device; struct device dev; struct gendisk *disk; @@ -131,7 +130,7 @@ struct scsi_disk { static inline struct scsi_disk *scsi_disk(struct gendisk *disk) { - return container_of(disk->private_data, struct scsi_disk, driver); + return disk->private_data; } #define sd_printk(prefix, sdsk, fmt, a...) \ diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index f925b1f1f9ad..569bda76a517 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -147,7 +147,7 @@ static void sr_kref_release(struct kref *kref); static inline struct scsi_cd *scsi_cd(struct gendisk *disk) { - return container_of(disk->private_data, struct scsi_cd, driver); + return disk->private_data; } static int sr_runtime_suspend(struct device *dev) @@ -692,7 +692,6 @@ static int sr_probe(struct device *dev) cd->device = sdev; cd->disk = disk; - cd->driver = &sr_template; cd->capacity = 0x1fffff; cd->device->changed = 1; /* force recheck CD type */ cd->media_present = 1; @@ -713,7 +712,7 @@ static int sr_probe(struct device *dev) sr_vendor_init(cd); set_capacity(disk, cd->capacity); - disk->private_data = &cd->driver; + disk->private_data = cd; if (register_cdrom(disk, &cd->cdi)) goto fail_minor; diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h index 1609f02ed29a..d80af3fcb6f9 100644 --- a/drivers/scsi/sr.h +++ b/drivers/scsi/sr.h @@ -32,7 +32,6 @@ struct scsi_device; typedef struct scsi_cd { - struct scsi_driver *driver; unsigned capacity; /* size in blocks */ struct scsi_device *device; unsigned int vendor; /* vendor code, see sr_vendor.c */ diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index e869e90e05af..ebe9412c86f4 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4276,7 +4276,6 @@ static int st_probe(struct device *dev) goto out_buffer_free; } kref_init(&tpnt->kref); - tpnt->driver = &st_template; tpnt->device = SDp; if (SDp->scsi_level <= 2) diff --git a/drivers/scsi/st.h b/drivers/scsi/st.h index c0ef0d9aaf8a..7a68eaba7e81 100644 --- a/drivers/scsi/st.h +++ b/drivers/scsi/st.h @@ -117,7 +117,6 @@ struct scsi_tape_stats { /* The tape drive descriptor */ struct scsi_tape { - struct scsi_driver *driver; struct scsi_device *device; struct mutex lock; /* For serialization */ struct completion wait; /* For SCSI commands */ diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 
6794d7322cbd..e3a4c67794b1 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -13,7 +13,6 @@ #include struct Scsi_Host; -struct scsi_driver; /* * MAX_COMMAND_SIZE is: @@ -159,14 +158,6 @@ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) return cmd + 1; } -/* make sure not to use it with passthrough commands */ -static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) -{ - struct request *rq = scsi_cmd_to_rq(cmd); - - return *(struct scsi_driver **)rq->q->disk->private_data; -} - void scsi_done(struct scsi_cmnd *cmd); extern void scsi_finish_command(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h index 6dffa8555a39..4ce1988b2ba0 100644 --- a/include/scsi/scsi_driver.h +++ b/include/scsi/scsi_driver.h @@ -4,11 +4,10 @@ #include #include +#include struct module; struct request; -struct scsi_cmnd; -struct scsi_device; struct scsi_driver { struct device_driver gendrv; @@ -31,4 +30,10 @@ extern int scsi_register_interface(struct class_interface *); #define scsi_unregister_interface(intf) \ class_interface_unregister(intf) +/* make sure not to use it with passthrough commands */ +static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) +{ + return to_scsi_driver(cmd->device->sdev_gendev.driver); +} + #endif /* _SCSI_SCSI_DRIVER_H */ From fad45c3007a18064da759b4dba35eb722bc64e97 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:50 +0100 Subject: [PATCH 107/127] sd: rename the scsi_disk.dev field dev is very hard to grep for. Give the field a more descriptive name and document its purpose. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20220308055200.735835-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 22 +++++++++++----------- drivers/scsi/sd.h | 9 +++++++-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 2a1e19e871d3..7479e7cb36b4 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -672,7 +672,7 @@ static struct scsi_disk *scsi_disk_get(struct gendisk *disk) if (disk->private_data) { sdkp = scsi_disk(disk); if (scsi_device_get(sdkp->device) == 0) - get_device(&sdkp->dev); + get_device(&sdkp->disk_dev); else sdkp = NULL; } @@ -685,7 +685,7 @@ static void scsi_disk_put(struct scsi_disk *sdkp) struct scsi_device *sdev = sdkp->device; mutex_lock(&sd_ref_mutex); - put_device(&sdkp->dev); + put_device(&sdkp->disk_dev); scsi_device_put(sdev); mutex_unlock(&sd_ref_mutex); } @@ -3529,14 +3529,14 @@ static int sd_probe(struct device *dev) SD_MOD_TIMEOUT); } - device_initialize(&sdkp->dev); - sdkp->dev.parent = get_device(dev); - sdkp->dev.class = &sd_disk_class; - dev_set_name(&sdkp->dev, "%s", dev_name(dev)); + device_initialize(&sdkp->disk_dev); + sdkp->disk_dev.parent = get_device(dev); + sdkp->disk_dev.class = &sd_disk_class; + dev_set_name(&sdkp->disk_dev, "%s", dev_name(dev)); - error = device_add(&sdkp->dev); + error = device_add(&sdkp->disk_dev); if (error) { - put_device(&sdkp->dev); + put_device(&sdkp->disk_dev); goto out; } @@ -3577,7 +3577,7 @@ static int sd_probe(struct device *dev) error = device_add_disk(dev, gd, NULL); if (error) { - put_device(&sdkp->dev); + put_device(&sdkp->disk_dev); goto out; } @@ -3628,7 +3628,7 @@ static int sd_remove(struct device *dev) sdkp = dev_get_drvdata(dev); scsi_autopm_get_device(sdkp->device); - device_del(&sdkp->dev); + device_del(&sdkp->disk_dev); del_gendisk(sdkp->disk); sd_shutdown(dev); @@ -3636,7 +3636,7 @@ static int sd_remove(struct device *dev) mutex_lock(&sd_ref_mutex); dev_set_drvdata(dev, NULL); - put_device(&sdkp->dev); + put_device(&sdkp->disk_dev); mutex_unlock(&sd_ref_mutex); return 0; diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 303aa1c23aef..0a33a4b68ffb 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -69,7 +69,12 @@ enum { struct scsi_disk { struct scsi_device *device; - struct device dev; + + /* + * disk_dev is used to show attributes in /sys/class/scsi_disk/, + * but otherwise not really needed. Do not use for refcounting. + */ + struct device disk_dev; struct gendisk *disk; struct opal_dev *opal_dev; #ifdef CONFIG_BLK_DEV_ZONED @@ -126,7 +131,7 @@ struct scsi_disk { unsigned security : 1; unsigned ignore_medium_access_errors : 1; }; -#define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev) +#define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) static inline struct scsi_disk *scsi_disk(struct gendisk *disk) { From c76c46fa04c42f7d3a494c526ce5f030da36b553 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:51 +0100 Subject: [PATCH 108/127] sd: call sd_zbc_release_disk before releasing the scsi_device reference sd_zbc_release_disk accesses disk->device, so make sure it is called while that reference is still valid. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20220308055200.735835-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 7479e7cb36b4..7bfebf5b2832 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3672,9 +3672,9 @@ static void scsi_disk_release(struct device *dev) disk->private_data = NULL; put_disk(disk); - put_device(&sdkp->device->sdev_gendev); sd_zbc_release_disk(sdkp); + put_device(&sdkp->device->sdev_gendev); kfree(sdkp); } From 534cf52aa9c18d87599a8811a39735c1b62742a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:52 +0100 Subject: [PATCH 109/127] sd: delay calling free_opal_dev Call free_opal_dev from scsi_disk_release as the opal_dev field is accessed from the ioctl handler, which isn't synchronized vs sd_release and thus can be accessed during or after sd_release was called. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220308055200.735835-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 7bfebf5b2832..346b8d62de7d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3632,8 +3632,6 @@ static int sd_remove(struct device *dev) del_gendisk(sdkp->disk); sd_shutdown(dev); - free_opal_dev(sdkp->opal_dev); - mutex_lock(&sd_ref_mutex); dev_set_drvdata(dev, NULL); put_device(&sdkp->disk_dev); @@ -3675,6 +3673,7 @@ static void scsi_disk_release(struct device *dev) sd_zbc_release_disk(sdkp); put_device(&sdkp->device->sdev_gendev); + free_opal_dev(sdkp->opal_dev); kfree(sdkp); } From 9c63f7f6ff3142edc81776df35d79e088df6ba6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:53 +0100 Subject: [PATCH 110/127] sd: implement ->free_disk to simplify refcounting Implement the ->free_disk method to put the struct scsi_disk when the last gendisk reference goes away. This removes the need to clear ->private_data and thus freeze the queue on unbind. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220308055200.735835-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 90 +++++++++-------------------------------------- 1 file changed, 16 insertions(+), 74 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 346b8d62de7d..73e6f5f0f37c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -121,11 +121,6 @@ static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); -/* This semaphore is used to mediate the 0->1 reference get in the - * face of object destruction (i.e. 
we can't allow a get on an
- * object after last put) */
-static DEFINE_MUTEX(sd_ref_mutex);
-
 static struct kmem_cache *sd_cdb_cache;
 static mempool_t *sd_cdb_pool;
 static mempool_t *sd_page_pool;
@@ -663,33 +658,6 @@ static int sd_major(int major_idx)
 	}
 }
 
-static struct scsi_disk *scsi_disk_get(struct gendisk *disk)
-{
-	struct scsi_disk *sdkp = NULL;
-
-	mutex_lock(&sd_ref_mutex);
-
-	if (disk->private_data) {
-		sdkp = scsi_disk(disk);
-		if (scsi_device_get(sdkp->device) == 0)
-			get_device(&sdkp->disk_dev);
-		else
-			sdkp = NULL;
-	}
-	mutex_unlock(&sd_ref_mutex);
-	return sdkp;
-}
-
-static void scsi_disk_put(struct scsi_disk *sdkp)
-{
-	struct scsi_device *sdev = sdkp->device;
-
-	mutex_lock(&sd_ref_mutex);
-	put_device(&sdkp->disk_dev);
-	scsi_device_put(sdev);
-	mutex_unlock(&sd_ref_mutex);
-}
-
 #ifdef CONFIG_BLK_SED_OPAL
 static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer,
 		size_t len, bool send)
@@ -1418,17 +1386,15 @@ static bool sd_need_revalidate(struct block_device *bdev,
  **/
 static int sd_open(struct block_device *bdev, fmode_t mode)
 {
-	struct scsi_disk *sdkp = scsi_disk_get(bdev->bd_disk);
-	struct scsi_device *sdev;
+	struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
+	struct scsi_device *sdev = sdkp->device;
 	int retval;
 
-	if (!sdkp)
+	if (scsi_device_get(sdev))
 		return -ENXIO;
 
 	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n"));
 
-	sdev = sdkp->device;
-
 	/*
 	 * If the device is in error recovery, wait until it is done.
 	 * If the device is offline, then disallow any access to it.
@@ -1473,7 +1439,7 @@ static int sd_open(struct block_device *bdev, fmode_t mode)
 	return 0;
 
 error_out:
-	scsi_disk_put(sdkp);
+	scsi_device_put(sdev);
 	return retval;
 }
 
@@ -1502,7 +1468,7 @@ static void sd_release(struct gendisk *disk, fmode_t mode)
 		scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW);
 	}
 
-	scsi_disk_put(sdkp);
+	scsi_device_put(sdev);
 }
 
 static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -1616,7 +1582,7 @@ static int media_not_present(struct scsi_disk *sdkp,
  **/
 static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
 {
-	struct scsi_disk *sdkp = scsi_disk_get(disk);
+	struct scsi_disk *sdkp = disk->private_data;
 	struct scsi_device *sdp;
 	int retval;
 	bool disk_changed;
@@ -1679,7 +1645,6 @@ out:
 	 */
 	disk_changed = sdp->changed;
 	sdp->changed = 0;
-	scsi_disk_put(sdkp);
 	return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
@@ -1887,6 +1852,13 @@ static const struct pr_ops sd_pr_ops = {
 	.pr_clear = sd_pr_clear,
 };
 
+static void scsi_disk_free_disk(struct gendisk *disk)
+{
+	struct scsi_disk *sdkp = scsi_disk(disk);
+
+	put_device(&sdkp->disk_dev);
+}
+
 static const struct block_device_operations sd_fops = {
 	.owner = THIS_MODULE,
 	.open = sd_open,
@@ -1898,6 +1870,7 @@ static const struct block_device_operations sd_fops = {
 	.unlock_native_capacity = sd_unlock_native_capacity,
 	.report_zones = sd_zbc_report_zones,
 	.get_unique_id = sd_get_unique_id,
+	.free_disk = scsi_disk_free_disk,
 	.pr_ops = &sd_pr_ops,
 };
 
@@ -3623,54 +3596,23 @@ static int sd_probe(struct device *dev)
  **/
 static int sd_remove(struct device *dev)
{
-	struct scsi_disk *sdkp;
+	struct scsi_disk *sdkp = dev_get_drvdata(dev);
 
-	sdkp = dev_get_drvdata(dev);
 	scsi_autopm_get_device(sdkp->device);
 
 	device_del(&sdkp->disk_dev);
 	del_gendisk(sdkp->disk);
 	sd_shutdown(dev);
 
-	mutex_lock(&sd_ref_mutex);
-	dev_set_drvdata(dev, NULL);
-	put_device(&sdkp->disk_dev);
-	mutex_unlock(&sd_ref_mutex);
-
+	put_disk(sdkp->disk);
 	return 0;
 }
 
-/**
- * scsi_disk_release - Called to free the scsi_disk structure
- * @dev: pointer to embedded class device
- *
- * sd_ref_mutex must be held entering this routine.  Because it is
- * called on last put, you should always use the scsi_disk_get()
- * scsi_disk_put() helpers which manipulate the semaphore directly
- * and never do a direct put_device.
- **/
 static void scsi_disk_release(struct device *dev)
 {
 	struct scsi_disk *sdkp = to_scsi_disk(dev);
-	struct gendisk *disk = sdkp->disk;
-	struct request_queue *q = disk->queue;
 
 	ida_free(&sd_index_ida, sdkp->index);
-
-	/*
-	 * Wait until all requests that are in progress have completed.
-	 * This is necessary to avoid that e.g. scsi_end_request() crashes
-	 * due to clearing the disk->private_data pointer. Wait from inside
-	 * scsi_disk_release() instead of from sd_release() to avoid that
-	 * freezing and unfreezing the request queue affects user space I/O
-	 * in case multiple processes open a /dev/sd... node concurrently.
-	 */
-	blk_mq_freeze_queue(q);
-	blk_mq_unfreeze_queue(q);
-
-	disk->private_data = NULL;
-	put_disk(disk);
-
 	sd_zbc_release_disk(sdkp);
 	put_device(&sdkp->device->sdev_gendev);
 	free_opal_dev(sdkp->opal_dev);

From 01d0c698536fe920733fc6cab7f9740c7acfdece Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 8 Mar 2022 06:51:54 +0100
Subject: [PATCH 111/127] sr: implement ->free_disk to simplify refcounting

Simplify the refcounting and remove the need to clear disk->private_data
by implementing the ->free_disk method.

Signed-off-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Reviewed-by: Ming Lei
Reviewed-by: Martin K. Petersen
Link: https://lore.kernel.org/r/20220308055200.735835-9-hch@lst.de
Signed-off-by: Jens Axboe
---
 drivers/scsi/sr.c | 124 ++++++++++------------------------------------
 drivers/scsi/sr.h |   4 --
 2 files changed, 26 insertions(+), 102 deletions(-)

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 569bda76a517..11fbdc75bb71 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -109,11 +109,6 @@ static DEFINE_SPINLOCK(sr_index_lock);
 
 static struct lock_class_key sr_bio_compl_lkclass;
 
-/* This semaphore is used to mediate the 0->1 reference get in the
- * face of object destruction (i.e. we can't allow a get on an
- * object after last put) */
-static DEFINE_MUTEX(sr_ref_mutex);
-
 static int sr_open(struct cdrom_device_info *, int);
 static void sr_release(struct cdrom_device_info *);
 
@@ -143,8 +138,6 @@ static const struct cdrom_device_ops sr_dops = {
 	.capability = SR_CAPABILITIES,
 };
 
-static void sr_kref_release(struct kref *kref);
-
 static inline struct scsi_cd *scsi_cd(struct gendisk *disk)
 {
 	return disk->private_data;
 }
 
@@ -163,38 +156,6 @@ static int sr_runtime_suspend(struct device *dev)
 	return 0;
 }
 
-/*
- * The get and put routines for the struct scsi_cd. Note this entity
- * has a scsi_device pointer and owns a reference to this.
- */
-static inline struct scsi_cd *scsi_cd_get(struct gendisk *disk)
-{
-	struct scsi_cd *cd = NULL;
-
-	mutex_lock(&sr_ref_mutex);
-	if (disk->private_data == NULL)
-		goto out;
-	cd = scsi_cd(disk);
-	kref_get(&cd->kref);
-	if (scsi_device_get(cd->device)) {
-		kref_put(&cd->kref, sr_kref_release);
-		cd = NULL;
-	}
- out:
-	mutex_unlock(&sr_ref_mutex);
-	return cd;
-}
-
-static void scsi_cd_put(struct scsi_cd *cd)
-{
-	struct scsi_device *sdev = cd->device;
-
-	mutex_lock(&sr_ref_mutex);
-	kref_put(&cd->kref, sr_kref_release);
-	scsi_device_put(sdev);
-	mutex_unlock(&sr_ref_mutex);
-}
-
 static unsigned int sr_get_events(struct scsi_device *sdev)
 {
 	u8 buf[8];
@@ -522,15 +483,13 @@ static void sr_revalidate_disk(struct scsi_cd *cd)
 
 static int sr_block_open(struct block_device *bdev, fmode_t mode)
 {
-	struct scsi_cd *cd;
-	struct scsi_device *sdev;
+	struct scsi_cd *cd = cd = scsi_cd(bdev->bd_disk);
+	struct scsi_device *sdev = cd->device;
 	int ret = -ENXIO;
 
-	cd = scsi_cd_get(bdev->bd_disk);
-	if (!cd)
-		goto out;
+	if (scsi_device_get(cd->device))
+		return -ENXIO;
 
-	sdev = cd->device;
 	scsi_autopm_get_device(sdev);
 	if (bdev_check_media_change(bdev))
 		sr_revalidate_disk(cd);
@@ -541,9 +500,7 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode)
 	scsi_autopm_put_device(sdev);
 	if (ret)
-		scsi_cd_put(cd);
-
-out:
+		scsi_device_put(cd->device);
 	return ret;
 }
 
@@ -555,7 +512,7 @@ static void sr_block_release(struct gendisk *disk, fmode_t mode)
 	cdrom_release(&cd->cdi, mode);
 	mutex_unlock(&cd->lock);
 
-	scsi_cd_put(cd);
+	scsi_device_put(cd->device);
 }
 
 static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
@@ -595,18 +552,24 @@ out:
 static unsigned int sr_block_check_events(struct gendisk *disk,
 					  unsigned int clearing)
 {
-	unsigned int ret = 0;
-	struct scsi_cd *cd;
+	struct scsi_cd *cd = disk->private_data;
 
-	cd = scsi_cd_get(disk);
-	if (!cd)
+	if (atomic_read(&cd->device->disk_events_disable_depth))
 		return 0;
+	return cdrom_check_events(&cd->cdi, clearing);
+}
 
-	if (!atomic_read(&cd->device->disk_events_disable_depth))
-		ret = cdrom_check_events(&cd->cdi, clearing);
+static void sr_free_disk(struct gendisk *disk)
+{
+	struct scsi_cd *cd = disk->private_data;
 
-	scsi_cd_put(cd);
-	return ret;
+	spin_lock(&sr_index_lock);
+	clear_bit(MINOR(disk_devt(disk)), sr_index_bits);
+	spin_unlock(&sr_index_lock);
+
+	unregister_cdrom(&cd->cdi);
+	mutex_destroy(&cd->lock);
+	kfree(cd);
 }
 
 static const struct block_device_operations sr_bdops =
 {
 	.owner = THIS_MODULE,
 	.open = sr_block_open,
@@ -617,6 +580,7 @@ static const struct block_device_operations sr_bdops =
 	.ioctl = sr_block_ioctl,
 	.compat_ioctl = blkdev_compat_ptr_ioctl,
 	.check_events = sr_block_check_events,
+	.free_disk = sr_free_disk,
 };
 
 static int sr_open(struct cdrom_device_info *cdi, int purpose)
@@ -660,8 +624,6 @@ static int sr_probe(struct device *dev)
 	if (!cd)
 		goto fail;
 
-	kref_init(&cd->kref);
-
 	disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE,
 				 &sr_bio_compl_lkclass);
 	if (!disk)
 		goto fail;
@@ -727,10 +689,8 @@ static int sr_probe(struct device *dev)
 	sr_revalidate_disk(cd);
 
 	error = device_add_disk(&sdev->sdev_gendev, disk, NULL);
-	if (error) {
-		kref_put(&cd->kref, sr_kref_release);
-		goto fail;
-	}
+	if (error)
+		goto unregister_cdrom;
 
 	sdev_printk(KERN_DEBUG, sdev, "Attached scsi CD-ROM %s\n", cd->cdi.name);
@@ -738,6 +698,8 @@
 	return 0;
 
+unregister_cdrom:
+	unregister_cdrom(&cd->cdi);
 fail_minor:
 	spin_lock(&sr_index_lock);
 	clear_bit(minor, sr_index_bits);
@@ -1009,36 +971,6 @@ out_put_request:
 	return ret;
 }
-
-/**
- * sr_kref_release - Called to free the scsi_cd structure
- * @kref: pointer to embedded kref
- *
- * sr_ref_mutex must be held entering this routine. Because it is
- * called on last put, you should always use the scsi_cd_get()
- * scsi_cd_put() helpers which manipulate the semaphore directly
- * and never do a direct kref_put().
- **/
-static void sr_kref_release(struct kref *kref)
-{
-	struct scsi_cd *cd = container_of(kref, struct scsi_cd, kref);
-	struct gendisk *disk = cd->disk;
-
-	spin_lock(&sr_index_lock);
-	clear_bit(MINOR(disk_devt(disk)), sr_index_bits);
-	spin_unlock(&sr_index_lock);
-
-	unregister_cdrom(&cd->cdi);
-
-	disk->private_data = NULL;
-
-	put_disk(disk);
-
-	mutex_destroy(&cd->lock);
-
-	kfree(cd);
-}
-
 static int sr_remove(struct device *dev)
 {
 	struct scsi_cd *cd = dev_get_drvdata(dev);
@@ -1046,11 +978,7 @@
 	scsi_autopm_get_device(cd->device);
 
 	del_gendisk(cd->disk);
-	dev_set_drvdata(dev, NULL);
-
-	mutex_lock(&sr_ref_mutex);
-	kref_put(&cd->kref, sr_kref_release);
-	mutex_unlock(&sr_ref_mutex);
+	put_disk(cd->disk);
 
 	return 0;
 }

diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h
index d80af3fcb6f9..1175f2e213b5 100644
--- a/drivers/scsi/sr.h
+++ b/drivers/scsi/sr.h
@@ -18,7 +18,6 @@
 #ifndef _SR_H
 #define _SR_H
 
-#include
 #include
 
 #define MAX_RETRIES 3
@@ -51,9 +50,6 @@ typedef struct scsi_cd {
 	struct cdrom_device_info cdi;
 	struct mutex lock;
-	/* We hold gendisk and scsi_device references on probe and use
-	 * the refs on this kref to decide when to release them */
-	struct kref kref;
 
 	struct gendisk *disk;
 } Scsi_CD;

From 1059699f87eb0b3aa9d574b91a572d534897134a Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 8 Mar 2022 06:51:55 +0100
Subject: [PATCH 112/127] block: move blkcg initialization/destroy into disk
 allocation/release handler

blkcg works at the FS bio level, so it is reasonable to make blkcg and
the gendisk share the same lifetime. Meanwhile there won't be any FS I/O
while the disk is being released, so it is safe to move blkcg
initialization/destroy into the disk allocation/release handlers.

Long term, we can move blkcg into gendisk completely.
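The constructor/destructor pairing this depends on can be pictured with a
small self-contained userspace sketch. Everything below is illustrative
only (made-up names, not kernel APIs), but it has the same shape: each
step in the alloc path has a mirror image in the release path, and
constructor failures unwind in reverse order, just like the
out_erase_part0/out_destroy_part_tbl labels above:

  #include <stdio.h>
  #include <stdlib.h>

  struct fake_disk {
  	int *part_tbl;
  	int *cgroup_state;	/* stands in for the blkcg data */
  };

  static struct fake_disk *fake_disk_alloc(void)
  {
  	struct fake_disk *d = calloc(1, sizeof(*d));

  	if (!d)
  		return NULL;
  	d->part_tbl = calloc(16, sizeof(int));
  	if (!d->part_tbl)
  		goto out_free_disk;
  	d->cgroup_state = calloc(1, sizeof(int));
  	if (!d->cgroup_state)
  		goto out_free_part_tbl;	/* unwind in reverse order */
  	return d;

  out_free_part_tbl:
  	free(d->part_tbl);
  out_free_disk:
  	free(d);
  	return NULL;
  }

  static void fake_disk_release(struct fake_disk *d)
  {
  	/* the release handler mirrors the constructor exactly */
  	free(d->cgroup_state);
  	free(d->part_tbl);
  	free(d);
  }

  int main(void)
  {
  	struct fake_disk *d = fake_disk_alloc();

  	if (!d)
  		return 1;
  	fake_disk_release(d);
  	printf("alloc/release pairing ok\n");
  	return 0;
  }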
Signed-off-by: Ming Lei
Signed-off-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Reviewed-by: Chaitanya Kulkarni
Link: https://lore.kernel.org/r/20220308055200.735835-10-hch@lst.de
[axboe: fixup missing blk-cgroup.h include]
Signed-off-by: Jens Axboe
---
 block/blk-core.c  | 5 -----
 block/blk-sysfs.c | 7 -------
 block/genhd.c     | 9 +++++++++
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 4d858fc08f8b..3fa2f08d3750 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -496,17 +496,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
 		goto fail_stats;
 
-	if (blkcg_init_queue(q))
-		goto fail_ref;
-
 	blk_queue_dma_alignment(q, 511);
 	blk_set_default_limits(&q->limits);
 	q->nr_requests = BLKDEV_DEFAULT_RQ;
 
 	return q;
 
-fail_ref:
-	percpu_ref_exit(&q->q_usage_counter);
 fail_stats:
 	blk_free_queue_stats(q->stats);
 fail_split:

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 241ded62f458..220085109d7f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -751,13 +751,6 @@ static void blk_exit_queue(struct request_queue *q)
 		ioc_clear_queue(q);
 		elevator_exit(q);
 	}
-
-	/*
-	 * Remove all references to @q from the block cgroup controller before
-	 * restoring @q->queue_lock to avoid that restoring this pointer causes
-	 * e.g. blkcg_print_blkgs() to crash.
-	 */
-	blkcg_exit_queue(q);
 }
 
 /**

diff --git a/block/genhd.c b/block/genhd.c
index 54f60ded2ee6..fc10aedab209 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -29,6 +29,7 @@
 #include "blk.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
+#include "blk-cgroup.h"
 
 static struct kobject *block_depr;
 
@@ -1120,9 +1121,12 @@ static void disk_release(struct device *dev)
 
 	blk_mq_cancel_work_sync(disk->queue);
 
+	blkcg_exit_queue(disk->queue);
+
 	disk_release_events(disk);
 	kfree(disk->random);
 	xa_destroy(&disk->part_tbl);
+
 	disk->queue->disk = NULL;
 	blk_put_queue(disk->queue);
 
@@ -1328,6 +1332,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
 		goto out_destroy_part_tbl;
 
+	if (blkcg_init_queue(q))
+		goto out_erase_part0;
+
 	rand_initialize_disk(disk);
 	disk_to_dev(disk)->class = &block_class;
 	disk_to_dev(disk)->type = &disk_type;
@@ -1340,6 +1347,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 #endif
 	return disk;
 
+out_erase_part0:
+	xa_erase(&disk->part_tbl, 0);
 out_destroy_part_tbl:
 	xa_destroy(&disk->part_tbl);
 	disk->part0->bd_disk = NULL;

From de3d347f7b8a1a997fbc9267454ed6065068b969 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 8 Mar 2022 06:51:56 +0100
Subject: [PATCH 113/127] block: don't remove hctx debugfs dir from
 blk_mq_exit_queue

The queue's top debugfs dir is removed from blk_release_queue(), and all
of the hctx debugfs dirs are removed along with it. Given that
blk_mq_exit_queue() is only called from blk_cleanup_queue(), it isn't
necessary to remove the hctx debugfs dirs there as well. So remove that
removal from blk_mq_exit_queue().
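The reasoning here is the general "parent-owned teardown" rule: when
removing a parent already removes its children recursively, a second
child-by-child removal on another path is redundant. A rough userspace
analogue (illustrative code, not the kernel debugfs API):

  #include <stdio.h>
  #include <stdlib.h>

  struct dir {
  	struct dir *child;	/* single child chain, for brevity */
  };

  static void remove_recursive(struct dir *d)
  {
  	while (d) {
  		struct dir *next = d->child;

  		free(d);	/* removing a dir takes its children with it */
  		d = next;
  	}
  }

  int main(void)
  {
  	struct dir *root = calloc(1, sizeof(*root));

  	root->child = calloc(1, sizeof(*root));
  	/*
  	 * One call on the topmost dir is enough; a separate
  	 * remove of root->child on another teardown path would
  	 * duplicate work (and in this free() analogy, double free).
  	 */
  	remove_recursive(root);
  	printf("recursive teardown done\n");
  	return 0;
  }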
Signed-off-by: Ming Lei
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20220308055200.735835-11-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f7ef8e2ab935..862d91c6112e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3462,7 +3462,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (i == nr_queue)
 			break;
-		blk_mq_debugfs_unregister_hctx(hctx);
 		blk_mq_exit_hctx(q, set, hctx, i);
 	}
 }

From ba3e845665fbbb0252336f27200cd5cf288a3573 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 8 Mar 2022 06:51:57 +0100
Subject: [PATCH 114/127] block: move q_usage_counter release into
 blk_queue_release

After blk_cleanup_queue() returns, the disk may not be released yet, so
bios may still be submitted and ->q_usage_counter may still be touched.
So far this seems to be safe, but it is not good from the API's
viewpoint. Move the release of q_usage_counter into blk_release_queue().

Signed-off-by: Ming Lei
Reviewed-by: Bart Van Assche
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20220308055200.735835-12-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c  | 2 --
 block/blk-sysfs.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3fa2f08d3750..a97918d107a0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -342,8 +342,6 @@ void blk_cleanup_queue(struct request_queue *q)
 		blk_mq_sched_free_rqs(q);
 	mutex_unlock(&q->sysfs_lock);
 
-	percpu_ref_exit(&q->q_usage_counter);
-
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 220085109d7f..af5a6d86073f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -780,6 +780,8 @@ static void blk_release_queue(struct kobject *kobj)
 
 	might_sleep();
 
+	percpu_ref_exit(&q->q_usage_counter);
+
 	if (q->poll_stat)
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);

From 28ce942fa2d5d80af5367ba9d39f2e0b4af37bfd Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 8 Mar 2022 06:51:58 +0100
Subject: [PATCH 115/127] block: move blk_exit_queue into disk_release

There can't be any file system I/O in disk_release(), so move the call
to blk_exit_queue() there, preparing to have the teardown of the
file-system-I/O-only functionality in one place, when the gendisk that
is needed for it is torn down.

We still need to freeze the queue here, since requests are freed after
the bio is completed, and passthrough requests rely on scheduler tags as
well. The disk can be released before or after the queue is cleaned up,
and we have to free the scheduler request pool before blk_cleanup_queue()
returns, while the static request pool has to be freed before exiting the
I/O scheduler.
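The freeze/unfreeze bracket referred to here follows a generic drain
pattern: block new entries, wait for in-flight users to reach zero, do
the teardown, then lift the gate. A rough self-contained userspace
analogue (pthreads, made-up names, not the blk-mq API) might look like:

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
  static int frozen;
  static int in_flight;

  static int enter(void)		/* loosely like blk_queue_enter() */
  {
  	pthread_mutex_lock(&lock);
  	if (frozen) {
  		pthread_mutex_unlock(&lock);
  		return -1;		/* gate is closed, no new entries */
  	}
  	in_flight++;
  	pthread_mutex_unlock(&lock);
  	return 0;
  }

  static void exit_req(void)	/* loosely like blk_queue_exit() */
  {
  	pthread_mutex_lock(&lock);
  	if (--in_flight == 0)
  		pthread_cond_signal(&drained);
  	pthread_mutex_unlock(&lock);
  }

  static void freeze(void)	/* loosely like blk_mq_freeze_queue() */
  {
  	pthread_mutex_lock(&lock);
  	frozen = 1;
  	while (in_flight > 0)
  		pthread_cond_wait(&drained, &lock);
  	pthread_mutex_unlock(&lock);
  }

  int main(void)
  {
  	enter();
  	exit_req();
  	freeze();	/* safe: nothing is in flight anymore */
  	printf("teardown may proceed\n");
  	return 0;
  }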
Signed-off-by: Ming Lei
[hch: rebased, updated the commit log]
Signed-off-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Link: https://lore.kernel.org/r/20220308055200.735835-13-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-sysfs.c | 16 ----------------
 block/genhd.c     | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index af5a6d86073f..85c4ba006671 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -739,20 +739,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 	kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
 }
 
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
-static void blk_exit_queue(struct request_queue *q)
-{
-	/*
-	 * Since the I/O scheduler exit code may access cgroup information,
-	 * perform I/O scheduler exit before disassociating from the block
-	 * cgroup controller.
-	 */
-	if (q->elevator) {
-		ioc_clear_queue(q);
-		elevator_exit(q);
-	}
-}
-
 /**
  * blk_release_queue - releases all allocated resources of the request_queue
 * @kobj: pointer to a kobject, whose container is a request_queue
@@ -786,8 +772,6 @@ static void blk_release_queue(struct kobject *kobj)
 	blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
 
-	blk_exit_queue(q);
-
 	blk_free_queue_stats(q->stats);
 	kfree(q->poll_stat);

diff --git a/block/genhd.c b/block/genhd.c
index fc10aedab209..73705a749ea9 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1098,6 +1098,34 @@ static const struct attribute_group *disk_attr_groups[] = {
 	NULL
 };
 
+static void disk_release_mq(struct request_queue *q)
+{
+	blk_mq_cancel_work_sync(q);
+
+	/*
+	 * There can't be any non-passthrough bios in flight here, but
+	 * requests stay around longer, including passthrough ones so we
+	 * still need to freeze the queue here.
+	 */
+	blk_mq_freeze_queue(q);
+
+	/*
+	 * Since the I/O scheduler exit code may access cgroup information,
+	 * perform I/O scheduler exit before disassociating from the block
+	 * cgroup controller.
+	 */
+	if (q->elevator) {
+		ioc_clear_queue(q);
+
+		mutex_lock(&q->sysfs_lock);
+		blk_mq_sched_free_rqs(q);
+		elevator_exit(q);
+		mutex_unlock(&q->sysfs_lock);
+	}
+
+	__blk_mq_unfreeze_queue(q, true);
+}
+
 /**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
@@ -1119,7 +1147,8 @@ static void disk_release(struct device *dev)
 	might_sleep();
 	WARN_ON_ONCE(disk_live(disk));
 
-	blk_mq_cancel_work_sync(disk->queue);
+	if (queue_is_mq(disk->queue))
+		disk_release_mq(disk->queue);
 
 	blkcg_exit_queue(disk->queue);

From 28883074fc104f16ab181207d556ab75df6c6526 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 8 Mar 2022 06:51:59 +0100
Subject: [PATCH 116/127] block: do more work in elevator_exit

Move the calls to ioc_clear_queue and blk_mq_sched_free_rqs into
elevator_exit. Except for one call, where we know we can't have io_cq
structures yet, these always go together, and the extra call in that
error path is harmless.
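The shape of this change is the classic consolidation of paired teardown
calls: when two helpers must always run together in a fixed order,
folding them into the single exit function keeps call sites from
forgetting one. A trivial sketch with illustrative names only:

  #include <stdio.h>

  static void clear_io_contexts(void)   { printf("io contexts cleared\n"); }
  static void free_sched_requests(void) { printf("sched requests freed\n"); }
  static void exit_sched(void)          { printf("scheduler exited\n"); }

  /* after the change: one function owns the whole ordered sequence */
  static void elevator_exit_like(void)
  {
  	clear_io_contexts();
  	free_sched_requests();
  	exit_sched();
  }

  int main(void)
  {
  	/* every caller now gets the full sequence, in the right order */
  	elevator_exit_like();
  	return 0;
  }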
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20220308055200.735835-14-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/elevator.c | 6 +++---
 block/genhd.c    | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index a842e4b8ebc6..20a4e7c7c774 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -192,6 +192,9 @@ void elevator_exit(struct request_queue *q)
 {
 	struct elevator_queue *e = q->elevator;
 
+	ioc_clear_queue(q);
+	blk_mq_sched_free_rqs(q);
+
 	mutex_lock(&e->sysfs_lock);
 	blk_mq_exit_sched(q, e);
 	mutex_unlock(&e->sysfs_lock);
@@ -596,8 +599,6 @@ int elevator_switch_mq(struct request_queue *q,
 
 	if (q->elevator) {
 		elv_unregister_queue(q);
-		ioc_clear_queue(q);
-		blk_mq_sched_free_rqs(q);
 		elevator_exit(q);
 	}
 
@@ -608,7 +609,6 @@ int elevator_switch_mq(struct request_queue *q,
 	if (new_e) {
 		ret = elv_register_queue(q, true);
 		if (ret) {
-			blk_mq_sched_free_rqs(q);
 			elevator_exit(q);
 			goto out;
 		}

diff --git a/block/genhd.c b/block/genhd.c
index 73705a749ea9..857e0a54da7d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1115,10 +1115,7 @@ static void disk_release_mq(struct request_queue *q)
 	 * cgroup controller.
 	 */
 	if (q->elevator) {
-		ioc_clear_queue(q);
-
 		mutex_lock(&q->sysfs_lock);
-		blk_mq_sched_free_rqs(q);
 		elevator_exit(q);
 		mutex_unlock(&q->sysfs_lock);
 	}

From 5ca7546fe317bd181ad55f623f6e522069c2e426 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 8 Mar 2022 06:52:00 +0100
Subject: [PATCH 117/127] block: move rq_qos_exit() into disk_release()

Keep all teardown of file system I/O related functionality in one place.
There can't be file system I/O in disk_release(), so it is safe to move
rq_qos_exit() there.

Signed-off-by: Ming Lei
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20220308055200.735835-15-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/genhd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 857e0a54da7d..56f66c6fee94 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -627,7 +627,6 @@ void del_gendisk(struct gendisk *disk)
 
 	blk_mq_freeze_queue_wait(q);
 
-	rq_qos_exit(q);
 	blk_sync_queue(q);
 	blk_flush_integrity();
 	/*
@@ -1119,7 +1118,7 @@ static void disk_release_mq(struct request_queue *q)
 		elevator_exit(q);
 		mutex_unlock(&q->sysfs_lock);
 	}
-
+	rq_qos_exit(q);
 	__blk_mq_unfreeze_queue(q, true);
 }

From 5b2050718d095cd3242d1f42aaaea3a2fec8e6f0 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 11 Mar 2022 10:21:43 -0700
Subject: [PATCH 118/127] block: ensure plug merging checks the correct queue
 at least once

Song reports that a RAID rebuild workload has recently been running much
slower, and that it sees a lot less merging than it did previously. The
reason is that a previous commit reduced the amount of work we do for
plug merging. A RAID rebuild interleaves requests between disks, so a
last-entry check in plug merging always misses a merge opportunity,
since we always find a different disk than the one we are looking for.

Modify the logic such that it's still a one-hit cache, but ensure that
we check enough entries to find the right target before giving up.
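The reworked check can be pictured with a small self-contained sketch.
This is not the kernel code: a plain singly linked list stands in for
plug->mq_list and the types are made up. But the control flow mirrors
the hunk below: walk the plugged requests, stop at the first entry for
the right queue, and bail out after one entry when only a single queue
is plugged:

  #include <stdbool.h>
  #include <stdio.h>

  struct rq {
  	int q;			/* queue identifier */
  	struct rq *next;
  };

  static bool find_merge_candidate(struct rq *list, int q,
  				 bool multiple_queues)
  {
  	for (struct rq *rq = list; rq; rq = rq->next) {
  		if (rq->q == q)
  			return true;	/* one-hit cache: try only this entry */
  		if (!multiple_queues)
  			break;		/* single queue: keep the old fast path */
  	}
  	return false;
  }

  int main(void)
  {
  	struct rq c = { .q = 2, .next = NULL };
  	struct rq b = { .q = 1, .next = &c };
  	struct rq a = { .q = 0, .next = &b };

  	/* interleaved queues: a last-entry-only check would miss this */
  	printf("merge candidate for q2: %s\n",
  	       find_merge_candidate(&a, 2, true) ? "found" : "missed");
  	return 0;
  }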
Fixes: d38a9c04c0d5 ("block: only check previous entry for plug merge attempt")
Reported-and-tested-by: Song Liu
Reviewed-by: Song Liu
Signed-off-by: Jens Axboe
---
 block/blk-merge.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index f5255991b773..8d8177f71ebd 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -1087,12 +1087,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	if (!plug || rq_list_empty(plug->mq_list))
 		return false;
 
-	/* check the previously added entry for a quick merge attempt */
-	rq = rq_list_peek(&plug->mq_list);
-	if (rq->q == q) {
-		if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
-		    BIO_MERGE_OK)
-			return true;
+	rq_list_for_each(&plug->mq_list, rq) {
+		if (rq->q == q) {
+			if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+			    BIO_MERGE_OK)
+				return true;
+			break;
+		}
+
+		/*
+		 * Only keep iterating plug list for merges if we have multiple
+		 * queues
+		 */
+		if (!plug->multiple_queues)
+			break;
 	}
 	return false;
 }

From 26fed4ac4eab09c27fbae1859696cc38f0536407 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 11 Mar 2022 10:24:17 -0700
Subject: [PATCH 119/127] block: flush plug based on hardware and software
 queue order

We used to sort the plug list if we had multiple queues before
dispatching requests to the IO scheduler. This usually isn't needed, but
for certain workloads that interleave requests to disks, it's less
efficient to process the plug list one-by-one if everything is
interleaved.

Don't sort the list, but skip through it and flush out entries that have
the same target at the same time.

Fixes: df87eb0fce8f ("block: get rid of plug list sorting")
Reported-and-tested-by: Song Liu
Reviewed-by: Song Liu
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 59 ++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 862d91c6112e..213bb5979bed 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2573,13 +2573,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
 	q->mq_ops->queue_rqs(&plug->mq_list);
 }
 
+static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+{
+	struct blk_mq_hw_ctx *this_hctx = NULL;
+	struct blk_mq_ctx *this_ctx = NULL;
+	struct request *requeue_list = NULL;
+	unsigned int depth = 0;
+	LIST_HEAD(list);
+
+	do {
+		struct request *rq = rq_list_pop(&plug->mq_list);
+
+		if (!this_hctx) {
+			this_hctx = rq->mq_hctx;
+			this_ctx = rq->mq_ctx;
+		} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+			rq_list_add(&requeue_list, rq);
+			continue;
+		}
+		list_add_tail(&rq->queuelist, &list);
+		depth++;
+	} while (!rq_list_empty(plug->mq_list));
+
+	plug->mq_list = requeue_list;
+	trace_block_unplug(this_hctx->queue, depth, !from_sched);
+	blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched);
+}
+
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
-	struct blk_mq_hw_ctx *this_hctx;
-	struct blk_mq_ctx *this_ctx;
 	struct request *rq;
-	unsigned int depth;
-	LIST_HEAD(list);
 
 	if (rq_list_empty(plug->mq_list))
 		return;
@@ -2615,35 +2638,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 			return;
 	}
 
-	this_hctx = NULL;
-	this_ctx = NULL;
-	depth = 0;
 	do {
-		rq = rq_list_pop(&plug->mq_list);
-
-		if (!this_hctx) {
-			this_hctx = rq->mq_hctx;
-			this_ctx = rq->mq_ctx;
-		} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
-			trace_block_unplug(this_hctx->queue, depth,
-					   !from_schedule);
-			blk_mq_sched_insert_requests(this_hctx, this_ctx,
-						     &list, from_schedule);
-			depth = 0;
-			this_hctx = rq->mq_hctx;
-			this_ctx = rq->mq_ctx;
-
-		}
-
-		list_add(&rq->queuelist, &list);
-		depth++;
+		blk_mq_dispatch_plug_list(plug, from_schedule);
 	} while (!rq_list_empty(plug->mq_list));
-
-	if (!list_empty(&list)) {
-		trace_block_unplug(this_hctx->queue, depth, !from_schedule);
-		blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
-					     from_schedule);
-	}
 }
 
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,

From aa1b46dcdc7baaf5fec0be25782ef24b26aa209e Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sun, 13 Mar 2022 21:15:02 -1000
Subject: [PATCH 120/127] block: fix rq-qos breakage from skipping
 rq_qos_done_bio()

a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't
tracked") made bio_endio() skip rq_qos_done_bio() if BIO_TRACKED is not
set. While this fixed a potential oops, it also broke blk-iocost by
skipping the done_bio callback for merged bios.

Before, whether a bio goes through rq_qos_throttle() or rq_qos_merge(),
rq_qos_done_bio() would be called on the bio on completion with
BIO_TRACKED distinguishing the former from the latter. Now,
rq_qos_done_bio() is not called for bios which went through
rq_qos_merge(). This royally confuses blk-iocost as the merged bios
never finish and are considered perpetually in-flight.

One reliably reproducible failure mode is an intermediate cgroup getting
stuck active, preventing its children from being activated due to the
leaf-only rule, leading to loss of control. The following is from
resctl-bench protection scenario which emulates isolating a web server
like workload from a memory bomb run on an iocost configuration which
should yield a reasonable level of protection.

  # cat /sys/block/nvme2n1/device/model
  Samsung SSD 970 PRO 512GB
  # cat /sys/fs/cgroup/io.cost.model
  259:0 ctrl=user model=linear rbps=834913556 rseqiops=93622 rrandiops=102913 wbps=618985353 wseqiops=72325 wrandiops=71025
  # cat /sys/fs/cgroup/io.cost.qos
  259:0 enable=1 ctrl=user rpct=95.00 rlat=18776 wpct=95.00 wlat=8897 min=60.00 max=100.00
  # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1
  ...
  Memory Hog Summary
  ==================
  IO Latency: R p50=242u:336u/2.5m p90=794u:1.4m/7.5m p99=2.7m:8.0m/62.5m max=8.0m:36.4m/350m
              W p50=221u:323u/1.5m p90=709u:1.2m/5.5m p99=1.5m:2.5m/9.5m max=6.9m:35.9m/350m

  Isolation and Request Latency Impact Distributions:

                min   p01   p05   p10   p25   p50   p75   p90   p95   p99   max  mean stdev
  isol%       15.90 15.90 15.90 40.05 57.24 59.07 60.01 74.63 74.63 90.35 90.35 58.12 15.82
  lat-imp%        0     0     0     0     0  4.55 14.68 15.54 233.5 548.1 548.1 53.88 143.6

  Result: isol=58.12:15.82% lat_imp=53.88%:143.6 work_csv=100.0% missing=3.96%

The isolation result of 58.12% is close to what this device would show
without any IO control.

Fix it by introducing a new flag BIO_QOS_MERGED to mark merged bios and
calling rq_qos_done_bio() on them too. For consistency and clarity,
rename BIO_TRACKED to BIO_QOS_THROTTLED. The flag checks are moved into
rq_qos_done_bio() so that it's next to the code paths that set the
flags.

With the patch applied, the above same benchmark shows:

  # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1
  ...
  Memory Hog Summary
  ==================
  IO Latency: R p50=123u:84.4u/985u p90=322u:256u/2.5m p99=1.6m:1.4m/9.5m max=11.1m:36.0m/350m
              W p50=429u:274u/995u p90=1.7m:1.3m/4.5m p99=3.4m:2.7m/11.5m max=7.9m:5.9m/26.5m

  Isolation and Request Latency Impact Distributions:

                min   p01   p05   p10   p25   p50   p75   p90   p95   p99   max  mean stdev
  isol%       84.91 84.91 89.51 90.73 92.31 94.49 96.36 98.04 98.71 100.0 100.0 94.42  2.81
  lat-imp%        0     0     0     0     0  2.81  5.73 11.11 13.92 17.53 22.61  4.10  4.68

  Result: isol=94.42:2.81% lat_imp=4.10%:4.68 work_csv=58.34% missing=0%

Signed-off-by: Tejun Heo
Fixes: a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked")
Cc: stable@vger.kernel.org # v5.15+
Cc: Ming Lei
Cc: Yu Kuai
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/Yi7rdrzQEHjJLGKB@slm.duckdns.org
Signed-off-by: Jens Axboe
---
 block/bio.c               |  3 +--
 block/blk-iolatency.c     |  2 +-
 block/blk-rq-qos.h        | 20 +++++++++++---------
 include/linux/blk_types.h |  3 ++-
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 151cace2dbe1..33979f306e9e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1516,8 +1516,7 @@ again:
 	if (!bio_integrity_endio(bio))
 		return;
 
-	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
-		rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
+	rq_qos_done_bio(bio);
 
 	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
 		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 010e658d44a8..2f33932e72e3 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 	int inflight = 0;
 
 	blkg = bio->bi_blkg;
-	if (!blkg || !bio_flagged(bio, BIO_TRACKED))
+	if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
 		return;
 
 	iolat = blkg_to_lat(bio->bi_blkg);

diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 3cfbc8668cba..68267007da1c 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
 		__rq_qos_requeue(q->rq_qos, rq);
 }
 
-static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+static inline void rq_qos_done_bio(struct bio *bio)
 {
-	if (q->rq_qos)
-		__rq_qos_done_bio(q->rq_qos, bio);
+	if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+			     bio_flagged(bio, BIO_QOS_MERGED))) {
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+		if (q->rq_qos)
+			__rq_qos_done_bio(q->rq_qos, bio);
+	}
 }
 
 static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
 {
-	/*
-	 * BIO_TRACKED lets controllers know that a bio went through the
-	 * normal rq_qos path.
-	 */
 	if (q->rq_qos) {
-		bio_set_flag(bio, BIO_TRACKED);
+		bio_set_flag(bio, BIO_QOS_THROTTLED);
 		__rq_qos_throttle(q->rq_qos, bio);
 	}
 }
@@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
 static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
 				struct bio *bio)
 {
-	if (q->rq_qos)
+	if (q->rq_qos) {
+		bio_set_flag(bio, BIO_QOS_MERGED);
 		__rq_qos_merge(q->rq_qos, rq, bio);
+	}
 }
 
 static inline void rq_qos_queue_depth_changed(struct request_queue *q)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 5561e58d158a..0c3563b45fe9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -324,7 +324,8 @@ enum {
 	BIO_TRACE_COMPLETION,	/* bio_endio() should trace the final completion
 				 * of this bio. */
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
-	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
+	BIO_QOS_THROTTLED,	/* bio went through rq_qos throttle path */
+	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
 	BIO_PERCPU_CACHE,	/* can participate in per-cpu alloc cache */

From 6b2b04590b51aa4cf395fcd185ce439cab5961dc Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 14 Mar 2022 14:30:11 -1000
Subject: [PATCH 121/127] block: don't merge across cgroup boundaries if blkcg
 is enabled

blk-iocost and iolatency are cgroup aware rq-qos policies but they didn't
disable merges across different cgroups. This obviously can lead to
accounting and control errors but more importantly to priority inversions
- e.g. an IO which belongs to a higher priority cgroup or IO class may
end up getting throttled incorrectly because it gets merged to an IO
issued from a low priority cgroup.

Fix it by adding blk_cgroup_mergeable() which is called from merge paths
and rejects cross-cgroup and cross-issue_as_root merges.

Signed-off-by: Tejun Heo
Fixes: d70675121546 ("block: introduce blk-iolatency io controller")
Cc: stable@vger.kernel.org # v4.19+
Cc: Josef Bacik
Link: https://lore.kernel.org/r/Yi/eE/6zFNyWJ+qd@slm.duckdns.org
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.h | 17 +++++++++++++++++
 block/blk-merge.c  | 11 +++++++++++
 2 files changed, 28 insertions(+)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 3e91803c4a55..47e1e38390c9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,7 @@
  */
 
 #include
+#include
 
 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH	(INT_MAX / 2)
 
@@ -428,6 +429,21 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
 		atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
 }
 
+/**
+ * blk_cgroup_mergeable - Determine whether to allow or disallow merges
+ * @rq: request to merge into
+ * @bio: bio to merge
+ *
+ * @bio and @rq should belong to the same cgroup and their issue_as_root should
+ * match. The latter is necessary as we don't want to throttle e.g. a metadata
+ * update because it happens to be next to a regular IO.
+ */
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
+{
+	return rq->bio->bi_blkg == bio->bi_blkg &&
+		bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
+}
+
 void blk_cgroup_bio_start(struct bio *bio);
 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
 #else /* CONFIG_BLK_CGROUP */
@@ -467,6 +483,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) { }
 static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline void blk_cgroup_bio_start(struct bio *bio) { }
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
 
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8d8177f71ebd..ea6968313b4a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 
@@ -598,6 +599,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
 static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
 		unsigned int nr_phys_segs)
 {
+	if (!blk_cgroup_mergeable(req, bio))
+		goto no_merge;
+
 	if (blk_integrity_merge_bio(req->q, req, bio) == false)
 		goto no_merge;
 
@@ -694,6 +698,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (total_phys_segments > blk_rq_get_max_segments(req))
 		return 0;
 
+	if (!blk_cgroup_mergeable(req, next->bio))
+		return 0;
+
 	if (blk_integrity_merge_rq(q, req, next) == false)
 		return 0;
 
@@ -902,6 +909,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (bio_data_dir(bio) != rq_data_dir(rq))
 		return false;
 
+	/* don't merge across cgroup boundaries */
+	if (!blk_cgroup_mergeable(rq, bio))
+		return false;
+
 	/* only merge integrity protected bio into ditto rq */
 	if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
 		return false;

From 79d45f57a19537a1ec6ebf836944e968b154f86e Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn
Date: Mon, 14 Mar 2022 16:03:21 +0100
Subject: [PATCH 122/127] sr: simplify the local variable initialization in
 sr_block_open()

Commit 01d0c698536f ("sr: implement ->free_disk to simplify refcounting")
refactored sr_block_open(), initialized one variable with a duplicate
assignment (probably an unintended copy & paste duplication) and turned
one error case into an early return, which makes the initialization of
the return variable needless.

So, simplify the local variable initialization in sr_block_open() to
make the code a bit more clear.

No functional change. No change in resulting object code.
Signed-off-by: Lukas Bulwahn
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20220314150321.17720-1-lukas.bulwahn@gmail.com
Signed-off-by: Jens Axboe
---
 drivers/scsi/sr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 11fbdc75bb71..641552d6330b 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -483,9 +483,9 @@ static void sr_revalidate_disk(struct scsi_cd *cd)
 
 static int sr_block_open(struct block_device *bdev, fmode_t mode)
 {
-	struct scsi_cd *cd = cd = scsi_cd(bdev->bd_disk);
+	struct scsi_cd *cd = scsi_cd(bdev->bd_disk);
 	struct scsi_device *sdev = cd->device;
-	int ret = -ENXIO;
+	int ret;
 
 	if (scsi_device_get(cd->device))
 		return -ENXIO;

From 8ef22dc4a7cbef5870e612edfbf9cd058ddaaa50 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 15 Mar 2022 22:15:39 +0000
Subject: [PATCH 123/127] block/bfq-iosched: Fix spelling mistake "tenative"
 -> "tentative"

There is a spelling mistake in a bfq_log_bfqq message. Fix it.

Signed-off-by: Colin Ian King
Reviewed-by: Chaitanya Kulkarni
Link: https://lore.kernel.org/r/20220315221539.2959167-1-colin.i.king@gmail.com
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 975fd30a07c9..3376ad03823e 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2153,7 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfqq->waker_detection_started = now_ns;
 		bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
 			      MAX_BFQQ_NAME_LENGTH);
-		bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name);
+		bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
 	} else /* Same tentative waker queue detected again */
 		bfqq->num_waker_detections++;

From 572299f03afd676dd4e20669cdaf5ed0fe1379d4 Mon Sep 17 00:00:00 2001
From: Shin'ichiro Kawasaki
Date: Fri, 18 Mar 2022 11:26:41 +0900
Subject: [PATCH 124/127] block: limit request dispatch loop duration

When IO requests are made continuously and the target block device
handles requests faster than they arrive, the request dispatch loop
keeps on repeating to dispatch the arriving requests for a very long
time, more than a minute. Since the loop runs as a workqueue worker
task, the very long loop duration triggers a workqueue watchdog timeout
and BUG [1].

To avoid the very long loop duration, break the loop periodically. While
the opportunity to dispatch requests still exists, check need_resched().
If it returns true, the dispatch loop has already consumed its time
slice; reschedule the dispatch work and break the loop. With heavy IO
load, need_resched() does not return true for 20~30 seconds. To cover
such a case, also check the time spent in the dispatch loop with
jiffies: if more than 1 second is spent, reschedule the dispatch work
and break the loop.

[1]

[ 609.691437] BUG: workqueue lockup - pool cpus=10 node=1 flags=0x0 nice=-20 stuck for 35s!
[ 609.701820] Showing busy workqueues and worker pools:
[ 609.707915] workqueue events: flags=0x0
[ 609.712615]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[ 609.712626]     pending: drm_fb_helper_damage_work [drm_kms_helper]
[ 609.712687] workqueue events_freezable: flags=0x4
[ 609.732943]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[ 609.732952]     pending: pci_pme_list_scan
[ 609.732968] workqueue events_power_efficient: flags=0x80
[ 609.751947]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[ 609.751955]     pending: neigh_managed_work
[ 609.752018] workqueue kblockd: flags=0x18
[ 609.769480]   pwq 21: cpus=10 node=1 flags=0x0 nice=-20 active=3/256 refcnt=4
[ 609.769488]     in-flight: 1020:blk_mq_run_work_fn
[ 609.769498]     pending: blk_mq_timeout_work, blk_mq_run_work_fn
[ 609.769744] pool 21: cpus=10 node=1 flags=0x0 nice=-20 hung=35s workers=2 idle: 67
[ 639.899730] BUG: workqueue lockup - pool cpus=10 node=1 flags=0x0 nice=-20 stuck for 66s!
[ 639.909513] Showing busy workqueues and worker pools:
[ 639.915404] workqueue events: flags=0x0
[ 639.920197]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[ 639.920215]     pending: drm_fb_helper_damage_work [drm_kms_helper]
[ 639.920365] workqueue kblockd: flags=0x18
[ 639.939932]   pwq 21: cpus=10 node=1 flags=0x0 nice=-20 active=3/256 refcnt=4
[ 639.939942]     in-flight: 1020:blk_mq_run_work_fn
[ 639.939955]     pending: blk_mq_timeout_work, blk_mq_run_work_fn
[ 639.940212] pool 21: cpus=10 node=1 flags=0x0 nice=-20 hung=66s workers=2 idle: 67

Fixes: 6e6fcbc27e778 ("blk-mq: support batching dispatch in case of io")
Signed-off-by: Shin'ichiro Kawasaki
Cc: stable@vger.kernel.org # v5.10+
Link: https://lore.kernel.org/linux-block/20220310091649.zypaem5lkyfadymg@shindev/
Link: https://lore.kernel.org/r/20220318022641.133484-1-shinichiro.kawasaki@wdc.com
Signed-off-by: Jens Axboe
---
 block/blk-mq-sched.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index e6ad8f761474..9e56a69422b6 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -180,11 +180,18 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 
 static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
+	unsigned long end = jiffies + HZ;
 	int ret;
 
 	do {
 		ret = __blk_mq_do_dispatch_sched(hctx);
-	} while (ret == 1);
+		if (ret != 1)
+			break;
+		if (need_resched() || time_is_before_jiffies(end)) {
+			blk_mq_delay_run_hw_queue(hctx, 0);
+			break;
+		}
+	} while (1);
 
 	return ret;
 }

From ee37eddbfa9e0401f13a01691cf4bbbacd2d16c9 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 18 Mar 2022 21:01:42 +0800
Subject: [PATCH 125/127] block: avoid use-after-free on throttle data

In throtl_pending_timer_fn(), the request queue is retrieved from the
throttle data, and the tg's pending timer is deleted synchronously when
releasing the associated blkg. At that time the throttle data may
already have been freed, since commit 1059699f87eb ("block: move blkcg
initialization/destroy into disk allocation/release handler") moved the
freeing of q->td from blk_release_queue() to disk_release(). So a
use-after-free on q->td in throtl_pending_timer_fn() can be triggered.

Fix the issue by:

- doing nothing in case the disk is released, when there isn't any bio
  to dispatch

- retrieving the request queue from the blkg instead of from the
  throttle data for the non-top-level pending timer
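The second point is an instance of a general rule for timer callbacks:
don't cache a pointer to an object that can die before the timer does;
re-derive it from a parent with a longer lifetime. A hedged userspace
sketch (all types below are made up for illustration):

  #include <stdio.h>

  struct queue { int alive; };

  struct group {			/* stands in for the blkg */
  	struct queue *q;	/* owned reference, outlives the timer */
  };

  struct timer_ctx {
  	struct group *grp;	/* may be NULL for the top-level timer */
  	struct queue *cached_q;	/* the dangling-prone shortcut */
  };

  static struct queue *timer_get_queue(struct timer_ctx *t)
  {
  	/* prefer the parent-derived pointer over the cached one */
  	if (t->grp)
  		return t->grp->q;
  	return t->cached_q;
  }

  int main(void)
  {
  	struct queue q = { .alive = 1 };
  	struct group g = { .q = &q };
  	struct timer_ctx t = { .grp = &g, .cached_q = NULL };

  	printf("queue alive: %d\n", timer_get_queue(&t)->alive);
  	return 0;
  }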
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20220318130144.1066064-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-throttle.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a3b3ebc72dd4..880701d0106f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1137,12 +1137,22 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 	struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
 	struct throtl_grp *tg = sq_to_tg(sq);
 	struct throtl_data *td = sq_to_td(sq);
-	struct request_queue *q = td->queue;
 	struct throtl_service_queue *parent_sq;
+	struct request_queue *q;
 	bool dispatched;
 	int ret;
 
+	/* throtl_data may be gone, so figure out request queue by blkg */
+	if (tg)
+		q = tg->pd.blkg->q;
+	else
+		q = td->queue;
+
 	spin_lock_irq(&q->queue_lock);
+
+	if (!q->root_blkg)
+		goto out_unlock;
+
 	if (throtl_can_upgrade(td, NULL))
 		throtl_upgrade_state(td);

From 0a9a25ca78437b39e691bcc3dc8240455b803d8d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 18 Mar 2022 21:01:43 +0800
Subject: [PATCH 126/127] block: let blkcg_gq grab request queue's refcnt

Throughout the whole lifetime of a blkcg_gq instance, ->q is referred
to: for example, ->pd_free_fn() is called in blkg_free(), and
throtl_pd_free() may still touch the request queue via
&tg->service_queue.pending_timer, which is handled by
throtl_pending_timer_fn(). So it is reasonable for the blkcg_gq
instance to grab a reference on the request queue.

Previously blkcg_exit_queue() was called from blk_release_queue(), and
it was hard to avoid the use-after-free. But now that commit
1059699f87eb ("block: move blkcg initialization/destroy into disk
allocation/release handler") has been merged into for-5.18/block, it
becomes simple to fix the issue by grabbing the request queue's refcnt.

Reported-by: Christoph Hellwig
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20220318130144.1066064-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fa063c6c0338..d53b0d69dd73 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -82,6 +82,8 @@ static void blkg_free(struct blkcg_gq *blkg)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
+	if (blkg->q)
+		blk_put_queue(blkg->q);
 	free_percpu(blkg->iostat_cpu);
 	percpu_ref_exit(&blkg->refcnt);
 	kfree(blkg);
@@ -167,6 +169,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	if (!blkg->iostat_cpu)
 		goto err_free;
 
+	if (!blk_get_queue(q))
+		goto err_free;
+
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
 	spin_lock_init(&blkg->async_bio_lock);

From 8f9e7b65f833cb9a4b2e2f54a049d74df394d906 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Fri, 18 Mar 2022 21:01:44 +0800
Subject: [PATCH 127/127] block: cancel all throttled bios in del_gendisk()

Throttled bios can't be issued after del_gendisk() is done, so it is
better to cancel them immediately rather than waiting until throttling
is done. For example, if a user thread is throttled at a low bps while
it is issuing large IO and the device is deleted, the user thread will
wait a long time for the IO to return.
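The cancellation mechanism can be sketched in a few lines of
self-contained C: a CANCELING flag makes the dispatch check behave
exactly as if no limit were configured, so everything queued drains
immediately. Names and constants below are illustrative, not the
kernel's:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define TG_CANCELING	(1 << 3)

  struct group {
  	unsigned int flags;
  	uint64_t bps_limit;	/* UINT64_MAX means unlimited */
  	uint64_t wait_ms;	/* computed throttling delay */
  };

  static bool may_dispatch(struct group *tg, uint64_t *wait_ms)
  {
  	/* canceling is treated exactly like "no limit configured" */
  	if (tg->bps_limit == UINT64_MAX || (tg->flags & TG_CANCELING)) {
  		*wait_ms = 0;
  		return true;
  	}
  	*wait_ms = tg->wait_ms;
  	return false;
  }

  int main(void)
  {
  	struct group tg = { .bps_limit = 1024, .wait_ms = 5000 };
  	uint64_t wait;

  	printf("before cancel: may_dispatch=%d\n", may_dispatch(&tg, &wait));
  	tg.flags |= TG_CANCELING;	/* what del_gendisk() triggers */
  	printf("after cancel:  may_dispatch=%d (wait=%llu)\n",
  	       may_dispatch(&tg, &wait), (unsigned long long)wait);
  	return 0;
  }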
Signed-off-by: Yu Kuai
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20220318130144.1066064-4-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-throttle.c | 36 +++++++++++++++++++++++++++++++++++-
 block/blk-throttle.h |  3 +++
 block/genhd.c        |  3 +++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 880701d0106f..469c483719be 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -874,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 
 	/* If tg->bps = -1, then BW is unlimited */
-	if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
+	if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
+	    tg->flags & THROTL_TG_CANCELING) {
 		if (wait)
 			*wait = 0;
 		return true;
 	}
@@ -1776,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
 	return false;
 }
 
+void blk_throtl_cancel_bios(struct request_queue *q)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+
+	spin_lock_irq(&q->queue_lock);
+	/*
+	 * queue_lock is held, rcu lock is not needed here technically.
+	 * However, rcu lock is still held to emphasize that following
+	 * path need RCU protection and to prevent warning from lockdep.
+	 */
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+		struct throtl_service_queue *sq = &tg->service_queue;
+
+		/*
+		 * Set the flag to make sure throtl_pending_timer_fn() won't
+		 * stop until all throttled bios are dispatched.
+		 */
+		blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
+		/*
+		 * Update disptime after setting the above flag to make sure
+		 * throtl_select_dispatch() won't exit without dispatching.
+		 */
+		tg_update_disptime(tg);
+
+		throtl_schedule_pending_timer(sq, jiffies + 1);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&q->queue_lock);
+}
+
 static bool throtl_can_upgrade(struct throtl_data *td,
 			       struct throtl_grp *this_tg)
 {

diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index b23a9f3abb82..c1b602996127 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -56,6 +56,7 @@ enum tg_state_flags {
 	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
 	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
 	THROTL_TG_HAS_IOPS_LIMIT = 1 << 2,	/* tg has iops limit */
+	THROTL_TG_CANCELING	= 1 << 3,	/* starts to cancel bio */
 };
 
 enum {
@@ -162,11 +163,13 @@ static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
 static inline void blk_throtl_register_queue(struct request_queue *q) { }
 static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+static inline void blk_throtl_cancel_bios(struct request_queue *q) { }
 #else /* CONFIG_BLK_DEV_THROTTLING */
 int blk_throtl_init(struct request_queue *q);
 void blk_throtl_exit(struct request_queue *q);
 void blk_throtl_register_queue(struct request_queue *q);
 bool __blk_throtl_bio(struct bio *bio);
+void blk_throtl_cancel_bios(struct request_queue *q);
 static inline bool blk_throtl_bio(struct bio *bio)
 {
 	struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);

diff --git a/block/genhd.c b/block/genhd.c
index 56f66c6fee94..8244f09e058d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include "blk-throttle.h"
 
 #include "blk.h"
 #include "blk-mq-sched.h"
@@ -627,6 +628,8 @@ void del_gendisk(struct gendisk *disk)
 
 	blk_mq_freeze_queue_wait(q);
 
+	blk_throtl_cancel_bios(disk->queue);
+
 	blk_sync_queue(q);
 	blk_flush_integrity();
 	/*