From aaa53168cbcc486ca1927faac00bd99e81d4ff04 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 28 May 2024 13:32:34 +0200 Subject: [PATCH 01/33] dm: optimize flushes Device mapper sends flush bios to all the targets and the targets send it to the underlying device. That may be inefficient, for example if a table contains 10 linear targets pointing to the same physical device, then device mapper would send 10 flush bios to that device - despite the fact that only one bio would be sufficient. This commit optimizes the flush behavior. It introduces a per-target variable flush_bypasses_map - it is set when the target supports flush optimization - currently, the dm-linear and dm-stripe targets support it. When all the targets in a table have flush_bypasses_map, flush_bypasses_map on the table is set. __send_empty_flush tests if the table has flush_bypasses_map - and if it has, no flush bios are sent to the targets via the "map" method and the list dm_table->devices is iterated and the flush bios are sent to each member of the list. Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Suggested-by: Yang Yang --- drivers/md/dm-core.h | 2 ++ drivers/md/dm-linear.c | 1 + drivers/md/dm-stripe.c | 1 + drivers/md/dm-table.c | 4 +++ drivers/md/dm.c | 50 ++++++++++++++++++++++++++--------- include/linux/device-mapper.h | 15 +++++++++++ 6 files changed, 61 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 14a44c0f8286..3637761f3585 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -206,6 +206,8 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; + /* set if all the targets in the table have "flush_bypasses_map" set */ + bool flush_bypasses_map:1; /* * Indicates the rw permissions for the new logical device. 
This diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 2d3e186ca87e..49fb0f684193 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_discard_bios = 1; ti->num_secure_erase_bios = 1; ti->num_write_zeroes_bios = 1; + ti->flush_bypasses_map = true; ti->private = lc; return 0; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 16b93ae51d96..ec908cf8cfdc 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -157,6 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_discard_bios = stripes; ti->num_secure_erase_bios = stripes; ti->num_write_zeroes_bios = stripes; + ti->flush_bypasses_map = true; sc->chunk_size = chunk_size; if (chunk_size & (chunk_size - 1)) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 92d18dcdb3e9..33b7a1844ed4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -160,6 +160,7 @@ int dm_table_create(struct dm_table **result, blk_mode_t mode, t->type = DM_TYPE_NONE; t->mode = mode; t->md = md; + t->flush_bypasses_map = true; *result = t; return 0; } @@ -748,6 +749,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) static_branch_enable(&swap_bios_enabled); + if (!ti->flush_bypasses_map) + t->flush_bypasses_map = false; + return 0; bad: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7d107ae06e1a..3763b2ce557b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -645,7 +645,7 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* Set default bdev, but target must bio_set_dev() before issuing IO */ clone->bi_bdev = md->disk->part0; - if (unlikely(ti->needs_bio_set_dev)) + if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev)) bio_set_dev(clone, md->disk->part0); if (len) { @@ -1107,7 +1107,7 @@ static void clone_endio(struct bio *bio) blk_status_t error = bio->bi_status; struct dm_target_io *tio = clone_to_tio(bio); struct dm_target *ti = tio->ti; - dm_endio_fn endio = ti->type->end_io; + dm_endio_fn endio = likely(ti != NULL) ? 
ti->type->end_io : NULL; struct dm_io *io = tio->io; struct mapped_device *md = io->md; @@ -1154,7 +1154,7 @@ static void clone_endio(struct bio *bio) } if (static_branch_unlikely(&swap_bios_enabled) && - unlikely(swap_bios_limit(ti, bio))) + likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); free_tio(bio); @@ -1553,17 +1553,43 @@ static void __send_empty_flush(struct clone_info *ci) ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - for (unsigned int i = 0; i < t->num_targets; i++) { - unsigned int bios; - struct dm_target *ti = dm_table_get_target(t, i); + if (!t->flush_bypasses_map) { + for (unsigned int i = 0; i < t->num_targets; i++) { + unsigned int bios; + struct dm_target *ti = dm_table_get_target(t, i); - if (unlikely(ti->num_flush_bios == 0)) - continue; + if (unlikely(ti->num_flush_bios == 0)) + continue; - atomic_add(ti->num_flush_bios, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, - NULL, GFP_NOWAIT); - atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + atomic_add(ti->num_flush_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, + NULL, GFP_NOWAIT); + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + } + } else { + /* + * Note that there's no need to grab t->devices_lock here + * because the targets that support flush optimization don't + * modify the list of devices. + */ + struct list_head *devices = dm_table_get_devices(t); + unsigned int len = 0; + struct dm_dev_internal *dd; + list_for_each_entry(dd, devices, list) { + struct bio *clone; + /* + * Note that the structure dm_target_io is not + * associated with any target (because the device may be + * used by multiple targets), so we set tio->ti = NULL. + * We must check for NULL in the I/O processing path, to + * avoid NULL pointer dereference. + */ + clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO); + atomic_add(1, &ci->io->io_count); + bio_set_dev(clone, dd->dm_dev->bdev); + clone->bi_end_io = clone_endio; + dm_submit_bio_remap(clone, NULL); + } } /* diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 82b2195efaca..3611b230d0aa 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -397,6 +397,21 @@ struct dm_target { * bio_set_dev(). NOTE: ideally a target should _not_ need this. */ bool needs_bio_set_dev:1; + + /* + * Set if the target supports flush optimization. If all the targets in + * a table have flush_bypasses_map set, the dm core will not send + * flushes to the targets via a ->map method. It will iterate over + * dm_table->devices and send flushes to the devices directly. This + * optimization reduces the number of flushes being sent when multiple + * targets in a table use the same underlying device. + * + * This optimization may be enabled on targets that just pass the + * flushes to the underlying devices without performing any other + * actions on the flush request. Currently, dm-linear and dm-stripe + * support it. + */ + bool flush_bypasses_map:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); From 06a0b333e58407970e9b109d054610d2f107ca87 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 2 Jul 2024 11:56:45 +0200 Subject: [PATCH 02/33] dm io: bump num_bvecs to handle offset memory If dp->get_page() returns a non-zero offset, the bio might need an additional bvec to deal with the offset. For example, if remaining is exactly one page size, but there is an offset, the memory will span two pages. 
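To make the arithmetic concrete, here is a minimal userspace sketch of the estimate (not kernel code; dm_sector_div_up() is re-expressed as plain round-up division, and PAGE_SIZE/SECTOR_SHIFT use the common 4096/9 values):

	#include <stdio.h>

	#define PAGE_SIZE	4096UL
	#define SECTOR_SHIFT	9

	int main(void)
	{
		/* remaining covers exactly one page worth of sectors */
		unsigned long remaining = PAGE_SIZE >> SECTOR_SHIFT;	/* 8 sectors */
		unsigned long per_page = PAGE_SIZE >> SECTOR_SHIFT;
		/* old estimate: one bvec per page worth of data */
		unsigned long old_bvecs = (remaining + per_page - 1) / per_page;
		/* with a non-zero page offset the data straddles two pages,
		 * so this patch reserves one extra bvec */
		unsigned long new_bvecs = old_bvecs + 1;

		printf("old=%lu new=%lu\n", old_bvecs, new_bvecs);	/* old=1 new=2 */
		return 0;
	}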
Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 7409490259d1..3333943fe288 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -347,7 +347,7 @@ static void do_region(const blk_opf_t opf, unsigned int region, break; default: num_bvecs = bio_max_segs(dm_sector_div_up(remaining, - (PAGE_SIZE >> SECTOR_SHIFT))); + (PAGE_SIZE >> SECTOR_SHIFT)) + 1); } bio = bio_alloc_bioset(where->bdev, num_bvecs, opf, GFP_NOIO, From b0042ba7684c90d9c8e7deb500c73d6ae3872cfe Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 2 Jul 2024 11:58:56 +0200 Subject: [PATCH 03/33] dm io: don't call the async_io notify.fn on invalid num_regions If dm_io() returned an error, callers that set a notify.fn and wanted it called on an error need to check the return value and call notify.fn themselves if it was -EINVAL but not if it was -EIO. None of them do this (granted, all the existing async_io users of dm_io call it in a way that is guaranteed to not return an error). Simplify the interface by never calling the notify.fn if dm_io returns an error. This works with the existing dm_io callers which check for an error and handle it using the same methods as the notify.fn. This also allows us to move the now equivalent num_regions checks out of sync_io() and async_io() and into dm_io() itself. Additionally, change async_io() into a void function, since it can no longer fail. Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-io.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3333943fe288..329a85a12061 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -431,11 +431,6 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct io *io; struct sync_io sio; - if (num_regions > 1 && !op_is_write(opf)) { - WARN_ON(1); - return -EIO; - } - init_completion(&sio.wait); io = mempool_alloc(&client->pool, GFP_NOIO); @@ -458,19 +453,13 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return sio.error_bits ? 
-EIO : 0; } -static int async_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, blk_opf_t opf, - struct dpages *dp, io_notify_fn fn, void *context, - unsigned short ioprio) +static void async_io(struct dm_io_client *client, unsigned int num_regions, + struct dm_io_region *where, blk_opf_t opf, + struct dpages *dp, io_notify_fn fn, void *context, + unsigned short ioprio) { struct io *io; - if (num_regions > 1 && !op_is_write(opf)) { - WARN_ON(1); - fn(1, context); - return -EIO; - } - io = mempool_alloc(&client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ @@ -482,7 +471,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io->vma_invalidate_size = dp->vma_invalidate_size; dispatch_io(opf, num_regions, where, dp, io, 0, ioprio); - return 0; } static int dp_init(struct dm_io_request *io_req, struct dpages *dp, @@ -529,6 +517,11 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions, int r; struct dpages dp; + if (num_regions > 1 && !op_is_write(io_req->bi_opf)) { + WARN_ON(1); + return -EIO; + } + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); if (r) return r; @@ -537,9 +530,9 @@ int dm_io(struct dm_io_request *io_req, unsigned int num_regions, return sync_io(io_req->client, num_regions, where, io_req->bi_opf, &dp, sync_error_bits, ioprio); - return async_io(io_req->client, num_regions, where, - io_req->bi_opf, &dp, io_req->notify.fn, - io_req->notify.context, ioprio); + async_io(io_req->client, num_regions, where, io_req->bi_opf, &dp, + io_req->notify.fn, io_req->notify.context, ioprio); + return 0; } EXPORT_SYMBOL(dm_io); From babe69e86d0fcb11a4a8f629edf2951d94ef67ae Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 2 Jul 2024 12:00:43 +0200 Subject: [PATCH 04/33] dm io: remove code duplication between sync_io and async_io The only difference between the code to set up and dispatch the io in sync_io() and async_io() is the sync argument to dispatch_io(), which is used to update the opf argument. Update the opf argument directly in sync_io(), and remove the sync argument from dispatch_io(). Then, make sync_io() call async_io() instead of duplicating all of its code. Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-io.c | 81 +++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 329a85a12061..d7a8e2f40db3 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -384,16 +384,13 @@ static void do_region(const blk_opf_t opf, unsigned int region, static void dispatch_io(blk_opf_t opf, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, - struct io *io, int sync, unsigned short ioprio) + struct io *io, unsigned short ioprio) { int i; struct dpages old_pages = *dp; BUG_ON(num_regions > DM_IO_MAX_REGIONS); - if (sync) - opf |= REQ_SYNC; - /* * For multiple regions we need to be careful to rewind * the dp object for each call to do_region.
@@ -411,48 +408,6 @@ static void dispatch_io(blk_opf_t opf, unsigned int num_regions, dec_count(io, 0, 0); } -struct sync_io { - unsigned long error_bits; - struct completion wait; -}; - -static void sync_io_complete(unsigned long error, void *context) -{ - struct sync_io *sio = context; - - sio->error_bits = error; - complete(&sio->wait); -} - -static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, - unsigned long *error_bits, unsigned short ioprio) -{ - struct io *io; - struct sync_io sio; - - init_completion(&sio.wait); - - io = mempool_alloc(&client->pool, GFP_NOIO); - io->error_bits = 0; - atomic_set(&io->count, 1); /* see dispatch_io() */ - io->client = client; - io->callback = sync_io_complete; - io->context = &sio; - - io->vma_invalidate_address = dp->vma_invalidate_address; - io->vma_invalidate_size = dp->vma_invalidate_size; - - dispatch_io(opf, num_regions, where, dp, io, 1, ioprio); - - wait_for_completion_io(&sio.wait); - - if (error_bits) - *error_bits = sio.error_bits; - - return sio.error_bits ? -EIO : 0; -} - static void async_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, io_notify_fn fn, void *context, @@ -470,7 +425,39 @@ static void async_io(struct dm_io_client *client, unsigned int num_regions, io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; - dispatch_io(opf, num_regions, where, dp, io, 0, ioprio); + dispatch_io(opf, num_regions, where, dp, io, ioprio); +} + +struct sync_io { + unsigned long error_bits; + struct completion wait; +}; + +static void sync_io_complete(unsigned long error, void *context) +{ + struct sync_io *sio = context; + + sio->error_bits = error; + complete(&sio->wait); +} + +static int sync_io(struct dm_io_client *client, unsigned int num_regions, + struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, + unsigned long *error_bits, unsigned short ioprio) +{ + struct sync_io sio; + + init_completion(&sio.wait); + + async_io(client, num_regions, where, opf | REQ_SYNC, dp, + sync_io_complete, &sio, ioprio); + + wait_for_completion_io(&sio.wait); + + if (error_bits) + *error_bits = sio.error_bits; + + return sio.error_bits ? -EIO : 0; } static int dp_init(struct dm_io_request *io_req, struct dpages *dp, From c1a66a37d606d27bad037b34083464a7f515d619 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 2 Jul 2024 12:10:32 +0200 Subject: [PATCH 05/33] dm cache metadata: remove unused struct 'thunk' 'thunk' has been unused since commit f177940a8091 ("dm cache metadata: switch to using the new cursor api for loading metadata"). Remove it. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-metadata.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 96751cd3d181..0ad9dc1824fa 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1282,15 +1282,6 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, return r; } -struct thunk { - load_mapping_fn fn; - void *context; - - struct dm_cache_metadata *cmd; - bool respect_dirty_flags; - bool hints_valid; -}; - static bool policy_unchanged(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) { From 140ce37fd78a629105377e17842465258a5459ef Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 2 Jul 2024 12:13:24 +0200 Subject: [PATCH 06/33] dm init: Handle minors larger than 255 dm_parse_device_entry() simply copies the minor number into dmi.dev, but the dev_t format splits the minor number between the lowest 8 bits and highest 12 bits. If the minor number is larger than 255, part of it will end up getting treated as the major number. Fix this by checking that the minor number is valid and then encoding it as a dev_t. Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 2a71bcdba92d..b37bbe762500 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -212,8 +212,10 @@ static char __init *dm_parse_device_entry(struct dm_device *dev, char *str) strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid)); /* minor */ if (strlen(field[2])) { - if (kstrtoull(field[2], 0, &dev->dmi.dev)) + if (kstrtoull(field[2], 0, &dev->dmi.dev) || + dev->dmi.dev >= (1 << MINORBITS)) return ERR_PTR(-EINVAL); + dev->dmi.dev = huge_encode_dev((dev_t)dev->dmi.dev); dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG; } /* flags */ From 44d36a2ea426be14d2e5db72e2e7676346c8b802 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Jul 2024 16:37:45 +0200 Subject: [PATCH 07/33] dm-verity: move hash algorithm setup into its own function Move the code that sets up the hash transformation into its own function. No change in behavior. Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 70 +++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bb5da66da4c1..88d2a49dca43 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1226,6 +1226,43 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return r; } +static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) +{ + struct dm_target *ti = v->ti; + struct crypto_ahash *ahash; + + v->alg_name = kstrdup(alg_name, GFP_KERNEL); + if (!v->alg_name) { + ti->error = "Cannot allocate algorithm name"; + return -ENOMEM; + } + + ahash = crypto_alloc_ahash(alg_name, 0, + v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0); + if (IS_ERR(ahash)) { + ti->error = "Cannot initialize hash function"; + return PTR_ERR(ahash); + } + v->tfm = ahash; + + /* + * dm-verity performance can vary greatly depending on which hash + * algorithm implementation is used. Help people debug performance + * problems by logging the ->cra_driver_name.
+ */ + DMINFO("%s using implementation \"%s\"", alg_name, + crypto_hash_alg_common(ahash)->base.cra_driver_name); + + v->digest_size = crypto_ahash_digestsize(ahash); + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + return -EINVAL; + } + v->ahash_reqsize = sizeof(struct ahash_request) + + crypto_ahash_reqsize(ahash); + return 0; +} + /* * Target parameters: * The current format is version 1. @@ -1350,38 +1387,9 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } v->hash_start = num_ll; - v->alg_name = kstrdup(argv[7], GFP_KERNEL); - if (!v->alg_name) { - ti->error = "Cannot allocate algorithm name"; - r = -ENOMEM; + r = verity_setup_hash_alg(v, argv[7]); + if (r) goto bad; - } - - v->tfm = crypto_alloc_ahash(v->alg_name, 0, - v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0); - if (IS_ERR(v->tfm)) { - ti->error = "Cannot initialize hash function"; - r = PTR_ERR(v->tfm); - v->tfm = NULL; - goto bad; - } - - /* - * dm-verity performance can vary greatly depending on which hash - * algorithm implementation is used. Help people debug performance - * problems by logging the ->cra_driver_name. - */ - DMINFO("%s using implementation \"%s\"", v->alg_name, - crypto_hash_alg_common(v->tfm)->base.cra_driver_name); - - v->digest_size = crypto_ahash_digestsize(v->tfm); - if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { - ti->error = "Digest size too big"; - r = -EINVAL; - goto bad; - } - v->ahash_reqsize = sizeof(struct ahash_request) + - crypto_ahash_reqsize(v->tfm); v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->root_digest) { From e41e52e59e51992368092a0555fd7b889662653f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Jul 2024 16:38:21 +0200 Subject: [PATCH 08/33] dm-verity: move data hash mismatch handling into its own function Move the code that handles mismatches of data block hashes into its own function so that it doesn't clutter up verity_verify_io(). Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 64 ++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 88d2a49dca43..796d85526696 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -542,6 +542,38 @@ free_ret: return r; } +static int verity_handle_data_hash_mismatch(struct dm_verity *v, + struct dm_verity_io *io, + struct bio *bio, sector_t blkno, + struct bvec_iter *start) +{ + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { + /* + * Error handling code (FEC included) cannot be run in the + * BH workqueue, so fallback to a standard workqueue. 
+ */ + return -EAGAIN; + } + if (verity_recheck(v, io, *start, blkno) == 0) { + if (v->validated_blocks) + set_bit(blkno, v->validated_blocks); + return 0; + } +#if defined(CONFIG_DM_VERITY_FEC) + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno, + NULL, start) == 0) + return 0; +#endif + if (bio->bi_status) + return -EIO; /* Error correction failed; Just return error */ + + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, blkno)) { + dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", bio, blkno, 0); + return -EIO; + } + return 0; +} + static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len) { @@ -634,35 +666,11 @@ static int verity_verify_io(struct dm_verity_io *io) if (v->validated_blocks) set_bit(cur_block, v->validated_blocks); continue; - } else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { - /* - * Error handling code (FEC included) cannot be run in a - * tasklet since it may sleep, so fallback to work-queue. - */ - return -EAGAIN; - } else if (verity_recheck(v, io, start, cur_block) == 0) { - if (v->validated_blocks) - set_bit(cur_block, v->validated_blocks); - continue; -#if defined(CONFIG_DM_VERITY_FEC) - } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, - cur_block, NULL, &start) == 0) { - continue; -#endif - } else { - if (bio->bi_status) { - /* - * Error correction failed; Just return error - */ - return -EIO; - } - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - cur_block)) { - dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", - bio, cur_block, 0); - return -EIO; - } } + r = verity_handle_data_hash_mismatch(v, io, bio, cur_block, + &start); + if (unlikely(r)) + return r; } return 0; From a7ddb3d49d16e1774b4d5b814d3469ee9c3830c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Jul 2024 16:39:13 +0200 Subject: [PATCH 09/33] dm-verity: make real_digest and want_digest fixed-length Change the digest fields in struct dm_verity_io from variable-length to fixed-length, since their maximum length is fixed at HASH_MAX_DIGESTSIZE, i.e. 64 bytes, which is not too big. This is simpler and makes the fields a bit faster to access. (HASH_MAX_DIGESTSIZE did not exist when this code was written, which may explain why it wasn't used.) This makes the verity_io_real_digest() and verity_io_want_digest() functions trivial, but this patch leaves them in place temporarily since most of their callers will go away in a later patch anyway. 
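For readers unfamiliar with the trailing-data idiom being removed, here is a condensed before/after of the accessor pattern (a standalone sketch with stub types; the real definitions are in the hunks below):

	typedef unsigned char u8;

	#define HASH_MAX_DIGESTSIZE 64

	struct dm_verity { unsigned int ahash_reqsize; };

	struct dm_verity_io_old { int stub; /* digests follow the struct in memory */ };
	struct dm_verity_io_new { u8 real_digest[HASH_MAX_DIGESTSIZE]; };

	/* Before: pointer arithmetic past the end of the struct. */
	static u8 *real_digest_old(struct dm_verity *v, struct dm_verity_io_old *io)
	{
		return (u8 *)(io + 1) + v->ahash_reqsize;
	}

	/* After: a plain member access into a fixed-length field. */
	static u8 *real_digest_new(struct dm_verity_io_new *io)
	{
		return io->real_digest;
	}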
Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 3 +-- drivers/md/dm-verity.h | 17 +++++++---------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 796d85526696..4ef814a7faf4 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1529,8 +1529,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + - v->ahash_reqsize + v->digest_size * 2; + ti->per_io_data_size = sizeof(struct dm_verity_io) + v->ahash_reqsize; r = verity_fec_ctr(v); if (r) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 20b1bcf03474..5d3da9f5fc95 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -91,15 +91,12 @@ struct dm_verity_io { char *recheck_buffer; + u8 real_digest[HASH_MAX_DIGESTSIZE]; + u8 want_digest[HASH_MAX_DIGESTSIZE]; + /* - * Three variably-size fields follow this struct: - * - * u8 hash_req[v->ahash_reqsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: verity_io_hash_req(), verity_io_real_digest() - * and verity_io_want_digest(). + * This struct is followed by a variable-sized struct ahash_request of + * size v->ahash_reqsize. To access it, use verity_io_hash_req(). */ }; @@ -112,13 +109,13 @@ static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, static inline u8 *verity_io_real_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->ahash_reqsize; + return io->real_digest; } static inline u8 *verity_io_want_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size; + return io->want_digest; } extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, From 09d1430896e38933470045408d7a350dac7d841b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Jul 2024 16:39:51 +0200 Subject: [PATCH 10/33] dm-verity: provide dma_alignment limit in io_hints Since Linux v6.1, some filesystems support submitting direct I/O that is aligned to only dma_alignment instead of the logical_block_size alignment that was required before. I/O that is not aligned to the logical_block_size is difficult to handle in device-mapper targets that do cryptographic processing of data, as it makes the units of data that are hashed or encrypted possibly be split across pages, creating rarely used and rarely tested edge cases. As such, dm-crypt and dm-integrity have already opted out of this by setting dma_alignment to 'logical_block_size - 1'. Although dm-verity does have code that handles these cases (or at least is intended to do so), supporting direct I/O with such a low amount of alignment is not really useful on dm-verity devices. So, opt dm-verity out of it too so that it's not necessary to handle these edge cases. 
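The effect is observable from userspace through the direct I/O alignment fields of statx() (STATX_DIOALIGN, available since Linux 6.1; for block devices stx_dio_mem_align is derived from the queue's dma_alignment). A sketch, where "/dev/mapper/verity" is a placeholder device path:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct statx stx;

		/* Placeholder path; substitute an actual dm-verity device. */
		if (statx(AT_FDCWD, "/dev/mapper/verity", 0, STATX_DIOALIGN, &stx)) {
			perror("statx");
			return 1;
		}
		/* With this patch, the reported memory alignment for dm-verity
		 * rises from a small dma_alignment value to the logical block
		 * size. */
		printf("dio mem align: %u, dio offset align: %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
		return 0;
	}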
Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 4ef814a7faf4..182ce59763da 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1023,6 +1023,14 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->physical_block_size = 1 << v->data_dev_block_bits; blk_limits_io_min(limits, limits->logical_block_size); + + /* + * Similar to what dm-crypt does, opt dm-verity out of support for + * direct I/O that is aligned to less than the traditional direct I/O + * alignment requirement of logical_block_size. This prevents dm-verity + * data blocks from crossing pages, eliminating various edge cases. + */ + limits->dma_alignment = limits->logical_block_size - 1; } static void verity_dtr(struct dm_target *ti) From cf715f4b7eb521a5bf67d391387b754c2fcde8d2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Jul 2024 16:40:20 +0200 Subject: [PATCH 11/33] dm-verity: always "map" the data blocks dm-verity needs to access data blocks by virtual address in three different cases (zeroization, recheck, and forward error correction), and one more case (shash support) is coming. Since it's guaranteed that dm-verity data blocks never cross pages, and kmap_local_page and kunmap_local are no-ops on modern platforms anyway, just unconditionally "map" every data block's page and work with the virtual buffer directly. This simplifies the code and eliminates unnecessary overhead. Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 26 +---- drivers/md/dm-verity-fec.h | 6 +- drivers/md/dm-verity-target.c | 182 +++++++--------------------------- drivers/md/dm-verity.h | 8 -- 4 files changed, 42 insertions(+), 180 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index e46aee6f932e..b838d21183b5 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -404,24 +404,9 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, return 0; } -static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, - size_t len) -{ - struct dm_verity_fec_io *fio = fec_io(io); - - memcpy(data, &fio->output[fio->output_pos], len); - fio->output_pos += len; - - return 0; -} - -/* - * Correct errors in a block. Copies corrected block to dest if non-NULL, - * otherwise to a bio_vec starting from iter. - */ +/* Correct errors in a block. Copies corrected block to dest. 
*/ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, - enum verity_block_type type, sector_t block, u8 *dest, - struct bvec_iter *iter) + enum verity_block_type type, sector_t block, u8 *dest) { int r; struct dm_verity_fec_io *fio = fec_io(io); @@ -471,12 +456,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, goto done; } - if (dest) - memcpy(dest, fio->output, 1 << v->data_dev_block_bits); - else if (iter) { - fio->output_pos = 0; - r = verity_for_bv_block(v, io, iter, fec_bv_copy); - } + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); done: fio->level--; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 8454070d2824..09123a612953 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -57,7 +57,6 @@ struct dm_verity_fec_io { u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ unsigned int nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ - size_t output_pos; unsigned int level; /* recursion level */ }; @@ -70,7 +69,7 @@ extern bool verity_fec_is_enabled(struct dm_verity *v); extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, enum verity_block_type type, sector_t block, - u8 *dest, struct bvec_iter *iter); + u8 *dest); extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, char *result, unsigned int maxlen); @@ -100,8 +99,7 @@ static inline bool verity_fec_is_enabled(struct dm_verity *v) static inline int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, enum verity_block_type type, - sector_t block, u8 *dest, - struct bvec_iter *iter) + sector_t block, u8 *dest) { return -EOPNOTSUPP; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 182ce59763da..858140a83ae5 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -342,7 +342,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, r = -EAGAIN; goto release_ret_r; } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block, data, NULL) == 0) + hash_block, data) == 0) aux->hash_verified = 1; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, @@ -404,98 +404,8 @@ out: return r; } -/* - * Calculates the digest for the given bio - */ -static int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, struct crypto_wait *wait) -{ - unsigned int todo = 1 << v->data_dev_block_bits; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - struct scatterlist sg; - struct ahash_request *req = verity_io_hash_req(v, io); - - do { - int r; - unsigned int len; - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - sg_init_table(&sg, 1); - - len = bv.bv_len; - - if (likely(len >= todo)) - len = todo; - /* - * Operating on a single page at a time looks suboptimal - * until you consider the typical block size is 4,096B. - * Going through this loops twice should be very rare. - */ - sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); - ahash_request_set_crypt(req, &sg, NULL, len); - r = crypto_wait_req(crypto_ahash_update(req), wait); - - if (unlikely(r < 0)) { - DMERR("%s crypto op failed: %d", __func__, r); - return r; - } - - bio_advance_iter(bio, iter, len); - todo -= len; - } while (todo); - - return 0; -} - -/* - * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec - * starting from iter. 
- */ -int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, - int (*process)(struct dm_verity *v, - struct dm_verity_io *io, u8 *data, - size_t len)) -{ - unsigned int todo = 1 << v->data_dev_block_bits; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - - do { - int r; - u8 *page; - unsigned int len; - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - page = bvec_kmap_local(&bv); - len = bv.bv_len; - - if (likely(len >= todo)) - len = todo; - - r = process(v, io, page, len); - kunmap_local(page); - - if (r < 0) - return r; - - bio_advance_iter(bio, iter, len); - todo -= len; - } while (todo); - - return 0; -} - -static int verity_recheck_copy(struct dm_verity *v, struct dm_verity_io *io, - u8 *data, size_t len) -{ - memcpy(data, io->recheck_buffer, len); - io->recheck_buffer += len; - - return 0; -} - static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter start, sector_t cur_block) + sector_t cur_block, u8 *dest) { struct page *page; void *buffer; @@ -530,11 +440,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, goto free_ret; } - io->recheck_buffer = buffer; - r = verity_for_bv_block(v, io, &start, verity_recheck_copy); - if (unlikely(r)) - goto free_ret; - + memcpy(dest, buffer, 1 << v->data_dev_block_bits); r = 0; free_ret: mempool_free(page, &v->recheck_pool); @@ -545,7 +451,7 @@ free_ret: static int verity_handle_data_hash_mismatch(struct dm_verity *v, struct dm_verity_io *io, struct bio *bio, sector_t blkno, - struct bvec_iter *start) + u8 *data) { if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* @@ -554,14 +460,14 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, */ return -EAGAIN; } - if (verity_recheck(v, io, *start, blkno) == 0) { + if (verity_recheck(v, io, blkno, data) == 0) { if (v->validated_blocks) set_bit(blkno, v->validated_blocks); return 0; } #if defined(CONFIG_DM_VERITY_FEC) if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno, - NULL, start) == 0) + data) == 0) return 0; #endif if (bio->bi_status) @@ -574,36 +480,15 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, return 0; } -static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, - u8 *data, size_t len) -{ - memset(data, 0, len); - return 0; -} - -/* - * Moves the bio iter one data block forward. - */ -static inline void verity_bv_skip_block(struct dm_verity *v, - struct dm_verity_io *io, - struct bvec_iter *iter) -{ - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - - bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits); -} - /* * Verify one "dm_verity_io" structure. 
*/ static int verity_verify_io(struct dm_verity_io *io) { - bool is_zero; struct dm_verity *v = io->v; - struct bvec_iter start; + const unsigned int block_size = 1 << v->data_dev_block_bits; struct bvec_iter iter_copy; struct bvec_iter *iter; - struct crypto_wait wait; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); unsigned int b; @@ -617,16 +502,17 @@ static int verity_verify_io(struct dm_verity_io *io) } else iter = &io->iter; - for (b = 0; b < io->n_blocks; b++) { + for (b = 0; b < io->n_blocks; + b++, bio_advance_iter(bio, iter, block_size)) { int r; sector_t cur_block = io->block + b; - struct ahash_request *req = verity_io_hash_req(v, io); + bool is_zero; + struct bio_vec bv; + void *data; if (v->validated_blocks && bio->bi_status == BLK_STS_OK && - likely(test_bit(cur_block, v->validated_blocks))) { - verity_bv_skip_block(v, io, iter); + likely(test_bit(cur_block, v->validated_blocks))) continue; - } r = verity_hash_for_block(v, io, cur_block, verity_io_want_digest(v, io), @@ -634,41 +520,47 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; + bv = bio_iter_iovec(bio, *iter); + if (unlikely(bv.bv_len < block_size)) { + /* + * Data block spans pages. This should not happen, + * since dm-verity sets dma_alignment to the data block + * size minus 1, and dm-verity also doesn't allow the + * data block size to be greater than PAGE_SIZE. + */ + DMERR_LIMIT("unaligned io (data block spans pages)"); + return -EIO; + } + + data = bvec_kmap_local(&bv); + if (is_zero) { /* * If we expect a zero block, don't validate, just * return zeros. */ - r = verity_for_bv_block(v, io, iter, - verity_bv_zero); - if (unlikely(r < 0)) - return r; - + memset(data, 0, block_size); + kunmap_local(data); continue; } - r = verity_hash_init(v, req, &wait, !io->in_bh); - if (unlikely(r < 0)) - return r; - - start = *iter; - r = verity_for_io_block(v, io, iter, &wait); - if (unlikely(r < 0)) - return r; - - r = verity_hash_final(v, req, verity_io_real_digest(v, io), - &wait); - if (unlikely(r < 0)) + r = verity_hash(v, verity_io_hash_req(v, io), data, block_size, + verity_io_real_digest(v, io), !io->in_bh); + if (unlikely(r < 0)) { + kunmap_local(data); return r; + } if (likely(memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), v->digest_size) == 0)) { if (v->validated_blocks) set_bit(cur_block, v->validated_blocks); + kunmap_local(data); continue; } r = verity_handle_data_hash_mismatch(v, io, bio, cur_block, - &start); + data); + kunmap_local(data); if (unlikely(r)) return r; } diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 5d3da9f5fc95..bd461c28b710 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -89,8 +89,6 @@ struct dm_verity_io { struct work_struct work; struct work_struct bh_work; - char *recheck_buffer; - u8 real_digest[HASH_MAX_DIGESTSIZE]; u8 want_digest[HASH_MAX_DIGESTSIZE]; @@ -118,12 +116,6 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return io->want_digest; } -extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, - int (*process)(struct dm_verity *v, - struct dm_verity_io *io, - u8 *data, size_t len)); - extern int verity_hash(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, u8 *digest, bool may_sleep); From e8f5e933013af36f4042fea955a00df40105643a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Jul 2024 16:40:41 +0200 Subject: [PATCH 12/33] dm-verity: make verity_hash() take 
dm_verity_io instead of ahash_request In preparation for adding shash support to dm-verity, change verity_hash() to take a pointer to a struct dm_verity_io instead of a pointer to the ahash_request embedded inside it. Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 6 ++---- drivers/md/dm-verity-target.c | 21 ++++++++++----------- drivers/md/dm-verity.h | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index b838d21183b5..62b1a44b8dd2 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -186,8 +186,7 @@ error: static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, u8 *want_digest, u8 *data) { - if (unlikely(verity_hash(v, verity_io_hash_req(v, io), - data, 1 << v->data_dev_block_bits, + if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io), true))) return 0; @@ -388,8 +387,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, } /* Always re-validate the corrected block against the expected hash */ - r = verity_hash(v, verity_io_hash_req(v, io), fio->output, - 1 << v->data_dev_block_bits, + r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io), true); if (unlikely(r < 0)) return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 858140a83ae5..1720c8e8d750 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -180,9 +180,10 @@ out: return r; } -int verity_hash(struct dm_verity *v, struct ahash_request *req, +int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest, bool may_sleep) { + struct ahash_request *req = verity_io_hash_req(v, io); int r; struct crypto_wait wait; @@ -325,8 +326,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, goto release_ret_r; } - r = verity_hash(v, verity_io_hash_req(v, io), - data, 1 << v->hash_dev_block_bits, + r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, verity_io_real_digest(v, io), !io->in_bh); if (unlikely(r < 0)) goto release_ret_r; @@ -428,8 +428,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, if (unlikely(r)) goto free_ret; - r = verity_hash(v, verity_io_hash_req(v, io), buffer, - 1 << v->data_dev_block_bits, + r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io), true); if (unlikely(r)) goto free_ret; @@ -544,7 +543,7 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash(v, verity_io_hash_req(v, io), data, block_size, + r = verity_hash(v, io, data, block_size, verity_io_real_digest(v, io), !io->in_bh); if (unlikely(r < 0)) { kunmap_local(data); @@ -991,7 +990,7 @@ static int verity_alloc_most_once(struct dm_verity *v) static int verity_alloc_zero_digest(struct dm_verity *v) { int r = -ENOMEM; - struct ahash_request *req; + struct dm_verity_io *io; u8 *zero_data; v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); @@ -999,9 +998,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - req = kmalloc(v->ahash_reqsize, GFP_KERNEL); + io = kmalloc(sizeof(*io) + v->ahash_reqsize, GFP_KERNEL); - if (!req) + if (!io) return r; /* verity_dtr will free zero_digest */ zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); @@ -1009,11 +1008,11 @@ static int 
verity_alloc_zero_digest(struct dm_verity *v) if (!zero_data) goto out; - r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, + r = verity_hash(v, io, zero_data, 1 << v->data_dev_block_bits, v->zero_digest, true); out: - kfree(req); + kfree(io); kfree(zero_data); return r; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index bd461c28b710..0e1dd02a916f 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -116,7 +116,7 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return io->want_digest; } -extern int verity_hash(struct dm_verity *v, struct ahash_request *req, +extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest, bool may_sleep); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, From b76ad8844234bd0d394105d7d784cd05f1bf269a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Jul 2024 16:41:08 +0200 Subject: [PATCH 13/33] dm-verity: hash blocks with shash import+finup when possible Currently dm-verity computes the hash of each block by using multiple calls to the "ahash" crypto API. While the exact sequence depends on the chosen dm-verity settings, in the vast majority of cases it is: 1. crypto_ahash_init() 2. crypto_ahash_update() [salt] 3. crypto_ahash_update() [data] 4. crypto_ahash_final() This is inefficient for two main reasons: - It makes multiple indirect calls, which is expensive on modern CPUs especially when mitigations for CPU vulnerabilities are enabled. Since the salt is the same across all blocks on a given dm-verity device, a much more efficient sequence would be to do an import of the pre-salted state, then a finup. - It uses the ahash (asynchronous hash) API, despite the fact that CPU-based hashing is almost always used in practice, and therefore it experiences the overhead of the ahash-based wrapper for shash. Because dm-verity was intentionally converted to ahash to support off-CPU crypto accelerators, a full reversion to shash might not be acceptable. Yet, we should still provide a fast path for shash with the most common dm-verity settings. Another reason for shash over ahash is that the upcoming multibuffer hashing support, which is specific to CPU-based hashing, is much better suited for shash than for ahash. Supporting it via ahash would add significant complexity and overhead. And it's not possible for the "same" code to properly support both multibuffer hashing and HW accelerators at the same time anyway, given the different computation models. Unfortunately there will always be code specific to each model needed (for users who want to support both). Therefore, this patch adds a new shash import+finup based fast path to dm-verity. It is used automatically when appropriate. This makes dm-verity optimized for what the vast majority of users want: CPU-based hashing with the most common settings, while still retaining support for rarer settings and off-CPU crypto accelerators. In benchmarks with veritysetup's default parameters (SHA-256, 4K data and hash block sizes, 32-byte salt), which also match the parameters that Android currently uses, this patch improves block hashing performance by about 15% on x86_64 using the SHA-NI instructions, or by about 5% on arm64 using the ARMv8 SHA2 instructions. On x86_64 roughly two-thirds of the improvement comes from the use of import and finup, while the remaining third comes from the switch from ahash to shash. 
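Condensed to its essence, per-block hashing drops from four indirect crypto calls to two. A fragment showing the call pattern (identifiers are those introduced by this patch; error handling elided):

	/* One-time setup in the constructor: hash the salt once and
	 * export the partial state. */
	desc->tfm = v->shash_tfm;
	crypto_shash_init(desc);
	crypto_shash_update(desc, v->salt, v->salt_size);
	crypto_shash_export(desc, v->initial_hashstate);

	/* Per data block: resume from the pre-salted state and finish. */
	crypto_shash_import(desc, v->initial_hashstate);
	crypto_shash_finup(desc, data, block_size, digest);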
Note that another benefit of using "import" to handle the salt is that if the salt size is equal to the input size of the hash algorithm's compression function, e.g. 64 bytes for SHA-256, then the performance is exactly the same as no salt. This doesn't seem to be much better than veritysetup's current default of 32-byte salts, due to the way SHA-256's finalization padding works, but it should be marginally better. Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 169 ++++++++++++++++++++++++---------- drivers/md/dm-verity.h | 18 ++-- 2 files changed, 130 insertions(+), 57 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 1720c8e8d750..adf21b095dc3 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -48,6 +48,9 @@ module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); +/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ +static DEFINE_STATIC_KEY_FALSE(ahash_enabled); + struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -102,7 +105,7 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } -static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, +static int verity_ahash_update(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, struct crypto_wait *wait) { @@ -135,12 +138,12 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, /* * Wrapper for crypto_ahash_init, which handles verity salting. */ -static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, +static int verity_ahash_init(struct dm_verity *v, struct ahash_request *req, struct crypto_wait *wait, bool may_sleep) { int r; - ahash_request_set_tfm(req, v->tfm); + ahash_request_set_tfm(req, v->ahash_tfm); ahash_request_set_callback(req, may_sleep ? 
CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, crypto_req_done, (void *)wait); @@ -155,18 +158,18 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, } if (likely(v->salt_size && (v->version >= 1))) - r = verity_hash_update(v, req, v->salt, v->salt_size, wait); + r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); return r; } -static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, - u8 *digest, struct crypto_wait *wait) +static int verity_ahash_final(struct dm_verity *v, struct ahash_request *req, + u8 *digest, struct crypto_wait *wait) { int r; if (unlikely(v->salt_size && (!v->version))) { - r = verity_hash_update(v, req, v->salt, v->salt_size, wait); + r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); if (r < 0) { DMERR("%s failed updating salt: %d", __func__, r); @@ -183,21 +186,24 @@ out: int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest, bool may_sleep) { - struct ahash_request *req = verity_io_hash_req(v, io); int r; - struct crypto_wait wait; - r = verity_hash_init(v, req, &wait, may_sleep); - if (unlikely(r < 0)) - goto out; + if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) { + struct ahash_request *req = verity_io_hash_req(v, io); + struct crypto_wait wait; - r = verity_hash_update(v, req, data, len, &wait); - if (unlikely(r < 0)) - goto out; + r = verity_ahash_init(v, req, &wait, may_sleep) ?: + verity_ahash_update(v, req, data, len, &wait) ?: + verity_ahash_final(v, req, digest, &wait); + } else { + struct shash_desc *desc = verity_io_hash_req(v, io); - r = verity_hash_final(v, req, digest, &wait); - -out: + desc->tfm = v->shash_tfm; + r = crypto_shash_import(desc, v->initial_hashstate) ?: + crypto_shash_finup(desc, data, len, digest); + } + if (unlikely(r)) + DMERR("Error hashing block: %d", r); return r; } @@ -940,11 +946,16 @@ static void verity_dtr(struct dm_target *ti) kvfree(v->validated_blocks); kfree(v->salt); + kfree(v->initial_hashstate); kfree(v->root_digest); kfree(v->zero_digest); - if (v->tfm) - crypto_free_ahash(v->tfm); + if (v->ahash_tfm) { + static_branch_dec(&ahash_enabled); + crypto_free_ahash(v->ahash_tfm); + } else { + crypto_free_shash(v->shash_tfm); + } kfree(v->alg_name); @@ -998,7 +1009,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - io = kmalloc(sizeof(*io) + v->ahash_reqsize, GFP_KERNEL); + io = kmalloc(sizeof(*io) + v->hash_reqsize, GFP_KERNEL); if (!io) return r; /* verity_dtr will free zero_digest */ @@ -1137,6 +1148,8 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) { struct dm_target *ti = v->ti; struct crypto_ahash *ahash; + struct crypto_shash *shash = NULL; + const char *driver_name; v->alg_name = kstrdup(alg_name, GFP_KERNEL); if (!v->alg_name) { @@ -1144,29 +1157,97 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) return -ENOMEM; } + /* + * Allocate the hash transformation object that this dm-verity instance + * will use. The vast majority of dm-verity users use CPU-based + * hashing, so when possible use the shash API to minimize the crypto + * API overhead. If the ahash API resolves to a different driver + * (likely an off-CPU hardware offload), use ahash instead. Also use + * ahash if the obsolete dm-verity format with the appended salt is + * being used, so that quirk only needs to be handled in one place. + */ ahash = crypto_alloc_ahash(alg_name, 0, v->use_bh_wq ? 
CRYPTO_ALG_ASYNC : 0); if (IS_ERR(ahash)) { ti->error = "Cannot initialize hash function"; return PTR_ERR(ahash); } - v->tfm = ahash; - - /* - * dm-verity performance can vary greatly depending on which hash - * algorithm implementation is used. Help people debug performance - * problems by logging the ->cra_driver_name. - */ - DMINFO("%s using implementation \"%s\"", alg_name, - crypto_hash_alg_common(ahash)->base.cra_driver_name); - - v->digest_size = crypto_ahash_digestsize(ahash); + driver_name = crypto_ahash_driver_name(ahash); + if (v->version >= 1 /* salt prepended, not appended? */) { + shash = crypto_alloc_shash(alg_name, 0, 0); + if (!IS_ERR(shash) && + strcmp(crypto_shash_driver_name(shash), driver_name) != 0) { + /* + * ahash gave a different driver than shash, so probably + * this is a case of real hardware offload. Use ahash. + */ + crypto_free_shash(shash); + shash = NULL; + } + } + if (!IS_ERR_OR_NULL(shash)) { + crypto_free_ahash(ahash); + ahash = NULL; + v->shash_tfm = shash; + v->digest_size = crypto_shash_digestsize(shash); + v->hash_reqsize = sizeof(struct shash_desc) + + crypto_shash_descsize(shash); + DMINFO("%s using shash \"%s\"", alg_name, driver_name); + } else { + v->ahash_tfm = ahash; + static_branch_inc(&ahash_enabled); + v->digest_size = crypto_ahash_digestsize(ahash); + v->hash_reqsize = sizeof(struct ahash_request) + + crypto_ahash_reqsize(ahash); + DMINFO("%s using ahash \"%s\"", alg_name, driver_name); + } if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; return -EINVAL; } - v->ahash_reqsize = sizeof(struct ahash_request) + - crypto_ahash_reqsize(ahash); + return 0; +} + +static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) +{ + struct dm_target *ti = v->ti; + + if (strcmp(arg, "-") != 0) { + v->salt_size = strlen(arg) / 2; + v->salt = kmalloc(v->salt_size, GFP_KERNEL); + if (!v->salt) { + ti->error = "Cannot allocate salt"; + return -ENOMEM; + } + if (strlen(arg) != v->salt_size * 2 || + hex2bin(v->salt, arg, v->salt_size)) { + ti->error = "Invalid salt"; + return -EINVAL; + } + } + if (v->shash_tfm) { + SHASH_DESC_ON_STACK(desc, v->shash_tfm); + int r; + + /* + * Compute the pre-salted hash state that can be passed to + * crypto_shash_import() for each block later. 
+ */ + v->initial_hashstate = kmalloc( + crypto_shash_statesize(v->shash_tfm), GFP_KERNEL); + if (!v->initial_hashstate) { + ti->error = "Cannot allocate initial hash state"; + return -ENOMEM; + } + desc->tfm = v->shash_tfm; + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, v->salt, v->salt_size) ?: + crypto_shash_export(desc, v->initial_hashstate); + if (r) { + ti->error = "Cannot set up initial hash state"; + return r; + } + } return 0; } @@ -1312,21 +1393,9 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } root_hash_digest_to_validate = argv[8]; - if (strcmp(argv[9], "-")) { - v->salt_size = strlen(argv[9]) / 2; - v->salt = kmalloc(v->salt_size, GFP_KERNEL); - if (!v->salt) { - ti->error = "Cannot allocate salt"; - r = -ENOMEM; - goto bad; - } - if (strlen(argv[9]) != v->salt_size * 2 || - hex2bin(v->salt, argv[9], v->salt_size)) { - ti->error = "Invalid salt"; - r = -EINVAL; - goto bad; - } - } + r = verity_setup_salt_and_hashstate(v, argv[9]); + if (r) + goto bad; argv += 10; argc -= 10; @@ -1428,7 +1497,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + v->ahash_reqsize; + ti->per_io_data_size = sizeof(struct dm_verity_io) + v->hash_reqsize; r = verity_fec_ctr(v); if (r) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 0e1dd02a916f..aac3a1b1d94a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -39,9 +39,11 @@ struct dm_verity { struct dm_target *ti; struct dm_bufio_client *bufio; char *alg_name; - struct crypto_ahash *tfm; + struct crypto_ahash *ahash_tfm; /* either this or shash_tfm is set */ + struct crypto_shash *shash_tfm; /* either this or ahash_tfm is set */ u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ + u8 *initial_hashstate; /* salted initial state, if shash_tfm is set */ u8 *zero_digest; /* digest for a zero block */ unsigned int salt_size; sector_t data_start; /* data offset in 512-byte sectors */ @@ -56,7 +58,7 @@ struct dm_verity { bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ unsigned int digest_size; /* digest size for the current hash algorithm */ - unsigned int ahash_reqsize;/* the size of temporary space for crypto */ + unsigned int hash_reqsize; /* the size of temporary space for crypto */ enum verity_mode mode; /* mode for handling verification errors */ unsigned int corrupted_errs;/* Number of errors for corrupted blocks */ @@ -93,15 +95,17 @@ struct dm_verity_io { u8 want_digest[HASH_MAX_DIGESTSIZE]; /* - * This struct is followed by a variable-sized struct ahash_request of - * size v->ahash_reqsize. To access it, use verity_io_hash_req(). + * This struct is followed by a variable-sized hash request of size + * v->hash_reqsize, either a struct ahash_request or a struct shash_desc + * (depending on whether ahash_tfm or shash_tfm is being used). To + * access it, use verity_io_hash_req(). 
*/ }; -static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, - struct dm_verity_io *io) +static inline void *verity_io_hash_req(struct dm_verity *v, + struct dm_verity_io *io) { - return (struct ahash_request *)(io + 1); + return io + 1; } static inline u8 *verity_io_real_digest(struct dm_verity *v, From 3199a34bfaf7561410e0be1e33a61eba870768fc Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 2 Jul 2024 17:02:48 +0200 Subject: [PATCH 14/33] dm-raid: Fix WARN_ON_ONCE check for sync_thread in raid_resume dm-raid devices will occasionally trigger the following warning when being resumed after a table load because MD_RECOVERY_RUNNING is set: WARNING: CPU: 7 PID: 5660 at drivers/md/dm-raid.c:4105 raid_resume+0xee/0x100 [dm_raid] The failing check is: WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); This check is designed to make sure that the sync thread isn't registered, but md_check_recovery can set MD_RECOVERY_RUNNING without the sync_thread ever getting registered. Instead of checking if MD_RECOVERY_RUNNING is set, check if sync_thread is non-NULL. Fixes: 16c4770c75b1 ("dm-raid: really frozen sync_thread during suspend") Suggested-by: Yu Kuai Signed-off-by: Benjamin Marzinski Reviewed-by: Yu Kuai Signed-off-by: Mikulas Patocka --- drivers/md/dm-raid.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 052c00c1eb15..a790d6a82e14 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -4101,10 +4101,11 @@ static void raid_resume(struct dm_target *ti) if (mddev->delta_disks < 0) rs_set_capacity(rs); - WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)); - WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); - clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); mddev_lock_nointr(mddev); + WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)); + WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread, + lockdep_is_held(&mddev->reconfig_mutex))); + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); mddev->ro = 0; mddev->in_sync = 0; md_unfrozen_sync_thread(mddev); From 6fce1f40e95182ebbfe1ee3096b8fc0b37903269 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 2 Jul 2024 18:16:57 +0200 Subject: [PATCH 15/33] dm verity: add support for signature verification with platform keyring Add a new configuration CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING that enables verifying dm-verity signatures using the platform keyring, which is populated using the UEFI DB certificates. This is useful for self-enrolled systems that do not use MOK, as the secondary keyring, which is already used for verification if the relevant kconfig is enabled, is linked to the machine keyring, which gets its certificates loaded from MOK. On datacenter/virtual/cloud deployments it is more common to deploy one's own certificate chain directly in DB on first boot in unattended mode, rather than relying on MOK, as the latter typically requires interactive authentication to enroll, and is more suited for personal machines. Default to the same value as DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING if not otherwise specified, as it is likely that if one wants to use MOK certificates to verify dm-verity volumes, DB certificates are going to be used too. Keys in DB are allowed to load a full kernel already anyway, so they are already highly privileged.
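For illustration only (not part of the patch), a minimal userspace C sketch of the fallback order introduced here, assuming both keyring options are enabled; the verify() stub and the enum values are stand-ins for verify_pkcs7_signature() and the kernel keyrings:

#include <stdio.h>

#define ENOKEY 126

enum keyring { SECONDARY_KEYRING, PLATFORM_KEYRING };

/* Stand-in for verify_pkcs7_signature(); tweak the results to experiment. */
static int verify(enum keyring kr)
{
	return kr == SECONDARY_KEYRING ? -ENOKEY : 0;
}

int main(void)
{
	int ret = verify(SECONDARY_KEYRING);

	if (ret == -ENOKEY)	/* no matching key -> fall back to the UEFI DB keys */
		ret = verify(PLATFORM_KEYRING);
	printf("verification %s\n", ret ? "failed" : "succeeded");
	return 0;
}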
Signed-off-by: Luca Boccassi Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 10 ++++++++++ drivers/md/dm-verity-verify-sig.c | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 35b1080752cd..1e9db8e4acdf 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -540,6 +540,16 @@ config DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING If unsure, say N. +config DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING + bool "Verity data device root hash signature verification with platform keyring" + default DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING + depends on DM_VERITY_VERIFY_ROOTHASH_SIG + depends on INTEGRITY_PLATFORM_KEYRING + help + Rely also on the platform keyring to verify dm-verity signatures. + + If unsure, say N. + config DM_VERITY_FEC bool "Verity forward error correction support" depends on DM_VERITY diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index 4836508ea50c..d351d7d39c60 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -126,6 +126,13 @@ int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, NULL, #endif VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); +#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING + if (ret == -ENOKEY) + ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data, + sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); +#endif return ret; } From 0d815e3400e631d227a3a95968b8c8e7e0c0ef9e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 3 Jul 2024 15:00:29 +0200 Subject: [PATCH 16/33] dm-crypt: limit the size of encryption requests There was a performance regression reported where dm-crypt would perform worse on new kernels than on old kernels. The reason is that the old kernels split the bios to the NVMe request size (that is usually 65536 or 131072 bytes) and the new kernels pass the big bios through dm-crypt and split them underneath. If a big 1MiB bio is passed to dm-crypt, dm-crypt processes it on a single core without parallelization and this is what causes the performance degradation. This commit introduces new tunable variables /sys/module/dm_crypt/parameters/max_read_size and /sys/module/dm_crypt/parameters/max_write_size that specify the maximum bio size for dm-crypt. Bios larger than this value are split, so that they can be encrypted in parallel by multiple cores. If these variables are '0', a default 131072 is used. Splitting bios may cause performance regressions in other workloads - if this happens, the user should increase the value in the max_read_size and max_write_size variables. max_read_size: 128k 2399MiB/s 256k 2368MiB/s 512k 1986MiB/s 1024k 1790MiB/s max_write_size: 128k 1712MiB/s 256k 1651MiB/s 512k 1537MiB/s 1024k 1332MiB/s Note that if you run dm-crypt inside a virtual machine, you may need to do "echo numa >/sys/module/workqueue/parameters/default_affinity_scope" to improve performance.
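As an aside for reviewers, the clamp-and-align arithmetic described above can be modelled in plain userspace C; the 4096-byte logical block size and 512-byte dm-crypt sector size below are assumed example values, and max_request_sectors() is a hypothetical stand-in for the kernel's get_max_request_size():

#include <stdio.h>

#define SECTOR_SHIFT 9
#define PAGE_SHIFT 12
#define BIO_MAX_VECS 256
#define DEFAULT_MAX_SIZE 131072

static unsigned max_request_sectors(unsigned tunable, int wrt,
				    unsigned logical_block_size,
				    unsigned crypt_sector_size)
{
	unsigned val = tunable ? tunable : DEFAULT_MAX_SIZE;
	unsigned align = logical_block_size > crypt_sector_size ?
			 logical_block_size : crypt_sector_size;

	/* writes are capped at one bio's worth of pages */
	if (wrt && val > (BIO_MAX_VECS << PAGE_SHIFT))
		val = BIO_MAX_VECS << PAGE_SHIFT;
	val -= val % align;	/* round down to the sector alignment */
	if (!val)
		val = align;
	return val >> SECTOR_SHIFT;
}

int main(void)
{
	/* with the default tunable, a 1MiB write is split into 256-sector (128k) chunks */
	printf("%u sectors per chunk\n", max_request_sectors(0, 1, 4096, 512));
	return 0;
}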
Signed-off-by: Mikulas Patocka Tested-by: Laurence Oberman --- .../admin-guide/device-mapper/dm-crypt.rst | 11 +++++++ drivers/md/dm-crypt.c | 32 +++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index 41f5f57f00eb..e625830d335e 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -160,6 +160,17 @@ iv_large_sectors The must be multiple of (in 512 bytes units) if this flag is specified. + +Module parameters:: +max_read_size +max_write_size + Maximum size of read or write requests. When a request larger than this size + is received, dm-crypt will split the request. The splitting improves + concurrency (the split requests could be encrypted in parallel by multiple + cores), but it also causes overhead. The user should tune these parameters to + fit the actual workload. + + Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6c013ceb0e5f..f46ff843d4c3 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -241,6 +241,31 @@ static unsigned int dm_crypt_clients_n; static volatile unsigned long dm_crypt_pages_per_client; #define DM_CRYPT_MEMORY_PERCENT 2 #define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16) +#define DM_CRYPT_DEFAULT_MAX_READ_SIZE 131072 +#define DM_CRYPT_DEFAULT_MAX_WRITE_SIZE 131072 + +static unsigned int max_read_size = 0; +module_param(max_read_size, uint, 0644); +MODULE_PARM_DESC(max_read_size, "Maximum size of a read request"); +static unsigned int max_write_size = 0; +module_param(max_write_size, uint, 0644); +MODULE_PARM_DESC(max_write_size, "Maximum size of a write request"); +static unsigned get_max_request_size(struct crypt_config *cc, bool wrt) +{ + unsigned val, sector_align; + val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size); + if (likely(!val)) + val = !wrt ? DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE; + if (wrt || cc->on_disk_tag_size) { + if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT)) + val = BIO_MAX_VECS << PAGE_SHIFT; + } + sector_align = max(bdev_logical_block_size(cc->dev->bdev), (unsigned)cc->sector_size); + val = round_down(val, sector_align); + if (unlikely(!val)) + val = sector_align; + return val >> SECTOR_SHIFT; +} static void crypt_endio(struct bio *clone); static void kcryptd_queue_crypt(struct dm_crypt_io *io); @@ -3474,6 +3499,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) { struct dm_crypt_io *io; struct crypt_config *cc = ti->private; + unsigned max_sectors; /* * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues. @@ -3492,9 +3518,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. 
*/ - if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) && - (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) - dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT)); + max_sectors = get_max_request_size(cc, bio_data_dir(bio) == WRITE); + if (unlikely(bio_sectors(bio) > max_sectors)) + dm_accept_partial_bio(bio, max_sectors); /* * Ensure that bio is a multiple of internal sector encryption size From 0a94a469a4f02bdcc223517fd578810ffc21c548 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jul 2024 15:12:08 +0200 Subject: [PATCH 17/33] dm: stop using blk_limits_io_{min,opt} Remove use of the blk_limits_io_{min,opt} and assign the values directly to the queue_limits structure. For the io_opt this is a completely mechanical change, for io_min it removes flooring the limit to the physical and logical block size in the particular caller. But as blk_validate_limits will do the same later when actually applying the limits, there still is no change in overall behavior. Signed-off-by: Christoph Hellwig Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 4 ++-- drivers/md/dm-clone-target.c | 4 ++-- drivers/md/dm-ebs-target.c | 2 +- drivers/md/dm-era-target.c | 4 ++-- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-raid.c | 4 ++-- drivers/md/dm-stripe.c | 4 ++-- drivers/md/dm-thin.c | 6 +++--- drivers/md/dm-vdo/dm-vdo-target.c | 4 ++-- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-zoned-target.c | 4 ++-- 11 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2d8dd9283ff4..17f0fab1e254 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3416,8 +3416,8 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < cache->sectors_per_block || do_div(io_opt_sectors, cache->sectors_per_block)) { - blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); - blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); + limits->io_min = cache->sectors_per_block << SECTOR_SHIFT; + limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT; } disable_passdown_if_not_supported(cache); diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index b4384a8b13e3..12bbe487a4c8 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2073,8 +2073,8 @@ static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < clone->region_size || do_div(io_opt_sectors, clone->region_size)) { - blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT); - blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT); + limits->io_min = clone->region_size << SECTOR_SHIFT; + limits->io_opt = clone->region_size << SECTOR_SHIFT; } disable_passdown_if_not_supported(clone); diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index b70d4016c2ac..ec5db1478b2f 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -428,7 +428,7 @@ static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->logical_block_size = to_bytes(ec->e_bs); limits->physical_block_size = to_bytes(ec->u_bs); limits->alignment_offset = limits->physical_block_size; - blk_limits_io_min(limits, limits->logical_block_size); + limits->io_min = limits->logical_block_size; } static int ebs_iterate_devices(struct dm_target *ti, diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 
8f81e597858d..e627781b1420 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1733,8 +1733,8 @@ static void era_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < era->sectors_per_block || do_div(io_opt_sectors, era->sectors_per_block)) { - blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT); + limits->io_min = 0; + limits->io_opt = era->sectors_per_block << SECTOR_SHIFT; } } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 2a89f8eb4713..3c18d58358f5 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3471,7 +3471,7 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim if (ic->sectors_per_block > 1) { limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT); + limits->io_min = ic->sectors_per_block << SECTOR_SHIFT; limits->dma_alignment = limits->logical_block_size - 1; limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index a790d6a82e14..46559f5d21e8 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3802,8 +3802,8 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - blk_limits_io_min(limits, chunk_size_bytes); - blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); } static void raid_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index ec908cf8cfdc..4112071de0be 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -459,8 +459,8 @@ static void stripe_io_hints(struct dm_target *ti, struct stripe_c *sc = ti->private; unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; - blk_limits_io_min(limits, chunk_size); - blk_limits_io_opt(limits, chunk_size * sc->stripes); + limits->io_min = chunk_size; + limits->io_opt = chunk_size * sc->stripes; } static struct target_type stripe_target = { diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 991f77a5885b..a0c1620e90c8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4079,10 +4079,10 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) if (io_opt_sectors < pool->sectors_per_block || !is_factor(io_opt_sectors, pool->sectors_per_block)) { if (is_factor(pool->sectors_per_block, limits->max_sectors)) - blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + limits->io_min = limits->max_sectors << SECTOR_SHIFT; else - blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); - blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + limits->io_min = pool->sectors_per_block << SECTOR_SHIFT; + limits->io_opt = pool->sectors_per_block << SECTOR_SHIFT; } /* diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index b423bec6458b..b1a62de1562f 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -928,9 +928,9 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->physical_block_size = VDO_BLOCK_SIZE; /* The minimum io size for random io */ - 
blk_limits_io_min(limits, VDO_BLOCK_SIZE); + limits->io_min = VDO_BLOCK_SIZE; /* The optimal io size for streamed/sequential io */ - blk_limits_io_opt(limits, VDO_BLOCK_SIZE); + limits->io_opt = VDO_BLOCK_SIZE; /* * Sets the maximum discard size that will be passed into VDO. This value comes from a diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index adf21b095dc3..0a2399d958b7 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -919,7 +919,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) if (limits->physical_block_size < 1 << v->data_dev_block_bits) limits->physical_block_size = 1 << v->data_dev_block_bits; - blk_limits_io_min(limits, limits->logical_block_size); + limits->io_min = limits->logical_block_size; /* * Similar to what dm-crypt does, opt dm-verity out of support for diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index cd0ee144973f..6141fc25d842 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -996,8 +996,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->logical_block_size = DMZ_BLOCK_SIZE; limits->physical_block_size = DMZ_BLOCK_SIZE; - blk_limits_io_min(limits, DMZ_BLOCK_SIZE); - blk_limits_io_opt(limits, DMZ_BLOCK_SIZE); + limits->io_min = DMZ_BLOCK_SIZE; + limits->io_opt = DMZ_BLOCK_SIZE; limits->discard_alignment = 0; limits->discard_granularity = DMZ_BLOCK_SIZE; From b956d1a30f77bdeb7b7931bad0cee5fa8db25cbc Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 23 May 2024 22:07:16 +0100 Subject: [PATCH 18/33] dm vdo: remove unused struct 'uds_attribute' 'uds_attribute' is unused since commit a9da0fb6d8c6 ("dm vdo: remove all sysfs interfaces"). Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/dedupe.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 117266e1b3ae..39ac68614419 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -148,11 +148,6 @@ #include "vdo.h" #include "wait-queue.h" -struct uds_attribute { - struct attribute attr; - const char *(*show_string)(struct hash_zones *hash_zones); -}; - #define DEDUPE_QUERY_TIMER_IDLE 0 #define DEDUPE_QUERY_TIMER_RUNNING 1 #define DEDUPE_QUERY_TIMER_FIRED 2 From 7017ded001076593e03540840a5f5b80209bab67 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 24 May 2024 15:41:09 +0800 Subject: [PATCH 19/33] dm vdo indexer: use swap() instead of open coding it Use existing swap() macro rather than duplicating its implementation. 
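As a userspace illustration of the pattern (this local swap() merely mimics the kernel macro for the sake of a standalone example; it is not the kernel implementation):

#include <stdio.h>

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	int open_chapter = 1, writing_chapter = 2;

	/* the temporary lives inside the macro, so callers no longer declare one */
	swap(open_chapter, writing_chapter);
	printf("%d %d\n", open_chapter, writing_chapter);	/* prints "2 1" */
	return 0;
}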
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9173 Signed-off-by: Jiapeng Chong Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/indexer/index.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index 1ba767144426..df4934846244 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -197,15 +197,12 @@ static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_ static int swap_open_chapter(struct index_zone *zone) { int result; - struct open_chapter_zone *temporary_chapter; result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter); if (result != UDS_SUCCESS) return result; - temporary_chapter = zone->open_chapter; - zone->open_chapter = zone->writing_chapter; - zone->writing_chapter = temporary_chapter; + swap(zone->open_chapter, zone->writing_chapter); return UDS_SUCCESS; } From 396a27e91265a6632be17bebacb6743f0b9447be Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 15:45:00 +0200 Subject: [PATCH 20/33] dm: Remove max_write_zeroes_granularity The max_write_zeroes_granularity boolean of struct dm_target is used in __process_abnormal_io() but never set by any target. Remove this field and the dead code using it. Signed-off-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- drivers/md/dm.c | 2 -- include/linux/device-mapper.h | 6 ------ 2 files changed, 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3763b2ce557b..a63efa2a46ae 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1664,8 +1664,6 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; max_sectors = limits->max_write_zeroes_sectors; - if (ti->max_write_zeroes_granularity) - max_granularity = max_sectors; break; default: break; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3611b230d0aa..5b7e96653ec6 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -369,12 +369,6 @@ struct dm_target { */ bool max_secure_erase_granularity:1; - /* - * Set if this target requires that write_zeroes be split on - * 'max_write_zeroes_sectors' boundaries. - */ - bool max_write_zeroes_granularity:1; - /* * Set if we need to limit the number of in-flight bios when swapping. */ From 9d45db03acf9cee4f83148c403d105b1a38a0f23 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 15:45:25 +0200 Subject: [PATCH 21/33] dm: Remove max_secure_erase_granularity The max_secure_erase_granularity boolean of struct dm_target is used in __process_abnormal_io() but never set by any target. Remove this field and the dead code using it. 
Signed-off-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- drivers/md/dm.c | 2 -- include/linux/device-mapper.h | 6 ------ 2 files changed, 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a63efa2a46ae..2abf2b6865ea 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1658,8 +1658,6 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; max_sectors = limits->max_secure_erase_sectors; - if (ti->max_secure_erase_granularity) - max_granularity = max_sectors; break; case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 5b7e96653ec6..4ba2e73993bd 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -363,12 +363,6 @@ struct dm_target { */ bool max_discard_granularity:1; - /* - * Set if this target requires that secure_erases be split on - * 'max_secure_erase_sectors' boundaries. - */ - bool max_secure_erase_granularity:1; - /* * Set if we need to limit the number of in-flight bios when swapping. */ From 3708c7269593b836b1d684214cd9f5d83e4ed3fd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Jul 2024 16:09:57 +0200 Subject: [PATCH 22/33] dm-verity: fix dm_is_verity_target() when dm-verity is builtin When CONFIG_DM_VERITY=y, dm_is_verity_target() returned true for any builtin dm target, not just dm-verity. Fix this by checking for verity_target instead of THIS_MODULE (which is NULL for builtin code). Fixes: b6c1c5745ccc ("dm: Add verity helpers for LoadPin") Cc: stable@vger.kernel.org Cc: Matthias Kaehlcke Cc: Kees Cook Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 0a2399d958b7..cf659c8feb29 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1521,14 +1521,6 @@ bad: return r; } -/* - * Check whether a DM target is a verity target. - */ -bool dm_is_verity_target(struct dm_target *ti) -{ - return ti->type->module == THIS_MODULE; -} - /* * Get the verity mode (error behavior) of a verity target. * @@ -1582,6 +1574,14 @@ static struct target_type verity_target = { }; module_dm(verity); +/* + * Check whether a DM target is a verity target. + */ +bool dm_is_verity_target(struct dm_target *ti) +{ + return ti->type == &verity_target; +} + MODULE_AUTHOR("Mikulas Patocka "); MODULE_AUTHOR("Mandeep Baines "); MODULE_AUTHOR("Will Drewry "); From a21f9edb13b0d8066775cbd5efa7261e41871182 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Thu, 4 Jul 2024 16:17:15 +0200 Subject: [PATCH 23/33] dm: factor out helper function from dm_get_device Factor out a helper function, dm_devt_from_path(), from dm_get_device() for use in dm targets. Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-table.c | 33 ++++++++++++++++++++++++--------- include/linux/device-mapper.h | 5 +++++ 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 33b7a1844ed4..eea41e38d87e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -331,23 +331,15 @@ static int upgrade_mode(struct dm_dev_internal *dd, blk_mode_t new_mode, } /* - * Add a device to the list, or just increment the usage count if - * it's already present. 
- * * Note: the __ref annotation is because this function can call the __init * marked early_lookup_bdev when called during early boot code from dm-init.c. */ -int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, - struct dm_dev **result) +int __ref dm_devt_from_path(const char *path, dev_t *dev_p) { int r; dev_t dev; unsigned int major, minor; char dummy; - struct dm_dev_internal *dd; - struct dm_table *t = ti->table; - - BUG_ON(!t); if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { /* Extract the major/minor numbers */ @@ -363,6 +355,29 @@ int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, if (r) return r; } + *dev_p = dev; + return 0; +} +EXPORT_SYMBOL(dm_devt_from_path); + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, + struct dm_dev **result) +{ + int r; + dev_t dev; + struct dm_dev_internal *dd; + struct dm_table *t = ti->table; + + BUG_ON(!t); + + r = dm_devt_from_path(path, &dev); + if (r) + return r; + if (dev == disk_devt(t->md->disk)) return -EINVAL; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4ba2e73993bd..384649a61bfa 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -179,6 +179,11 @@ int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, struct dm_dev **result); void dm_put_device(struct dm_target *ti, struct dm_dev *d); +/* + * Helper function for getting devices + */ +int dm_devt_from_path(const char *path, dev_t *dev_p); + /* * Information about a target type */ From a48f6b82c5c444b6c4e2f3394c7e5719031c6b00 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Thu, 4 Jul 2024 16:18:44 +0200 Subject: [PATCH 24/33] dm mpath: don't call dm_get_device in multipath_message When multipath_message is called with an action and a device, it needs to find the pgpath that matches that device. dm_get_device() is not the right function for this. dm_get_device() will look for a table_device matching the requested path in use by either the live or inactive table. If it doesn't find the device, dm_get_device() will open it and add it to the table. This means that multipath_message will accept any block device, add it to the table if not present, and then look through the pgpaths to see if it finds a match. Afterwards it will remove the device if it was not previously in the table devices list. This is the only function that can modify the device list of a table besides the constructors and destructors, and it can only do this when it was passed an invalid message. Instead, multipath_message() should call dm_devt_from_path() to get the device dev_t, and match that against its pgpaths. Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-mpath.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 15b681b90153..637977acc3dc 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1419,8 +1419,7 @@ out: /* * Fail or reinstate all paths that match the provided struct dm_dev.
*/ -static int action_dev(struct multipath *m, struct dm_dev *dev, - action_fn action) +static int action_dev(struct multipath *m, dev_t dev, action_fn action) { int r = -EINVAL; struct pgpath *pgpath; @@ -1428,7 +1427,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev, list_for_each_entry(pg, &m->priority_groups, list) { list_for_each_entry(pgpath, &pg->pgpaths, list) { - if (pgpath->path.dev == dev) + if (pgpath->path.dev->bdev->bd_dev == dev) r = action(pgpath); } } @@ -1959,7 +1958,7 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg char *result, unsigned int maxlen) { int r = -EINVAL; - struct dm_dev *dev; + dev_t dev; struct multipath *m = ti->private; action_fn action; unsigned long flags; @@ -2008,7 +2007,7 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg goto out; } - r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); + r = dm_devt_from_path(argv[1], &dev); if (r) { DMWARN("message: error getting device %s", argv[1]); @@ -2017,8 +2016,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg r = action_dev(m, dev, action); - dm_put_device(ti, dev); - out: mutex_unlock(&m->work_mutex); return r; From 6a6c56130aaaeb893a237b2db058251d0f2800de Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 8 Jul 2024 22:06:58 +0200 Subject: [PATCH 25/33] dm-crypt: support for per-sector NVMe metadata Support per-sector NVMe metadata in dm-crypt. This commit changes dm-crypt, so that it can use NVMe metadata to store authentication information. We can put dm-crypt directly on top of an NVMe device, without using dm-integrity. This commit improves write throughput twofold, because there will be no writes to the dm-integrity journal. Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 47 ++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f46ff843d4c3..348b4b26c272 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -214,7 +214,8 @@ struct crypt_config { unsigned int integrity_tag_size; unsigned int integrity_iv_size; - unsigned int on_disk_tag_size; + unsigned int used_tag_size; + unsigned int tuple_size; /* * pool for per bio private data, crypto requests, @@ -256,7 +257,7 @@ static unsigned get_max_request_size(struct crypt_config *cc, bool wrt) val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size); if (likely(!val)) val = !wrt ?
DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE; - if (wrt || cc->on_disk_tag_size) { + if (wrt || cc->used_tag_size) { if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT)) val = BIO_MAX_VECS << PAGE_SHIFT; } @@ -1176,14 +1177,14 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) unsigned int tag_len; int ret; - if (!bio_sectors(bio) || !io->cc->on_disk_tag_size) + if (!bio_sectors(bio) || !io->cc->tuple_size) return 0; bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (IS_ERR(bip)) return PTR_ERR(bip); - tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); + tag_len = io->cc->tuple_size * (bio_sectors(bio) >> io->cc->sector_shift); bip->bip_iter.bi_sector = io->cc->start + io->sector; @@ -1207,18 +1208,18 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } - if (bi->tag_size != cc->on_disk_tag_size || - bi->tuple_size != cc->on_disk_tag_size) { + if (bi->tuple_size < cc->used_tag_size) { ti->error = "Integrity profile tag size mismatch."; return -EINVAL; } + cc->tuple_size = bi->tuple_size; if (1 << bi->interval_exp != cc->sector_size) { ti->error = "Integrity profile sector size mismatch."; return -EINVAL; } if (crypt_integrity_aead(cc)) { - cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; + cc->integrity_tag_size = cc->used_tag_size - cc->integrity_iv_size; DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md), cc->integrity_tag_size, cc->integrity_iv_size); @@ -1230,7 +1231,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md), cc->integrity_iv_size); - if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { + if ((cc->integrity_tag_size + cc->integrity_iv_size) > cc->tuple_size) { ti->error = "Not enough space for integrity tag in the profile."; return -EINVAL; } @@ -1309,7 +1310,7 @@ static void *tag_from_dmreq(struct crypt_config *cc, struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) * - cc->on_disk_tag_size]; + cc->tuple_size]; } static void *iv_tag_from_dmreq(struct crypt_config *cc, @@ -1390,9 +1391,9 @@ static int crypt_convert_block_aead(struct crypt_config *cc, aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, cc->sector_size, iv); r = crypto_aead_encrypt(req); - if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size) + if (cc->integrity_tag_size + cc->integrity_iv_size != cc->tuple_size) memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0, - cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size)); + cc->tuple_size - (cc->integrity_tag_size + cc->integrity_iv_size)); } else { aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, cc->sector_size + cc->integrity_tag_size, iv); @@ -1822,7 +1823,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) return; if (likely(!io->ctx.aead_recheck) && unlikely(io->ctx.aead_failed) && - cc->on_disk_tag_size && bio_data_dir(base_bio) == READ) { + cc->used_tag_size && bio_data_dir(base_bio) == READ) { io->ctx.aead_recheck = true; io->ctx.aead_failed = false; io->error = 0; @@ -3206,7 +3207,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar ti->error = "Invalid integrity arguments"; return -EINVAL; } - cc->on_disk_tag_size = val; + cc->used_tag_size = val; sval = strchr(opt_string + 
strlen("integrity:"), ':') + 1; if (!strcasecmp(sval, "aead")) { set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); @@ -3418,12 +3419,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; - cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size; + cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->tuple_size; if (!cc->tag_pool_max_sectors) cc->tag_pool_max_sectors = 1; ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS, - cc->tag_pool_max_sectors * cc->on_disk_tag_size); + cc->tag_pool_max_sectors * cc->tuple_size); if (ret) { ti->error = "Cannot allocate integrity tags mempool"; goto bad; @@ -3535,8 +3536,8 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); - if (cc->on_disk_tag_size) { - unsigned int tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift); + if (cc->tuple_size) { + unsigned int tag_len = cc->tuple_size * (bio_sectors(bio) >> cc->sector_shift); if (unlikely(tag_len > KMALLOC_MAX_SIZE)) io->integrity_metadata = NULL; @@ -3608,7 +3609,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); - if (cc->on_disk_tag_size) + if (cc->used_tag_size) num_feature_args++; if (num_feature_args) { DMEMIT(" %d", num_feature_args); @@ -3624,8 +3625,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" no_read_workqueue"); if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) DMEMIT(" no_write_workqueue"); - if (cc->on_disk_tag_size) - DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); + if (cc->used_tag_size) + DMEMIT(" integrity:%u:%s", cc->used_tag_size, cc->cipher_auth); if (cc->sector_size != (1 << SECTOR_SHIFT)) DMEMIT(" sector_size:%d", cc->sector_size); if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) @@ -3647,9 +3648,9 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(",iv_large_sectors=%c", test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags) ? 'y' : 'n'); - if (cc->on_disk_tag_size) + if (cc->used_tag_size) DMEMIT(",integrity_tag_size=%u,cipher_auth=%s", - cc->on_disk_tag_size, cc->cipher_auth); + cc->used_tag_size, cc->cipher_auth); if (cc->sector_size != (1 << SECTOR_SHIFT)) DMEMIT(",sector_size=%d", cc->sector_size); if (cc->cipher_string) @@ -3757,7 +3758,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 26, 0}, + .version = {1, 27, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 453496b899b5f62ff193bca46097f0f7211cec46 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 9 Jul 2024 13:56:12 +0200 Subject: [PATCH 26/33] dm raid: move _get_reshape_sectors() as prerequisite to fixing reshape size issues rs_set_dev_and_array_sectors() needs this function to calculate device and array size properly in case leg data devices have out-of-place reshape space allocated. 
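To make the size calculation concrete, a userspace model of the logic in _get_reshape_sectors(); the sample offsets are invented for illustration:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* out-of-place reshape space = |data_offset - new_data_offset|, floored by rs->data_offset */
static sector_t get_reshape_sectors(sector_t data_offset, sector_t new_data_offset,
				    sector_t rs_data_offset)
{
	sector_t reshape = data_offset > new_data_offset ?
			   data_offset - new_data_offset :
			   new_data_offset - data_offset;

	return reshape > rs_data_offset ? reshape : rs_data_offset;
}

int main(void)
{
	/* lvm2 shifted the data 8192 sectors forward to make room for the reshape */
	printf("%llu sectors of reshape space\n",
	       (unsigned long long)get_reshape_sectors(8192, 0, 0));
	return 0;
}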
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mikulas Patocka --- drivers/md/dm-raid.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 46559f5d21e8..871e278de662 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1626,6 +1626,23 @@ static int _check_data_dev_sectors(struct raid_set *rs) return 0; } +/* Get reshape sectors from data_offsets or raid set */ +static sector_t _get_reshape_sectors(struct raid_set *rs) +{ + struct md_rdev *rdev; + sector_t reshape_sectors = 0; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) { + reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? + rdev->data_offset - rdev->new_data_offset : + rdev->new_data_offset - rdev->data_offset; + break; + } + + return max(reshape_sectors, (sector_t) rs->data_offset); +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev) { @@ -2811,23 +2828,6 @@ static int rs_prepare_reshape(struct raid_set *rs) return 0; } -/* Get reshape sectors from data_offsets or raid set */ -static sector_t _get_reshape_sectors(struct raid_set *rs) -{ - struct md_rdev *rdev; - sector_t reshape_sectors = 0; - - rdev_for_each(rdev, &rs->md) - if (!test_bit(Journal, &rdev->flags)) { - reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? - rdev->data_offset - rdev->new_data_offset : - rdev->new_data_offset - rdev->data_offset; - break; - } - - return max(reshape_sectors, (sector_t) rs->data_offset); -} - /* * Reshape: * - change raid layout From d176fadb9e783c152d0820a50f84882b6c5ae314 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 9 Jul 2024 13:56:38 +0200 Subject: [PATCH 27/33] dm raid: fix stripes adding reshape size issues Adding stripes to an existing raid4/5/6/10 mapped device grows its capacity, though it'll only be made available _after_ the respective reshape has finished, as per MD kernel reshape semantics. Such reshaping involves moving a window forward, starting at the beginning of the device (BOD), reading content from the previous, smaller set of stripes and writing it back in the new layout with more stripes. Once that process finishes at the end of the previous data, the grown size may be announced and used. In order to avoid writing over any existing data in place, out-of-place space is added to the beginning of each data device by lvm2 before starting the reshape process. That reshape space wasn't taken into account in the data device size calculation.
Fixes resulting from the above: - correct the event handling conditions in do_table_event() to set the device's capacity only after the stripe-adding reshape has ended - subtract the mentioned out-of-place space when doing data device and array size calculations - conditionally set the capacity as of the superblock in preresume Testing: - passes all LVM2 RAID tests including the new lvconvert-raid-reshape-size.sh one Tested-by: Heinz Mauelshagen Signed-off-by: Heinz Mauelshagen Signed-off-by: Mikulas Patocka --- drivers/md/dm-raid.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 871e278de662..0c3323e0adb2 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1673,7 +1673,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, b if (sector_div(dev_sectors, data_stripes)) goto bad; - array_sectors = (data_stripes + delta_disks) * dev_sectors; + array_sectors = (data_stripes + delta_disks) * (dev_sectors - _get_reshape_sectors(rs)); if (sector_div(array_sectors, rs->raid10_copies)) goto bad; @@ -1682,7 +1682,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, b else /* Striped layouts */ - array_sectors = (data_stripes + delta_disks) * dev_sectors; + array_sectors = (data_stripes + delta_disks) * (dev_sectors - _get_reshape_sectors(rs)); mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; @@ -1721,11 +1721,20 @@ static void do_table_event(struct work_struct *ws) struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); smp_rmb(); /* Make sure we access most actual mddev properties */ - if (!rs_is_reshaping(rs)) { + + /* Only grow size resulting from added stripe(s) after reshape ended. */ + if (!rs_is_reshaping(rs) && + rs->array_sectors > rs->md.array_sectors && + !rs->md.delta_disks && + rs->md.raid_disks == rs->raid_disks) { + /* The raid10 personality doesn't provide proper device sizes -> correct. */ if (rs_is_raid10(rs)) rs_set_rdev_sectors(rs); + + rs->md.array_sectors = rs->array_sectors; rs_set_capacity(rs); } + dm_table_event(rs->ti->table); } @@ -4023,6 +4032,11 @@ static int raid_preresume(struct dm_target *ti) if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) return 0; + /* If different and no explicit grow request, expose MD array size as of superblock. */ + if (!test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) && + rs->array_sectors != mddev->array_sectors) + rs_set_capacity(rs); + /* * The superblocks need to be updated on disk if the * array is new or new devices got added (thus zeroed From 617069741dfbb141bc4574531a5dbbbad8ddf197 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 10 Jul 2024 20:53:12 +0200 Subject: [PATCH 28/33] dm: introduce the target flag mempool_needs_integrity This commit introduces the dm target flag mempool_needs_integrity. When the flag is set, device mapper will call bioset_integrity_create on its bio sets. The target can then call bio_integrity_alloc on the bios allocated from the table's mempool.
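A minimal userspace sketch of how the table-level decision is accumulated in the hunk below (the target names and flag values are illustrative, not taken from a real table):

#include <stdbool.h>
#include <stdio.h>

struct target { const char *name; bool mempool_needs_integrity; };

int main(void)
{
	struct target targets[] = {
		{ "linear", false },
		{ "integrity", true },		/* e.g. a target that allocates bips itself */
	};
	bool integrity_supported = false;	/* table-level starting point */
	bool needs = integrity_supported;
	unsigned i;

	/* one target opting in is enough to reserve integrity payloads */
	for (i = 0; i < sizeof(targets) / sizeof(targets[0]); i++)
		needs |= targets[i].mempool_needs_integrity;

	printf("bioset_integrity_create needed: %s\n", needs ? "yes" : "no");
	return 0;
}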
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 7 +++++-- include/linux/device-mapper.h | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index eea41e38d87e..dbd39b9722b9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1050,6 +1050,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; unsigned int bioset_flags = 0; + bool mempool_needs_integrity = t->integrity_supported; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1074,6 +1075,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); + + mempool_needs_integrity |= ti->mempool_needs_integrity; } pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (t->integrity_supported && + if (mempool_needs_integrity && bioset_integrity_create(&pools->io_bs, pool_size)) goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (t->integrity_supported && + if (mempool_needs_integrity && bioset_integrity_create(&pools->bs, pool_size)) goto out_free_pools; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 384649a61bfa..1363f87fbba6 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -405,6 +405,12 @@ struct dm_target { * support it. */ bool flush_bypasses_map:1; + + /* + * Set if the target calls bio_integrity_alloc on bios received + * in the map method. + */ + bool mempool_needs_integrity:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); From fb0987682c629c1d2c476f35f6fde405a5e304a4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 10 Jul 2024 21:00:18 +0200 Subject: [PATCH 29/33] dm-integrity: introduce the Inline mode This commit introduces a new 'I' mode for dm-integrity. The 'I' mode may be selected if the underlying device has a non-power-of-2 sector size. In this mode, dm-integrity will store integrity data directly in the device's sectors and it will not use a journal. This mode improves performance and reduces flash wear because there would be no journal writes.
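For intuition, a userspace sketch of the inline-mode payload arithmetic used below; the 64-byte tuple and 4096-byte block are assumed example values for a device formatted with per-block metadata:

#include <stdio.h>

int main(void)
{
	unsigned tuple_size = 64;		/* integrity bytes carried per block (example) */
	unsigned log2_sectors_per_block = 3;	/* 4096-byte blocks = 8 x 512-byte sectors */
	unsigned bio_sectors = 256;		/* a 128k bio */

	/* one tuple per data block, stored inline in the device's own sectors */
	unsigned payload_len = tuple_size * (bio_sectors >> log2_sectors_per_block);

	printf("%u bytes of integrity payload\n", payload_len);	/* prints 2048 */
	return 0;
}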
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 406 +++++++++++++++++++++++++++++++++----- 1 file changed, 360 insertions(+), 46 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3c18d58358f5..48ac628f471b 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -44,6 +44,7 @@ #define BITMAP_FLUSH_INTERVAL (10 * HZ) #define DISCARD_FILLER 0xf6 #define SALT_SIZE 16 +#define RECHECK_POOL_SIZE 256 /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -62,6 +63,7 @@ #define SB_VERSION_3 3 #define SB_VERSION_4 4 #define SB_VERSION_5 5 +#define SB_VERSION_6 6 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -86,6 +88,7 @@ struct superblock { #define SB_FLAG_DIRTY_BITMAP 0x4 #define SB_FLAG_FIXED_PADDING 0x8 #define SB_FLAG_FIXED_HMAC 0x10 +#define SB_FLAG_INLINE 0x20 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -166,6 +169,7 @@ struct dm_integrity_c { struct dm_dev *meta_dev; unsigned int tag_size; __s8 log2_tag_size; + unsigned int tuple_size; sector_t start; mempool_t journal_io_mempool; struct dm_io_client *io; @@ -279,6 +283,7 @@ struct dm_integrity_c { atomic64_t number_of_mismatches; mempool_t recheck_pool; + struct bio_set recheck_bios; struct notifier_block reboot_notifier; }; @@ -314,6 +319,9 @@ struct dm_integrity_io { struct completion *completion; struct dm_bio_details bio_details; + + char *integrity_payload; + bool integrity_payload_from_mempool; }; struct journal_completion { @@ -351,6 +359,7 @@ static struct kmem_cache *journal_io_cache; #endif static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); +static int dm_integrity_map_inline(struct dm_integrity_io *dio); static void integrity_bio_wait(struct work_struct *w); static void dm_integrity_dtr(struct dm_target *ti); @@ -460,7 +469,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned int *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE)) + ic->sb->version = SB_VERSION_6; + else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ic->sb->version = SB_VERSION_5; else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) ic->sb->version = SB_VERSION_4; @@ -1894,6 +1905,35 @@ error: dec_in_flight(dio); } +static inline bool dm_integrity_check_limits(struct dm_integrity_c *ic, sector_t logical_sector, struct bio *bio) +{ + if (unlikely(logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { + DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", + logical_sector, bio_sectors(bio), + ic->provided_data_sectors); + return false; + } + if (unlikely((logical_sector | bio_sectors(bio)) & (unsigned int)(ic->sectors_per_block - 1))) { + DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", + ic->sectors_per_block, + logical_sector, bio_sectors(bio)); + return false; + } + if (ic->sectors_per_block > 1 && likely(bio_op(bio) != REQ_OP_DISCARD)) { + struct bvec_iter iter; + struct bio_vec bv; + + bio_for_each_segment(bv, bio, iter) { + if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { + DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", + bv.bv_offset, bv.bv_len, ic->sectors_per_block); + return false; + } + } + } + return true; +} + static int dm_integrity_map(struct dm_target *ti, struct bio *bio) { struct dm_integrity_c *ic = ti->private; @@ -1906,6 +1946,9 @@ static int dm_integrity_map(struct dm_target *ti, struct 
bio *bio) dio->bi_status = 0; dio->op = bio_op(bio); + if (ic->mode == 'I') + return dm_integrity_map_inline(dio); + if (unlikely(dio->op == REQ_OP_DISCARD)) { if (ti->max_io_len) { sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector); @@ -1935,31 +1978,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) */ bio->bi_opf &= ~REQ_FUA; } - if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { - DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", - dio->range.logical_sector, bio_sectors(bio), - ic->provided_data_sectors); + if (unlikely(!dm_integrity_check_limits(ic, dio->range.logical_sector, bio))) return DM_MAPIO_KILL; - } - if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned int)(ic->sectors_per_block - 1))) { - DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", - ic->sectors_per_block, - dio->range.logical_sector, bio_sectors(bio)); - return DM_MAPIO_KILL; - } - - if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) { - struct bvec_iter iter; - struct bio_vec bv; - - bio_for_each_segment(bv, bio, iter) { - if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { - DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", - bv.bv_offset, bv.bv_len, ic->sectors_per_block); - return DM_MAPIO_KILL; - } - } - } bip = bio_integrity(bio); if (!ic->internal_hash) { @@ -2375,6 +2395,213 @@ journal_read_write: do_endio_flush(ic, dio); } +static int dm_integrity_map_inline(struct dm_integrity_io *dio) +{ + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + struct bio_integrity_payload *bip; + unsigned payload_len, digest_size, extra_size, ret; + + dio->integrity_payload = NULL; + dio->integrity_payload_from_mempool = false; + + if (unlikely(bio_integrity(bio))) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + bio_set_dev(bio, ic->dev->bdev); + if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) + return DM_MAPIO_REMAPPED; + +retry: + payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); + digest_size = crypto_shash_digestsize(ic->internal_hash); + extra_size = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; + payload_len += extra_size; + dio->integrity_payload = kmalloc(payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (unlikely(!dio->integrity_payload)) { + const unsigned x_size = PAGE_SIZE << 1; + if (payload_len > x_size) { + unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block; + if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + dm_accept_partial_bio(bio, sectors); + goto retry; + } + dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO)); + dio->integrity_payload_from_mempool = true; + } + + bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector); + dio->bio_details.bi_iter = bio->bi_iter; + + if (unlikely(!dm_integrity_check_limits(ic, bio->bi_iter.bi_sector, bio))) { + return DM_MAPIO_KILL; + } + + bio->bi_iter.bi_sector += ic->start + SB_SECTORS; + + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); + if (unlikely(IS_ERR(bip))) { + bio->bi_status = errno_to_blk_status(PTR_ERR(bip)); + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + if (dio->op == REQ_OP_WRITE) { + unsigned pos = 0; + while (dio->bio_details.bi_iter.bi_size) { + struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + const char *mem = bvec_kmap_local(&bv); + if (ic->tag_size < ic->tuple_size) + memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tag_size); + integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos); + kunmap_local(mem); + pos += ic->tuple_size; + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + } + + ret = bio_integrity_add_page(bio, virt_to_page(dio->integrity_payload), + payload_len, offset_in_page(dio->integrity_payload)); + if (unlikely(ret != payload_len)) { + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + return DM_MAPIO_REMAPPED; +} + +static inline void dm_integrity_free_payload(struct dm_integrity_io *dio) +{ + struct dm_integrity_c *ic = dio->ic; + if (unlikely(dio->integrity_payload_from_mempool)) + mempool_free(virt_to_page(dio->integrity_payload), &ic->recheck_pool); + else + kfree(dio->integrity_payload); + dio->integrity_payload = NULL; + dio->integrity_payload_from_mempool = false; +} + +static void dm_integrity_inline_recheck(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + struct dm_integrity_c *ic = dio->ic; + struct bio *outgoing_bio; + void *outgoing_data; + + dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO)); + dio->integrity_payload_from_mempool = true; + + outgoing_data = dio->integrity_payload + PAGE_SIZE; + + while (dio->bio_details.bi_iter.bi_size) { + char digest[HASH_MAX_DIGESTSIZE]; + int r; + struct bio_integrity_payload *bip; + struct bio_vec bv; + char *mem; + + outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios); + + r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0); + if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) { + bio_put(outgoing_bio); + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return; + } + + bip = bio_integrity_alloc(outgoing_bio,
GFP_NOIO, 1); + if (unlikely(IS_ERR(bip))) { + bio_put(outgoing_bio); + bio->bi_status = errno_to_blk_status(PTR_ERR(bip)); + bio_endio(bio); + return; + } + + r = bio_integrity_add_page(outgoing_bio, virt_to_page(dio->integrity_payload), ic->tuple_size, 0); + if (unlikely(r != ic->tuple_size)) { + bio_put(outgoing_bio); + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return; + } + + outgoing_bio->bi_iter.bi_sector = dio->bio_details.bi_iter.bi_sector + ic->start + SB_SECTORS; + + r = submit_bio_wait(outgoing_bio); + if (unlikely(r != 0)) { + bio_put(outgoing_bio); + bio->bi_status = errno_to_blk_status(r); + bio_endio(bio); + return; + } + bio_put(outgoing_bio); + + integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); + if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", + ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); + atomic64_inc(&ic->number_of_mismatches); + dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum", + bio, dio->bio_details.bi_iter.bi_sector, 0); + + bio->bi_status = BLK_STS_PROTECTION; + bio_endio(bio); + return; + } + + bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + mem = bvec_kmap_local(&bv); + memcpy(mem, outgoing_data, ic->sectors_per_block << SECTOR_SHIFT); + kunmap_local(mem); + + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + + bio_endio(bio); +} + +static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) +{ + struct dm_integrity_c *ic = ti->private; + if (ic->mode == 'I') { + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) { + unsigned pos = 0; + while (dio->bio_details.bi_iter.bi_size) { + char digest[HASH_MAX_DIGESTSIZE]; + struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + char *mem = bvec_kmap_local(&bv); + //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); + integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); + if (unlikely(memcmp(digest, dio->integrity_payload + pos, + min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + kunmap_local(mem); + dm_integrity_free_payload(dio); + INIT_WORK(&dio->work, dm_integrity_inline_recheck); + queue_work(ic->offload_wq, &dio->work); + return DM_ENDIO_INCOMPLETE; + } + kunmap_local(mem); + pos += ic->tuple_size; + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + } + if (likely(dio->op == REQ_OP_READ) || likely(dio->op == REQ_OP_WRITE)) { + dm_integrity_free_payload(dio); + } + } + return DM_ENDIO_DONE; +} static void integrity_bio_wait(struct work_struct *w) { @@ -2413,6 +2640,9 @@ static void integrity_commit(struct work_struct *w) del_timer(&ic->autocommit_timer); + if (ic->mode == 'I') + return; + spin_lock_irq(&ic->endio_wait.lock); flushes = bio_list_get(&ic->flush_bio_list); if (unlikely(ic->mode != 'J')) { @@ -3515,7 +3745,10 @@ static int calculate_device_limits(struct dm_integrity_c *ic) return -EINVAL; ic->initial_sectors = initial_sectors; - if (!ic->meta_dev) { + if (ic->mode == 'I') { + if (ic->initial_sectors + ic->provided_data_sectors > ic->meta_device_sectors) + return -EINVAL; + } else if (!ic->meta_dev) { sector_t last_sector, last_area, last_offset; /* we have to maintain excessive padding for compatibility with existing 
volumes */ @@ -3578,6 +3811,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); memcpy(ic->sb->magic, SB_MAGIC, 8); + if (ic->mode == 'I') + ic->sb->flags |= cpu_to_le32(SB_FLAG_INLINE); ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); if (ic->journal_mac_alg.alg_string) @@ -3587,6 +3822,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, journal_sections = journal_sectors / ic->journal_section_sectors; if (!journal_sections) journal_sections = 1; + if (ic->mode == 'I') + journal_sections = 0; if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) { ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC); @@ -4135,10 +4372,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv } if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || - !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { + !strcmp(argv[3], "D") || !strcmp(argv[3], "R") || + !strcmp(argv[3], "I")) { ic->mode = argv[3][0]; } else { - ti->error = "Invalid mode (expecting J, B, D, R)"; + ti->error = "Invalid mode (expecting J, B, D, R, I)"; r = -EINVAL; goto bad; } @@ -4284,6 +4522,53 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv else ic->log2_tag_size = -1; + if (ic->mode == 'I') { + struct blk_integrity *bi; + if (ic->meta_dev) { + r = -EINVAL; + ti->error = "Metadata device not supported in inline mode"; + goto bad; + } + if (!ic->internal_hash_alg.alg_string) { + r = -EINVAL; + ti->error = "Internal hash not set in inline mode"; + goto bad; + } + if (ic->journal_crypt_alg.alg_string || ic->journal_mac_alg.alg_string) { + r = -EINVAL; + ti->error = "Journal crypt not supported in inline mode"; + goto bad; + } + if (ic->discard) { + r = -EINVAL; + ti->error = "Discards not supported in inline mode"; + goto bad; + } + bi = blk_get_integrity(ic->dev->bdev->bd_disk); + if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) { + r = -EINVAL; + ti->error = "Integrity profile not supported"; + goto bad; + } + /*printk("tag_size: %u, tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/ + if (bi->tuple_size < ic->tag_size) { + r = -EINVAL; + ti->error = "The integrity profile is smaller than tag size"; + goto bad; + } + if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) { + r = -EINVAL; + ti->error = "Too big tuple size"; + goto bad; + } + ic->tuple_size = bi->tuple_size; + if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) { + r = -EINVAL; + ti->error = "Integrity profile sector size mismatch"; + goto bad; + } + } + if (ic->mode == 'B' && !ic->internal_hash) { r = -EINVAL; ti->error = "Bitmap mode can be only used with internal hash"; @@ -4314,12 +4599,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv goto bad; } - r = mempool_init_page_pool(&ic->recheck_pool, 1, 0); + r = mempool_init_page_pool(&ic->recheck_pool, 1, ic->mode == 'I' ? 
1 : 0); if (r) { ti->error = "Cannot allocate mempool"; goto bad; } + if (ic->mode == 'I') { + r = bioset_init(&ic->recheck_bios, RECHECK_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (r) { + ti->error = "Cannot allocate bio set"; + goto bad; + } + r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); + if (r) { + ti->error = "Cannot allocate bio integrity set"; + r = -ENOMEM; + goto bad; + } + } + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); if (!ic->metadata_wq) { @@ -4396,11 +4695,16 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_5) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_6) { r = -EINVAL; ti->error = "Unknown version"; goto bad; } + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE)) != (ic->mode == 'I')) { + r = -EINVAL; + ti->error = "Inline flag mismatch"; + goto bad; + } if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { r = -EINVAL; ti->error = "Tag size doesn't match the information in superblock"; @@ -4411,9 +4715,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Block size doesn't match the information in superblock"; goto bad; } - if (!le32_to_cpu(ic->sb->journal_sections)) { + if (!le32_to_cpu(ic->sb->journal_sections) != (ic->mode == 'I')) { r = -EINVAL; - ti->error = "Corrupted superblock, journal_sections is 0"; + if (ic->mode != 'I') + ti->error = "Corrupted superblock, journal_sections is 0"; + else + ti->error = "Corrupted superblock, journal_sections is not 0"; goto bad; } /* make sure that ti->max_io_len doesn't overflow */ @@ -4465,8 +4772,9 @@ try_smaller_buffer: bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3); if (bits_in_journal > UINT_MAX) bits_in_journal = UINT_MAX; - while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) - log2_sectors_per_bitmap_bit++; + if (bits_in_journal) + while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) + log2_sectors_per_bitmap_bit++; log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block; ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; @@ -4486,7 +4794,6 @@ try_smaller_buffer: goto bad; } - threshold = (__u64)ic->journal_entries * (100 - journal_watermark); threshold += 50; do_div(threshold, 100); @@ -4538,17 +4845,19 @@ try_smaller_buffer: goto bad; } - ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, - 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); - if (IS_ERR(ic->bufio)) { - r = PTR_ERR(ic->bufio); - ti->error = "Cannot initialize dm-bufio"; - ic->bufio = NULL; - goto bad; + if (ic->mode != 'I') { + ic->bufio = dm_bufio_client_create(ic->meta_dev ? 
ic->meta_dev->bdev : ic->dev->bdev, + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); + if (IS_ERR(ic->bufio)) { + r = PTR_ERR(ic->bufio); + ti->error = "Cannot initialize dm-bufio"; + ic->bufio = NULL; + goto bad; + } + dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); } - dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); - if (ic->mode != 'R') { + if (ic->mode != 'R' && ic->mode != 'I') { r = create_journal(ic, &ti->error); if (r) goto bad; @@ -4608,7 +4917,7 @@ try_smaller_buffer: ic->just_formatted = true; } - if (!ic->meta_dev) { + if (!ic->meta_dev && ic->mode != 'I') { r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); if (r) goto bad; @@ -4632,6 +4941,9 @@ try_smaller_buffer: if (ic->discard) ti->num_discard_bios = 1; + if (ic->mode == 'I') + ti->mempool_needs_integrity = true; + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1); return 0; @@ -4665,6 +4977,7 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + bioset_exit(&ic->recheck_bios); mempool_exit(&ic->recheck_pool); mempool_exit(&ic->journal_io_mempool); if (ic->io) @@ -4718,12 +5031,13 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 11, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, .dtr = dm_integrity_dtr, .map = dm_integrity_map, + .end_io = dm_integrity_end_io, .postsuspend = dm_integrity_postsuspend, .resume = dm_integrity_resume, .status = dm_integrity_status, From 0b60be1628e3448fc96e28bde3d0b27e6d8743fc Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Jul 2024 09:13:56 +0200 Subject: [PATCH 30/33] dm: Constify struct dm_block_validator 'struct dm_block_validator' is not modified in these drivers. Constifying this structure moves some data to a read-only section, increasing overall security.
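As a minimal, self-contained sketch of the pattern (illustrative only -- the type and function names below are invented for the example, not taken from the patch):

#include <stddef.h>

/*
 * A validator-style ops structure. Once every caller and every callback
 * takes a "const" pointer to it, the object itself can be declared const,
 * and the compiler may then place it in a read-only section (.rodata).
 */
struct example_validator {
	const char *name;
	void (*prepare_for_write)(const struct example_validator *v,
				  void *block, size_t block_size);
	int (*check)(const struct example_validator *v,
		     const void *block, size_t block_size);
};

static void example_prepare_for_write(const struct example_validator *v,
				      void *block, size_t block_size)
{
	/* would recompute and store a checksum before the block is written */
}

static int example_check(const struct example_validator *v,
			 const void *block, size_t block_size)
{
	/* would verify the stored checksum on read; 0 means valid */
	return 0;
}

/* "static const" is what moves this object out of writable data */
static const struct example_validator example_validator = {
	.name = "example",
	.prepare_for_write = example_prepare_for_write,
	.check = example_check,
};

The size comparison below shows this effect: the data column shrinks while the text column (GNU size counts read-only data as text) grows correspondingly.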
On x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 32047 920 16 32983 80d7 drivers/md/dm-cache-metadata.o After: ===== text data bss dec hex filename 32075 896 16 32987 80db drivers/md/dm-cache-metadata.o Signed-off-by: Christophe JAILLET Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-metadata.c | 6 +++--- drivers/md/dm-clone-metadata.c | 6 +++--- drivers/md/dm-era-target.c | 6 +++--- drivers/md/dm-thin-metadata.c | 6 +++--- drivers/md/persistent-data/dm-array.c | 6 +++--- drivers/md/persistent-data/dm-block-manager.c | 12 ++++++------ drivers/md/persistent-data/dm-block-manager.h | 14 ++++++++------ drivers/md/persistent-data/dm-btree-internal.h | 2 +- drivers/md/persistent-data/dm-btree-spine.c | 6 +++--- drivers/md/persistent-data/dm-space-map-common.c | 12 ++++++------ .../md/persistent-data/dm-transaction-manager.c | 8 ++++---- .../md/persistent-data/dm-transaction-manager.h | 6 +++--- 12 files changed, 46 insertions(+), 44 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 0ad9dc1824fa..24cd87fddf75 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -170,7 +170,7 @@ struct dm_cache_metadata { */ #define SUPERBLOCK_CSUM_XOR 9031977 -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -195,7 +195,7 @@ static int check_metadata_version(struct cache_disk_superblock *disk_super) return 0; } -static int sb_check(struct dm_block_validator *v, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -228,7 +228,7 @@ static int sb_check(struct dm_block_validator *v, return check_metadata_version(disk_super); } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index 47c1fa7aad8b..2db84cd2202b 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -163,7 +163,7 @@ struct dm_clone_metadata { /* * Superblock validation.
*/ -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { struct superblock_disk *sb; @@ -177,7 +177,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v, sb->csum = cpu_to_le32(csum); } -static int sb_check(struct dm_block_validator *v, struct dm_block *b, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { struct superblock_disk *sb; @@ -220,7 +220,7 @@ static int sb_check(struct dm_block_validator *v, struct dm_block *b, return 0; } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index e627781b1420..9c84e9d13eca 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -196,7 +196,7 @@ struct superblock_disk { * Superblock validation *-------------------------------------------------------------- */ -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -221,7 +221,7 @@ static int check_metadata_version(struct superblock_disk *disk) return 0; } -static int sb_check(struct dm_block_validator *v, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) { @@ -254,7 +254,7 @@ static int sb_check(struct dm_block_validator *v, return check_metadata_version(disk); } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 6022189c1388..f90679cfec5b 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -249,7 +249,7 @@ struct dm_thin_device { */ #define SUPERBLOCK_CSUM_XOR 160774 -static void sb_prepare_for_write(struct dm_block_validator *v, +static void sb_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -261,7 +261,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v, SUPERBLOCK_CSUM_XOR)); } -static int sb_check(struct dm_block_validator *v, +static int sb_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -294,7 +294,7 @@ static int sb_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator sb_validator = { +static const struct dm_block_validator sb_validator = { .name = "superblock", .prepare_for_write = sb_prepare_for_write, .check = sb_check diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 798c9c53a343..157c9bd2fed7 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -38,7 +38,7 @@ struct array_block { */ #define CSUM_XOR 595846735 -static void array_block_prepare_for_write(struct dm_block_validator *v, +static void array_block_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t size_of_block) { @@ -50,7 +50,7 @@ static void array_block_prepare_for_write(struct dm_block_validator *v, CSUM_XOR)); } -static int array_block_check(struct dm_block_validator *v, +static int array_block_check(const struct dm_block_validator *v, struct 
dm_block *b, size_t size_of_block) { @@ -77,7 +77,7 @@ static int array_block_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator array_validator = { +static const struct dm_block_validator array_validator = { .name = "array", .prepare_for_write = array_block_prepare_for_write, .check = array_block_check diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index b17b54df673b..1ef71e5fcde7 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -345,7 +345,7 @@ void *dm_block_data(struct dm_block *b) EXPORT_SYMBOL_GPL(dm_block_data); struct buffer_aux { - struct dm_block_validator *validator; + const struct dm_block_validator *validator; int write_locked; #ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING @@ -441,7 +441,7 @@ dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) static int dm_bm_validate_buffer(struct dm_block_manager *bm, struct dm_buffer *buf, struct buffer_aux *aux, - struct dm_block_validator *v) + const struct dm_block_validator *v) { if (unlikely(!aux->validator)) { int r; @@ -467,7 +467,7 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm, return 0; } int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result) { struct buffer_aux *aux; @@ -500,7 +500,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, EXPORT_SYMBOL_GPL(dm_bm_read_lock); int dm_bm_write_lock(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, + dm_block_t b, const struct dm_block_validator *v, struct dm_block **result) { struct buffer_aux *aux; @@ -536,7 +536,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, EXPORT_SYMBOL_GPL(dm_bm_write_lock); int dm_bm_read_try_lock(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, + dm_block_t b, const struct dm_block_validator *v, struct dm_block **result) { struct buffer_aux *aux; @@ -569,7 +569,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, } int dm_bm_write_lock_zero(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, + dm_block_t b, const struct dm_block_validator *v, struct dm_block **result) { int r; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index f706d3de8d5a..b1998968594c 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -51,12 +51,14 @@ dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); */ struct dm_block_validator { const char *name; - void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + void (*prepare_for_write)(const struct dm_block_validator *v, + struct dm_block *b, size_t block_size); /* * Return 0 if the checksum is valid or < 0 on error. */ - int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + int (*check)(const struct dm_block_validator *v, + struct dm_block *b, size_t block_size); }; /*----------------------------------------------------------------*/ @@ -73,11 +75,11 @@ struct dm_block_validator { * written back to the disk sometime after dm_bm_unlock is called. 
*/ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); /* @@ -85,7 +87,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, * available immediately. */ int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); /* @@ -93,7 +95,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, * overwrite the block completely. It saves a disk read. */ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); void dm_bm_unlock(struct dm_block *b); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 7ed2ce656fcc..acebd32858a7 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -138,7 +138,7 @@ static inline uint64_t value64(struct btree_node *n, uint32_t index) */ int lower_bound(struct btree_node *n, uint64_t key); -extern struct dm_block_validator btree_node_validator; +extern const struct dm_block_validator btree_node_validator; /* * Value type for upper levels of multi-level btrees. diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 7540383b7cf3..c46fc50c274e 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -16,7 +16,7 @@ #define BTREE_CSUM_XOR 121107 -static void node_prepare_for_write(struct dm_block_validator *v, +static void node_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -29,7 +29,7 @@ static void node_prepare_for_write(struct dm_block_validator *v, BTREE_CSUM_XOR)); } -static int node_check(struct dm_block_validator *v, +static int node_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -81,7 +81,7 @@ static int node_check(struct dm_block_validator *v, return 0; } -struct dm_block_validator btree_node_validator = { +const struct dm_block_validator btree_node_validator = { .name = "btree_node", .prepare_for_write = node_prepare_for_write, .check = node_check diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 591d1a43d035..3a19124ee279 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -22,7 +22,7 @@ */ #define INDEX_CSUM_XOR 160478 -static void index_prepare_for_write(struct dm_block_validator *v, +static void index_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -34,7 +34,7 @@ static void index_prepare_for_write(struct dm_block_validator *v, INDEX_CSUM_XOR)); } -static int index_check(struct dm_block_validator *v, +static int index_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -59,7 +59,7 @@ static int index_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator index_validator = { +static const struct dm_block_validator index_validator = { .name = "index", .prepare_for_write = index_prepare_for_write, .check = 
index_check @@ -72,7 +72,7 @@ static struct dm_block_validator index_validator = { */ #define BITMAP_CSUM_XOR 240779 -static void dm_bitmap_prepare_for_write(struct dm_block_validator *v, +static void dm_bitmap_prepare_for_write(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -84,7 +84,7 @@ static void dm_bitmap_prepare_for_write(struct dm_block_validator *v, BITMAP_CSUM_XOR)); } -static int dm_bitmap_check(struct dm_block_validator *v, +static int dm_bitmap_check(const struct dm_block_validator *v, struct dm_block *b, size_t block_size) { @@ -109,7 +109,7 @@ static int dm_bitmap_check(struct dm_block_validator *v, return 0; } -static struct dm_block_validator dm_sm_bitmap_validator = { +static const struct dm_block_validator dm_sm_bitmap_validator = { .name = "sm_bitmap", .prepare_for_write = dm_bitmap_prepare_for_write, .check = dm_bitmap_check, diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index c88fa6266203..c7ba4e6cbbc7 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -237,7 +237,7 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) EXPORT_SYMBOL_GPL(dm_tm_commit); int dm_tm_new_block(struct dm_transaction_manager *tm, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result) { int r; @@ -266,7 +266,7 @@ int dm_tm_new_block(struct dm_transaction_manager *tm, } static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result) { int r; @@ -306,7 +306,7 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, } int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, struct dm_block **result, + const struct dm_block_validator *v, struct dm_block **result, int *inc_children) { int r; @@ -331,7 +331,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, EXPORT_SYMBOL_GPL(dm_tm_shadow_block); int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **blk) { if (tm->is_clone) { diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 01f7e650118d..61a8d10825ca 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -64,7 +64,7 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock) * Zeroes the new block and returns with write lock held. */ int dm_tm_new_block(struct dm_transaction_manager *tm, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); /* @@ -84,7 +84,7 @@ int dm_tm_new_block(struct dm_transaction_manager *tm, * it locked when you call this. */ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result, int *inc_children); /* @@ -92,7 +92,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, * on it outstanding then it'll block. 
*/ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, - struct dm_block_validator *v, + const struct dm_block_validator *v, struct dm_block **result); void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); From fa398e603ff79523ac5f40632f061396baa58593 Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Mon, 15 Jul 2024 12:45:15 -0400 Subject: [PATCH 31/33] dm vdo repair: add missing kerneldoc fields Also remove trivial comment for increment_recovery_point. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9518 Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/repair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index defc9359f10e..b6f3d0710a21 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -318,6 +318,7 @@ static bool __must_check abort_on_error(int result, struct repair_completion *re /** * drain_slab_depot() - Flush out all dirty refcounts blocks now that they have been rebuilt or * recovered. + * @completion: The repair completion. */ static void drain_slab_depot(struct vdo_completion *completion) { @@ -653,9 +654,6 @@ static void rebuild_reference_counts(struct vdo_completion *completion) vdo_traverse_forest(vdo->block_map, process_entry, completion); } -/** - * increment_recovery_point() - Move the given recovery point forward by one entry. - */ static void increment_recovery_point(struct recovery_point *point) { if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) @@ -952,6 +950,7 @@ static void abort_block_map_recovery(struct repair_completion *repair, int resul /** * find_entry_starting_next_page() - Find the first journal entry after a given entry which is not * on the same block map page. + * @repair: The repair completion. * @current_entry: The entry to search from. * @needs_sort: Whether sorting is needed to proceed. * @@ -1218,6 +1217,7 @@ static bool __must_check is_exact_recovery_journal_block(const struct recovery_j /** * find_recovery_journal_head_and_tail() - Find the tail and head of the journal. + * @repair: The repair completion. * * Return: True if there were valid journal blocks. */ @@ -1446,6 +1446,7 @@ static int validate_heads(struct repair_completion *repair) /** * extract_new_mappings() - Find all valid new mappings to be applied to the block map. + * @repair: The repair completion. * * The mappings are extracted from the journal and stored in a sortable array so that all of the * mappings to be applied to a given block map page can be done in a single page fetch. @@ -1500,6 +1501,7 @@ static int extract_new_mappings(struct repair_completion *repair) /** * compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of * the journal. + * @repair: The repair completion. 
*/ static noinline int compute_usages(struct repair_completion *repair) { From 513789b7fb5366a7c26c9d347d83eaba6c33a537 Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Mon, 15 Jul 2024 10:43:39 -0400 Subject: [PATCH 32/33] dm vdo int-map: fix kerneldoc formatting Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407141607.M3E2XQ0Z-lkp@intel.com/ Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/int-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index 3aa438f84ea1..f6fe58e437b3 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -96,7 +96,7 @@ struct int_map { size_t size; /** @capacity: The number of neighborhoods in the map. */ size_t capacity; - /* @bucket_count: The number of buckets in the bucket array. */ + /** @bucket_count: The number of buckets in the bucket array. */ size_t bucket_count; /** @buckets: The array of hash buckets. */ struct bucket *buckets; From 7f1c4909a821dbfd258177db9ec96dda3ae91346 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Wed, 17 Jul 2024 04:05:59 +0900 Subject: [PATCH 33/33] dm vdo: fix a minor formatting issue in vdo.rst Signed-off-by: Masatake YAMATO Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- Documentation/admin-guide/device-mapper/vdo.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/device-mapper/vdo.rst b/Documentation/admin-guide/device-mapper/vdo.rst index 7e1ecafdf91e..c69ac186863a 100644 --- a/Documentation/admin-guide/device-mapper/vdo.rst +++ b/Documentation/admin-guide/device-mapper/vdo.rst @@ -241,6 +241,7 @@ Messages All vdo devices accept messages in the form: :: + dmsetup message 0 The messages are:
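As a closing illustration of the kerneldoc convention the last two fixes enforce (the function and parameter names here are invented for the example, not taken from the patches):

/**
 * example_compute_usages() - One-line summary of the function.
 * @repair: Each parameter needs its own "@name:" line; patch 31 adds the
 *          lines that were missing from otherwise-complete headers.
 *
 * A block is only parsed by scripts/kernel-doc when it opens with the
 * double-star marker. A member comment opening with a single star, as
 * fixed for @bucket_count in patch 32, is treated as a plain comment,
 * and the tool then warns that the member is undocumented.
 *
 * Return: 0 on success or a negative errno on failure.
 */
static int example_compute_usages(void *repair)
{
	return 0;
}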