md: fix deadlock between mddev_suspend() and md_write_start()
If mddev_suspend() races with md_write_start() we can deadlock with mddev_suspend() waiting for the request that is currently in md_write_start() to complete the ->make_request() call, and md_write_start() waiting for the metadata to be updated to mark the array as 'dirty'. As metadata updates done by md_check_recovery() only happen when the mddev_lock() can be claimed, and as mddev_suspend() is often called with the lock held, these threads wait indefinitely for each other. We fix this by having md_write_start() abort if mddev_suspend() is happening, and ->make_request() aborts if md_write_start() aborted. md_make_request() can detect this abort, decrease the ->active_io count, and wait for mddev_suspend(). Reported-by: Nix <nix@esperi.org.uk> Fix: 68866e425be2(MD: no sync IO while suspended) Cc: stable@vger.kernel.org Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
This commit is contained in:
parent
63f700aab4
commit
cc27b0c78c
@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
|
||||
conf->nfaults = n+1;
|
||||
}
|
||||
|
||||
static void faulty_make_request(struct mddev *mddev, struct bio *bio)
|
||||
static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct faulty_conf *conf = mddev->private;
|
||||
int failit = 0;
|
||||
@ -182,7 +182,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
|
||||
* just fail immediately
|
||||
*/
|
||||
bio_io_error(bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (check_sector(conf, bio->bi_iter.bi_sector,
|
||||
@ -224,6 +224,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
|
||||
bio->bi_bdev = conf->rdev->bdev;
|
||||
|
||||
generic_make_request(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void faulty_status(struct seq_file *seq, struct mddev *mddev)
|
||||
|
@ -245,7 +245,7 @@ static void linear_free(struct mddev *mddev, void *priv)
|
||||
kfree(conf);
|
||||
}
|
||||
|
||||
static void linear_make_request(struct mddev *mddev, struct bio *bio)
|
||||
static bool linear_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
char b[BDEVNAME_SIZE];
|
||||
struct dev_info *tmp_dev;
|
||||
@ -254,7 +254,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
|
||||
md_flush_request(mddev, bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
tmp_dev = which_dev(mddev, bio_sector);
|
||||
@ -292,7 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
|
||||
mddev_check_write_zeroes(mddev, bio);
|
||||
generic_make_request(bio);
|
||||
}
|
||||
return;
|
||||
return true;
|
||||
|
||||
out_of_bounds:
|
||||
pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
|
||||
@ -302,6 +302,7 @@ out_of_bounds:
|
||||
(unsigned long long)tmp_dev->rdev->sectors,
|
||||
(unsigned long long)start_sector);
|
||||
bio_io_error(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void linear_status (struct seq_file *seq, struct mddev *mddev)
|
||||
|
@ -277,7 +277,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
|
||||
bio_endio(bio);
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
smp_rmb(); /* Ensure implications of 'active' are visible */
|
||||
check_suspended:
|
||||
rcu_read_lock();
|
||||
if (mddev->suspended) {
|
||||
DEFINE_WAIT(__wait);
|
||||
@ -302,7 +302,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
|
||||
sectors = bio_sectors(bio);
|
||||
/* bio could be mergeable after passing to underlayer */
|
||||
bio->bi_opf &= ~REQ_NOMERGE;
|
||||
mddev->pers->make_request(mddev, bio);
|
||||
if (!mddev->pers->make_request(mddev, bio)) {
|
||||
atomic_dec(&mddev->active_io);
|
||||
wake_up(&mddev->sb_wait);
|
||||
goto check_suspended;
|
||||
}
|
||||
|
||||
cpu = part_stat_lock();
|
||||
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
|
||||
@ -327,6 +331,7 @@ void mddev_suspend(struct mddev *mddev)
|
||||
if (mddev->suspended++)
|
||||
return;
|
||||
synchronize_rcu();
|
||||
wake_up(&mddev->sb_wait);
|
||||
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
|
||||
mddev->pers->quiesce(mddev, 1);
|
||||
|
||||
@ -7950,12 +7955,14 @@ EXPORT_SYMBOL(md_done_sync);
|
||||
* If we need to update some array metadata (e.g. 'active' flag
|
||||
* in superblock) before writing, schedule a superblock update
|
||||
* and wait for it to complete.
|
||||
* A return value of 'false' means that the write wasn't recorded
|
||||
* and cannot proceed as the array is being suspended.
|
||||
*/
|
||||
void md_write_start(struct mddev *mddev, struct bio *bi)
|
||||
bool md_write_start(struct mddev *mddev, struct bio *bi)
|
||||
{
|
||||
int did_change = 0;
|
||||
if (bio_data_dir(bi) != WRITE)
|
||||
return;
|
||||
return true;
|
||||
|
||||
BUG_ON(mddev->ro == 1);
|
||||
if (mddev->ro == 2) {
|
||||
@ -7987,7 +7994,12 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
|
||||
if (did_change)
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
wait_event(mddev->sb_wait,
|
||||
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
|
||||
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
|
||||
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
|
||||
percpu_ref_put(&mddev->writes_pending);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL(md_write_start);
|
||||
|
||||
|
@ -510,7 +510,7 @@ struct md_personality
|
||||
int level;
|
||||
struct list_head list;
|
||||
struct module *owner;
|
||||
void (*make_request)(struct mddev *mddev, struct bio *bio);
|
||||
bool (*make_request)(struct mddev *mddev, struct bio *bio);
|
||||
int (*run)(struct mddev *mddev);
|
||||
void (*free)(struct mddev *mddev, void *priv);
|
||||
void (*status)(struct seq_file *seq, struct mddev *mddev);
|
||||
@ -649,7 +649,7 @@ extern void md_wakeup_thread(struct md_thread *thread);
|
||||
extern void md_check_recovery(struct mddev *mddev);
|
||||
extern void md_reap_sync_thread(struct mddev *mddev);
|
||||
extern int mddev_init_writes_pending(struct mddev *mddev);
|
||||
extern void md_write_start(struct mddev *mddev, struct bio *bi);
|
||||
extern bool md_write_start(struct mddev *mddev, struct bio *bi);
|
||||
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
|
||||
extern void md_write_end(struct mddev *mddev);
|
||||
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
|
||||
|
@ -106,7 +106,7 @@ static void multipath_end_request(struct bio *bio)
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
|
||||
static void multipath_make_request(struct mddev *mddev, struct bio * bio)
|
||||
static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
|
||||
{
|
||||
struct mpconf *conf = mddev->private;
|
||||
struct multipath_bh * mp_bh;
|
||||
@ -114,7 +114,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
|
||||
md_flush_request(mddev, bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
|
||||
@ -126,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
|
||||
if (mp_bh->path < 0) {
|
||||
bio_io_error(bio);
|
||||
mempool_free(mp_bh, conf->pool);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
multipath = conf->multipaths + mp_bh->path;
|
||||
|
||||
@ -141,7 +141,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
|
||||
mddev_check_writesame(mddev, &mp_bh->bio);
|
||||
mddev_check_write_zeroes(mddev, &mp_bh->bio);
|
||||
generic_make_request(&mp_bh->bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
static void multipath_status(struct seq_file *seq, struct mddev *mddev)
|
||||
|
@ -548,7 +548,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
|
||||
bio_endio(bio);
|
||||
}
|
||||
|
||||
static void raid0_make_request(struct mddev *mddev, struct bio *bio)
|
||||
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct strip_zone *zone;
|
||||
struct md_rdev *tmp_dev;
|
||||
@ -559,12 +559,12 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
|
||||
md_flush_request(mddev, bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
|
||||
raid0_handle_discard(mddev, bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
bio_sector = bio->bi_iter.bi_sector;
|
||||
@ -599,6 +599,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
|
||||
mddev_check_writesame(mddev, bio);
|
||||
mddev_check_write_zeroes(mddev, bio);
|
||||
generic_make_request(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
|
||||
|
@ -1321,7 +1321,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
||||
* Continue immediately if no resync is active currently.
|
||||
*/
|
||||
|
||||
md_write_start(mddev, bio); /* wait on superblock update early */
|
||||
|
||||
if ((bio_end_sector(bio) > mddev->suspend_lo &&
|
||||
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
|
||||
@ -1550,13 +1549,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
||||
wake_up(&conf->wait_barrier);
|
||||
}
|
||||
|
||||
static void raid1_make_request(struct mddev *mddev, struct bio *bio)
|
||||
static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
sector_t sectors;
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
|
||||
md_flush_request(mddev, bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1571,8 +1570,12 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
|
||||
|
||||
if (bio_data_dir(bio) == READ)
|
||||
raid1_read_request(mddev, bio, sectors, NULL);
|
||||
else
|
||||
else {
|
||||
if (!md_write_start(mddev,bio))
|
||||
return false;
|
||||
raid1_write_request(mddev, bio, sectors);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
|
||||
|
@ -1303,8 +1303,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
||||
sector_t sectors;
|
||||
int max_sectors;
|
||||
|
||||
md_write_start(mddev, bio);
|
||||
|
||||
/*
|
||||
* Register the new request and wait if the reconstruction
|
||||
* thread has put up a bar for new requests.
|
||||
@ -1525,7 +1523,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
|
||||
raid10_write_request(mddev, bio, r10_bio);
|
||||
}
|
||||
|
||||
static void raid10_make_request(struct mddev *mddev, struct bio *bio)
|
||||
static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct r10conf *conf = mddev->private;
|
||||
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
|
||||
@ -1534,9 +1532,12 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
|
||||
md_flush_request(mddev, bio);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!md_write_start(mddev, bio))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* If this request crosses a chunk boundary, we need to split
|
||||
* it.
|
||||
@ -1553,6 +1554,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
|
||||
|
||||
/* In case raid10d snuck in to freeze_array */
|
||||
wake_up(&conf->wait_barrier);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void raid10_status(struct seq_file *seq, struct mddev *mddev)
|
||||
|
@ -5479,7 +5479,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
|
||||
|
||||
bi->bi_next = NULL;
|
||||
md_write_start(mddev, bi);
|
||||
|
||||
stripe_sectors = conf->chunk_sectors *
|
||||
(conf->raid_disks - conf->max_degraded);
|
||||
@ -5549,11 +5548,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
release_stripe_plug(mddev, sh);
|
||||
}
|
||||
|
||||
md_write_end(mddev);
|
||||
bio_endio(bi);
|
||||
}
|
||||
|
||||
static void raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
{
|
||||
struct r5conf *conf = mddev->private;
|
||||
int dd_idx;
|
||||
@ -5569,10 +5567,10 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
int ret = r5l_handle_flush_request(conf->log, bi);
|
||||
|
||||
if (ret == 0)
|
||||
return;
|
||||
return true;
|
||||
if (ret == -ENODEV) {
|
||||
md_flush_request(mddev, bi);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
/* ret == -EAGAIN, fallback */
|
||||
/*
|
||||
@ -5582,6 +5580,8 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
do_flush = bi->bi_opf & REQ_PREFLUSH;
|
||||
}
|
||||
|
||||
if (!md_write_start(mddev, bi))
|
||||
return false;
|
||||
/*
|
||||
* If array is degraded, better not do chunk aligned read because
|
||||
* later we might have to read it again in order to reconstruct
|
||||
@ -5591,18 +5591,18 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
mddev->reshape_position == MaxSector) {
|
||||
bi = chunk_aligned_read(mddev, bi);
|
||||
if (!bi)
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
|
||||
make_discard_request(mddev, bi);
|
||||
return;
|
||||
md_write_end(mddev);
|
||||
return true;
|
||||
}
|
||||
|
||||
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
|
||||
last_sector = bio_end_sector(bi);
|
||||
bi->bi_next = NULL;
|
||||
md_write_start(mddev, bi);
|
||||
|
||||
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
|
||||
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
|
||||
@ -5740,6 +5740,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
if (rw == WRITE)
|
||||
md_write_end(mddev);
|
||||
bio_endio(bi);
|
||||
return true;
|
||||
}
|
||||
|
||||
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
|
||||
|
Loading…
Reference in New Issue
Block a user