md: resolve external metadata handling deadlock in md_allow_write
md_allow_write() marks the metadata dirty while holding mddev->lock and then waits for the write to complete. For externally managed metadata this causes a deadlock, as userspace needs to take the lock to communicate that the metadata update has completed. Change md_allow_write() in the 'external' case to start the 'mark active' operation and then return -EAGAIN. The expected side effects while waiting for userspace to write 'active' to 'array_state' are: holding off reshape (code currently handles -ENOMEM), causing some 'stripe_cache_size' change requests to fail, causing some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and causing updates to 'raid_disks' to fail. Except for the 'stripe_cache_size' changes, these failures can be mitigated by coordinating with mdmon. md_write_start() still prevents writes from occurring until the metadata handler has had a chance to take action, as it unconditionally waits for MD_CHANGE_CLEAN to be cleared. [neilb@suse.de: return -EAGAIN, try GFP_NOIO] Signed-off-by: Dan Williams <dan.j.williams@intel.com>
This commit is contained in:
parent
1fe797e67f
commit
b5470dc5fc
@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
|
|||||||
char *ptr, *buf = NULL;
|
char *ptr, *buf = NULL;
|
||||||
int err = -ENOMEM;
|
int err = -ENOMEM;
|
||||||
|
|
||||||
md_allow_write(mddev);
|
if (md_allow_write(mddev))
|
||||||
|
file = kmalloc(sizeof(*file), GFP_NOIO);
|
||||||
|
else
|
||||||
file = kmalloc(sizeof(*file), GFP_KERNEL);
|
file = kmalloc(sizeof(*file), GFP_KERNEL);
|
||||||
|
|
||||||
if (!file)
|
if (!file)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
|
|||||||
* may proceed without blocking. It is important to call this before
|
* may proceed without blocking. It is important to call this before
|
||||||
* attempting a GFP_KERNEL allocation while holding the mddev lock.
|
* attempting a GFP_KERNEL allocation while holding the mddev lock.
|
||||||
* Must be called with mddev_lock held.
|
* Must be called with mddev_lock held.
|
||||||
|
*
|
||||||
|
* In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
|
||||||
|
* is dropped, so return -EAGAIN after notifying userspace.
|
||||||
*/
|
*/
|
||||||
void md_allow_write(mddev_t *mddev)
|
int md_allow_write(mddev_t *mddev)
|
||||||
{
|
{
|
||||||
if (!mddev->pers)
|
if (!mddev->pers)
|
||||||
return;
|
return 0;
|
||||||
if (mddev->ro)
|
if (mddev->ro)
|
||||||
return;
|
return 0;
|
||||||
if (!mddev->pers->sync_request)
|
if (!mddev->pers->sync_request)
|
||||||
return;
|
return 0;
|
||||||
|
|
||||||
spin_lock_irq(&mddev->write_lock);
|
spin_lock_irq(&mddev->write_lock);
|
||||||
if (mddev->in_sync) {
|
if (mddev->in_sync) {
|
||||||
@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
|
|||||||
mddev->safemode = 1;
|
mddev->safemode = 1;
|
||||||
spin_unlock_irq(&mddev->write_lock);
|
spin_unlock_irq(&mddev->write_lock);
|
||||||
md_update_sb(mddev, 0);
|
md_update_sb(mddev, 0);
|
||||||
|
|
||||||
sysfs_notify(&mddev->kobj, NULL, "array_state");
|
sysfs_notify(&mddev->kobj, NULL, "array_state");
|
||||||
/* wait for the dirty state to be recorded in the metadata */
|
|
||||||
wait_event(mddev->sb_wait,
|
|
||||||
!test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
|
|
||||||
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
|
|
||||||
} else
|
} else
|
||||||
spin_unlock_irq(&mddev->write_lock);
|
spin_unlock_irq(&mddev->write_lock);
|
||||||
|
|
||||||
|
if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
|
||||||
|
return -EAGAIN;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(md_allow_write);
|
EXPORT_SYMBOL_GPL(md_allow_write);
|
||||||
|
|
||||||
|
@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
|
|||||||
conf_t *conf = mddev_to_conf(mddev);
|
conf_t *conf = mddev_to_conf(mddev);
|
||||||
int cnt, raid_disks;
|
int cnt, raid_disks;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
int d, d2;
|
int d, d2, err;
|
||||||
|
|
||||||
/* Cannot change chunk_size, layout, or level */
|
/* Cannot change chunk_size, layout, or level */
|
||||||
if (mddev->chunk_size != mddev->new_chunk ||
|
if (mddev->chunk_size != mddev->new_chunk ||
|
||||||
@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
md_allow_write(mddev);
|
err = md_allow_write(mddev);
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
|
|
||||||
raid_disks = mddev->raid_disks + mddev->delta_disks;
|
raid_disks = mddev->raid_disks + mddev->delta_disks;
|
||||||
|
|
||||||
|
@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
|
|||||||
struct stripe_head *osh, *nsh;
|
struct stripe_head *osh, *nsh;
|
||||||
LIST_HEAD(newstripes);
|
LIST_HEAD(newstripes);
|
||||||
struct disk_info *ndisks;
|
struct disk_info *ndisks;
|
||||||
int err = 0;
|
int err;
|
||||||
struct kmem_cache *sc;
|
struct kmem_cache *sc;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (newsize <= conf->pool_size)
|
if (newsize <= conf->pool_size)
|
||||||
return 0; /* never bother to shrink */
|
return 0; /* never bother to shrink */
|
||||||
|
|
||||||
md_allow_write(conf->mddev);
|
err = md_allow_write(conf->mddev);
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
|
|
||||||
/* Step 1 */
|
/* Step 1 */
|
||||||
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
|
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
|
||||||
@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
|
|||||||
{
|
{
|
||||||
raid5_conf_t *conf = mddev_to_conf(mddev);
|
raid5_conf_t *conf = mddev_to_conf(mddev);
|
||||||
unsigned long new;
|
unsigned long new;
|
||||||
|
int err;
|
||||||
|
|
||||||
if (len >= PAGE_SIZE)
|
if (len >= PAGE_SIZE)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
if (!conf)
|
if (!conf)
|
||||||
@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
|
|||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
md_allow_write(mddev);
|
err = md_allow_write(mddev);
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
while (new > conf->max_nr_stripes) {
|
while (new > conf->max_nr_stripes) {
|
||||||
if (grow_one_stripe(conf))
|
if (grow_one_stripe(conf))
|
||||||
conf->max_nr_stripes++;
|
conf->max_nr_stripes++;
|
||||||
|
@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
|
|||||||
struct page *page, int rw);
|
struct page *page, int rw);
|
||||||
extern void md_do_sync(mddev_t *mddev);
|
extern void md_do_sync(mddev_t *mddev);
|
||||||
extern void md_new_event(mddev_t *mddev);
|
extern void md_new_event(mddev_t *mddev);
|
||||||
extern void md_allow_write(mddev_t *mddev);
|
extern int md_allow_write(mddev_t *mddev);
|
||||||
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
|
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
|
||||||
|
|
||||||
#endif /* CONFIG_MD */
|
#endif /* CONFIG_MD */
|
||||||
|
Loading…
Reference in New Issue
Block a user