md update for 3.13.
Mostly optimisations and obscure bug fixes. - raid5 gets less lock contention - raid1 gets less contention between normal-io and resync-io during resync. -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.19 (GNU/Linux) iQIVAwUAUovzDznsnt1WYoG5AQJ1pQ//bDuXadoJ5dwjWjVxFOKoQ9j/9joEI0yH XTApD3ADKckdBc4TSLOIbCNLW1Pbe23HlOI/GjCiJ/7mePL3OwHd7Fx8Rfq3BubV f7NgjVwu8nwYD0OXEZsshImptEtrbYwQdy+qlKcHXcZz1MUfR+Egih3r/ouTEfEt FNq/6MpyN0IKSY82xP/jFZgesBucgKz/YOUIbwClxm7UiyISKvWQLBIAfLB3dyI3 HoEdEzQX6I56Rw0mkSUG4Mk+8xx/8twxL+yqEUqfdJREWuB56Km8kl8y/e465Nk0 ZZg6j/TrslVEwbEeVMx0syvYcaAWFZ4X2jdKfo1lI0g9beZp7H1GRF8yR1s2t/h4 g/vb55MEN++4LPaE9ut4z7SG2yLyGkZgFTzTjyq5of+DFL0cayO7wXxbgpcD7JYf Doef/OSa6csKiGiJI48iQa08Bolmz9ZWzZQXhAthKfFQ9Rv+GEtIAi4kLR8EZPbu 0/FL1ylYNUY9O7p0g+iy9Kcoc+xW36I95pPZf8pO8GFcXTjyuCCBVh/SNvFZZHPl 3xk3aZJknAEID8VrVG2IJPkeDI8WK8YxmpU/nARCoytn07Df6Ye8jGvLdR8pL3lB TIZV6eRY4yciB8LtoK9Kg4XTmOMhBtjt4c3znkljp98vhOQQb/oHN+BXMGcwqvr9 fk0KGrg31VA= =8RCg -----END PGP SIGNATURE----- Merge tag 'md/3.13' of git://neil.brown.name/md Pull md update from Neil Brown: "Mostly optimisations and obscure bug fixes. - raid5 gets less lock contention - raid1 gets less contention between normal-io and resync-io during resync" * tag 'md/3.13' of git://neil.brown.name/md: md/raid5: Use conf->device_lock protect changing of multi-thread resources. md/raid5: Before freeing old multi-thread worker, it should flush them. md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE. UAPI: include <asm/byteorder.h> in linux/raid/md_p.h raid1: Rewrite the implementation of iobarrier. raid1: Add some macros to make code clearly. raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array. raid1: Add a field array_frozen to indicate whether raid in freeze state. md: Convert use of typedef ctl_table to struct ctl_table md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes. 
md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread. md: fix some places where mddev_lock return value is not checked. raid5: Retry R5_ReadNoMerge flag when hit a read error. raid5: relieve lock contention in get_active_stripe() raid5: relieve lock contention in get_active_stripe() wait: add wait_event_cmd() md/raid5.c: add proper locking to error path of raid5_start_reshape. md: fix calculation of stacking limits on level change. raid5: Use slow_path to release stripe when mddev->thread is null
This commit is contained in:
commit
6d6e352c80
133
drivers/md/md.c
133
drivers/md/md.c
@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
|
||||
|
||||
static struct ctl_table_header *raid_table_header;
|
||||
|
||||
static ctl_table raid_table[] = {
|
||||
static struct ctl_table raid_table[] = {
|
||||
{
|
||||
.procname = "speed_limit_min",
|
||||
.data = &sysctl_speed_limit_min,
|
||||
@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
|
||||
{ }
|
||||
};
|
||||
|
||||
static ctl_table raid_dir_table[] = {
|
||||
static struct ctl_table raid_dir_table[] = {
|
||||
{
|
||||
.procname = "raid",
|
||||
.maxlen = 0,
|
||||
@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
|
||||
{ }
|
||||
};
|
||||
|
||||
static ctl_table raid_root_table[] = {
|
||||
static struct ctl_table raid_root_table[] = {
|
||||
{
|
||||
.procname = "dev",
|
||||
.maxlen = 0,
|
||||
@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
|
||||
goto retry;
|
||||
}
|
||||
|
||||
static inline int mddev_lock(struct mddev * mddev)
|
||||
static inline int __must_check mddev_lock(struct mddev * mddev)
|
||||
{
|
||||
return mutex_lock_interruptible(&mddev->reconfig_mutex);
|
||||
}
|
||||
|
||||
/* Sometimes we need to take the lock in a situation where
|
||||
* failure due to interrupts is not acceptable.
|
||||
*/
|
||||
static inline void mddev_lock_nointr(struct mddev * mddev)
|
||||
{
|
||||
mutex_lock(&mddev->reconfig_mutex);
|
||||
}
|
||||
|
||||
static inline int mddev_is_locked(struct mddev *mddev)
|
||||
{
|
||||
return mutex_is_locked(&mddev->reconfig_mutex);
|
||||
@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
for_each_mddev(mddev, tmp) {
|
||||
struct md_rdev *rdev2;
|
||||
|
||||
mddev_lock(mddev);
|
||||
mddev_lock_nointr(mddev);
|
||||
rdev_for_each(rdev2, mddev)
|
||||
if (rdev->bdev == rdev2->bdev &&
|
||||
rdev != rdev2 &&
|
||||
@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
break;
|
||||
}
|
||||
}
|
||||
mddev_lock(my_mddev);
|
||||
mddev_lock_nointr(my_mddev);
|
||||
if (overlap) {
|
||||
/* Someone else could have slipped in a size
|
||||
* change here, but doing so is just silly.
|
||||
@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
mddev->in_sync = 1;
|
||||
del_timer_sync(&mddev->safemode_timer);
|
||||
}
|
||||
blk_set_stacking_limits(&mddev->queue->limits);
|
||||
pers->run(mddev);
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
mddev_resume(mddev);
|
||||
@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev)
|
||||
|
||||
void md_stop_writes(struct mddev *mddev)
|
||||
{
|
||||
mddev_lock(mddev);
|
||||
mddev_lock_nointr(mddev);
|
||||
__md_stop_writes(mddev);
|
||||
mddev_unlock(mddev);
|
||||
}
|
||||
@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop);
|
||||
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
|
||||
{
|
||||
int err = 0;
|
||||
int did_freeze = 0;
|
||||
|
||||
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
|
||||
did_freeze = 1;
|
||||
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
if (mddev->sync_thread) {
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
/* Thread might be blocked waiting for metadata update
|
||||
* which will now never happen */
|
||||
wake_up_process(mddev->sync_thread->tsk);
|
||||
}
|
||||
mddev_unlock(mddev);
|
||||
wait_event(resync_wait, mddev->sync_thread == NULL);
|
||||
mddev_lock_nointr(mddev);
|
||||
|
||||
mutex_lock(&mddev->open_mutex);
|
||||
if (atomic_read(&mddev->openers) > !!bdev) {
|
||||
if (atomic_read(&mddev->openers) > !!bdev ||
|
||||
mddev->sync_thread ||
|
||||
(bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
|
||||
printk("md: %s still in use.\n",mdname(mddev));
|
||||
if (did_freeze) {
|
||||
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
err = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
|
||||
/* Someone opened the device since we flushed it
|
||||
* so page cache could be dirty and it is too late
|
||||
* to flush. So abort
|
||||
*/
|
||||
mutex_unlock(&mddev->open_mutex);
|
||||
return -EBUSY;
|
||||
}
|
||||
if (mddev->pers) {
|
||||
__md_stop_writes(mddev);
|
||||
|
||||
@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
|
||||
set_disk_ro(mddev->gendisk, 1);
|
||||
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
err = 0;
|
||||
err = 0;
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&mddev->open_mutex);
|
||||
@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
|
||||
{
|
||||
struct gendisk *disk = mddev->gendisk;
|
||||
struct md_rdev *rdev;
|
||||
int did_freeze = 0;
|
||||
|
||||
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
|
||||
did_freeze = 1;
|
||||
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
if (mddev->sync_thread) {
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
/* Thread might be blocked waiting for metadata update
|
||||
* which will now never happen */
|
||||
wake_up_process(mddev->sync_thread->tsk);
|
||||
}
|
||||
mddev_unlock(mddev);
|
||||
wait_event(resync_wait, mddev->sync_thread == NULL);
|
||||
mddev_lock_nointr(mddev);
|
||||
|
||||
mutex_lock(&mddev->open_mutex);
|
||||
if (atomic_read(&mddev->openers) > !!bdev ||
|
||||
mddev->sysfs_active) {
|
||||
mddev->sysfs_active ||
|
||||
mddev->sync_thread ||
|
||||
(bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
|
||||
printk("md: %s still in use.\n",mdname(mddev));
|
||||
mutex_unlock(&mddev->open_mutex);
|
||||
return -EBUSY;
|
||||
}
|
||||
if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
|
||||
/* Someone opened the device since we flushed it
|
||||
* so page cache could be dirty and it is too late
|
||||
* to flush. So abort
|
||||
*/
|
||||
mutex_unlock(&mddev->open_mutex);
|
||||
if (did_freeze) {
|
||||
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
return -EBUSY;
|
||||
}
|
||||
if (mddev->pers) {
|
||||
@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
wait_event(mddev->sb_wait,
|
||||
!test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
|
||||
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
|
||||
mddev_lock(mddev);
|
||||
mddev_lock_nointr(mddev);
|
||||
}
|
||||
} else {
|
||||
err = -EROFS;
|
||||
@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread)
|
||||
mddev->curr_resync = 2;
|
||||
|
||||
try_again:
|
||||
if (kthread_should_stop())
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
goto skip;
|
||||
for_each_mddev(mddev2, tmp) {
|
||||
@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
* be caught by 'softlockup'
|
||||
*/
|
||||
prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
|
||||
if (!kthread_should_stop() &&
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
|
||||
mddev2->curr_resync >= mddev->curr_resync) {
|
||||
printk(KERN_INFO "md: delaying %s of %s"
|
||||
" until %s has finished (they"
|
||||
@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
last_check = 0;
|
||||
|
||||
if (j>2) {
|
||||
printk(KERN_INFO
|
||||
printk(KERN_INFO
|
||||
"md: resuming %s of %s from checkpoint.\n",
|
||||
desc, mdname(mddev));
|
||||
mddev->curr_resync = j;
|
||||
@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread)
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
}
|
||||
|
||||
while (j >= mddev->resync_max && !kthread_should_stop()) {
|
||||
while (j >= mddev->resync_max &&
|
||||
!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
|
||||
/* As this condition is controlled by user-space,
|
||||
* we can block indefinitely, so use '_interruptible'
|
||||
* to avoid triggering warnings.
|
||||
@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread)
|
||||
flush_signals(current); /* just in case */
|
||||
wait_event_interruptible(mddev->recovery_wait,
|
||||
mddev->resync_max > j
|
||||
|| kthread_should_stop());
|
||||
|| test_bit(MD_RECOVERY_INTR,
|
||||
&mddev->recovery));
|
||||
}
|
||||
|
||||
if (kthread_should_stop())
|
||||
goto interrupted;
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
break;
|
||||
|
||||
sectors = mddev->pers->sync_request(mddev, j, &skipped,
|
||||
currspeed < speed_min(mddev));
|
||||
if (sectors == 0) {
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!skipped) { /* actual IO requested */
|
||||
@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread)
|
||||
last_mark = next;
|
||||
}
|
||||
|
||||
|
||||
if (kthread_should_stop())
|
||||
goto interrupted;
|
||||
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
break;
|
||||
|
||||
/*
|
||||
* this loop exits only when either we are slower than
|
||||
@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread)
|
||||
}
|
||||
}
|
||||
}
|
||||
printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
|
||||
printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
|
||||
test_bit(MD_RECOVERY_INTR, &mddev->recovery)
|
||||
? "interrupted" : "done");
|
||||
/*
|
||||
* this also signals 'finished resyncing' to md_stop
|
||||
*/
|
||||
out:
|
||||
blk_finish_plug(&plug);
|
||||
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
|
||||
|
||||
@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread)
|
||||
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
return;
|
||||
|
||||
interrupted:
|
||||
/*
|
||||
* got a signal, exit.
|
||||
*/
|
||||
printk(KERN_INFO
|
||||
"md: md_do_sync() got signal ... exiting\n");
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
goto out;
|
||||
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_do_sync);
|
||||
|
||||
@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev)
|
||||
|
||||
/* resync has finished, collect result */
|
||||
md_unregister_thread(&mddev->sync_thread);
|
||||
wake_up(&resync_wait);
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
|
||||
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
|
||||
/* success...*/
|
||||
|
@ -66,7 +66,8 @@
|
||||
*/
|
||||
static int max_queued_requests = 1024;
|
||||
|
||||
static void allow_barrier(struct r1conf *conf);
|
||||
static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
|
||||
sector_t bi_sector);
|
||||
static void lower_barrier(struct r1conf *conf);
|
||||
|
||||
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
|
||||
@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
|
||||
}
|
||||
|
||||
#define RESYNC_BLOCK_SIZE (64*1024)
|
||||
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
|
||||
#define RESYNC_DEPTH 32
|
||||
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
|
||||
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
|
||||
#define RESYNC_WINDOW (2048*1024)
|
||||
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
|
||||
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
|
||||
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
|
||||
|
||||
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
|
||||
{
|
||||
@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
|
||||
struct bio *bio = r1_bio->master_bio;
|
||||
int done;
|
||||
struct r1conf *conf = r1_bio->mddev->private;
|
||||
sector_t start_next_window = r1_bio->start_next_window;
|
||||
sector_t bi_sector = bio->bi_sector;
|
||||
|
||||
if (bio->bi_phys_segments) {
|
||||
unsigned long flags;
|
||||
@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
|
||||
bio->bi_phys_segments--;
|
||||
done = (bio->bi_phys_segments == 0);
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
/*
|
||||
* make_request() might be waiting for
|
||||
* bi_phys_segments to decrease
|
||||
*/
|
||||
wake_up(&conf->wait_barrier);
|
||||
} else
|
||||
done = 1;
|
||||
|
||||
@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
|
||||
* Wake up any possible resync thread that waits for the device
|
||||
* to go idle.
|
||||
*/
|
||||
allow_barrier(conf);
|
||||
allow_barrier(conf, start_next_window, bi_sector);
|
||||
}
|
||||
}
|
||||
|
||||
@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
|
||||
* there is no normal IO happening. It must arrange to call
|
||||
* lower_barrier when the particular background IO completes.
|
||||
*/
|
||||
#define RESYNC_DEPTH 32
|
||||
|
||||
static void raise_barrier(struct r1conf *conf)
|
||||
{
|
||||
spin_lock_irq(&conf->resync_lock);
|
||||
@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
|
||||
/* block any new IO from starting */
|
||||
conf->barrier++;
|
||||
|
||||
/* Now wait for all pending IO to complete */
|
||||
/* For these conditions we must wait:
|
||||
* A: while the array is in frozen state
|
||||
* B: while barrier >= RESYNC_DEPTH, meaning resync has reached
|
||||
* the maximum allowed count.
|
||||
* C: next_resync + RESYNC_SECTORS > start_next_window, meaning
|
||||
* next resync will reach to the window which normal bios are
|
||||
* handling.
|
||||
*/
|
||||
wait_event_lock_irq(conf->wait_barrier,
|
||||
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
|
||||
!conf->array_frozen &&
|
||||
conf->barrier < RESYNC_DEPTH &&
|
||||
(conf->start_next_window >=
|
||||
conf->next_resync + RESYNC_SECTORS),
|
||||
conf->resync_lock);
|
||||
|
||||
spin_unlock_irq(&conf->resync_lock);
|
||||
@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
|
||||
wake_up(&conf->wait_barrier);
|
||||
}
|
||||
|
||||
static void wait_barrier(struct r1conf *conf)
|
||||
static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
|
||||
{
|
||||
bool wait = false;
|
||||
|
||||
if (conf->array_frozen || !bio)
|
||||
wait = true;
|
||||
else if (conf->barrier && bio_data_dir(bio) == WRITE) {
|
||||
if (conf->next_resync < RESYNC_WINDOW_SECTORS)
|
||||
wait = true;
|
||||
else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
|
||||
>= bio_end_sector(bio)) ||
|
||||
(conf->next_resync + NEXT_NORMALIO_DISTANCE
|
||||
<= bio->bi_sector))
|
||||
wait = false;
|
||||
else
|
||||
wait = true;
|
||||
}
|
||||
|
||||
return wait;
|
||||
}
|
||||
|
||||
static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
|
||||
{
|
||||
sector_t sector = 0;
|
||||
|
||||
spin_lock_irq(&conf->resync_lock);
|
||||
if (conf->barrier) {
|
||||
if (need_to_wait_for_sync(conf, bio)) {
|
||||
conf->nr_waiting++;
|
||||
/* Wait for the barrier to drop.
|
||||
* However if there are already pending
|
||||
@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
|
||||
* count down.
|
||||
*/
|
||||
wait_event_lock_irq(conf->wait_barrier,
|
||||
!conf->barrier ||
|
||||
(conf->nr_pending &&
|
||||
!conf->array_frozen &&
|
||||
(!conf->barrier ||
|
||||
((conf->start_next_window <
|
||||
conf->next_resync + RESYNC_SECTORS) &&
|
||||
current->bio_list &&
|
||||
!bio_list_empty(current->bio_list)),
|
||||
!bio_list_empty(current->bio_list))),
|
||||
conf->resync_lock);
|
||||
conf->nr_waiting--;
|
||||
}
|
||||
|
||||
if (bio && bio_data_dir(bio) == WRITE) {
|
||||
if (conf->next_resync + NEXT_NORMALIO_DISTANCE
|
||||
<= bio->bi_sector) {
|
||||
if (conf->start_next_window == MaxSector)
|
||||
conf->start_next_window =
|
||||
conf->next_resync +
|
||||
NEXT_NORMALIO_DISTANCE;
|
||||
|
||||
if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
|
||||
<= bio->bi_sector)
|
||||
conf->next_window_requests++;
|
||||
else
|
||||
conf->current_window_requests++;
|
||||
}
|
||||
if (bio->bi_sector >= conf->start_next_window)
|
||||
sector = conf->start_next_window;
|
||||
}
|
||||
|
||||
conf->nr_pending++;
|
||||
spin_unlock_irq(&conf->resync_lock);
|
||||
return sector;
|
||||
}
|
||||
|
||||
static void allow_barrier(struct r1conf *conf)
|
||||
static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
|
||||
sector_t bi_sector)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&conf->resync_lock, flags);
|
||||
conf->nr_pending--;
|
||||
if (start_next_window) {
|
||||
if (start_next_window == conf->start_next_window) {
|
||||
if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
|
||||
<= bi_sector)
|
||||
conf->next_window_requests--;
|
||||
else
|
||||
conf->current_window_requests--;
|
||||
} else
|
||||
conf->current_window_requests--;
|
||||
|
||||
if (!conf->current_window_requests) {
|
||||
if (conf->next_window_requests) {
|
||||
conf->current_window_requests =
|
||||
conf->next_window_requests;
|
||||
conf->next_window_requests = 0;
|
||||
conf->start_next_window +=
|
||||
NEXT_NORMALIO_DISTANCE;
|
||||
} else
|
||||
conf->start_next_window = MaxSector;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&conf->resync_lock, flags);
|
||||
wake_up(&conf->wait_barrier);
|
||||
}
|
||||
@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
|
||||
{
|
||||
/* stop syncio and normal IO and wait for everything to
|
||||
* go quiet.
|
||||
* We increment barrier and nr_waiting, and then
|
||||
* wait until nr_pending match nr_queued+extra
|
||||
* We wait until nr_pending match nr_queued+extra
|
||||
* This is called in the context of one normal IO request
|
||||
* that has failed. Thus any sync request that might be pending
|
||||
* will be blocked by nr_pending, and we need to wait for
|
||||
@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
|
||||
* we continue.
|
||||
*/
|
||||
spin_lock_irq(&conf->resync_lock);
|
||||
conf->barrier++;
|
||||
conf->nr_waiting++;
|
||||
conf->array_frozen = 1;
|
||||
wait_event_lock_irq_cmd(conf->wait_barrier,
|
||||
conf->nr_pending == conf->nr_queued+extra,
|
||||
conf->resync_lock,
|
||||
@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
|
||||
{
|
||||
/* reverse the effect of the freeze */
|
||||
spin_lock_irq(&conf->resync_lock);
|
||||
conf->barrier--;
|
||||
conf->nr_waiting--;
|
||||
conf->array_frozen = 0;
|
||||
wake_up(&conf->wait_barrier);
|
||||
spin_unlock_irq(&conf->resync_lock);
|
||||
}
|
||||
@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
|
||||
int first_clone;
|
||||
int sectors_handled;
|
||||
int max_sectors;
|
||||
sector_t start_next_window;
|
||||
|
||||
/*
|
||||
* Register the new request and wait if the reconstruction
|
||||
@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
|
||||
finish_wait(&conf->wait_barrier, &w);
|
||||
}
|
||||
|
||||
wait_barrier(conf);
|
||||
start_next_window = wait_barrier(conf, bio);
|
||||
|
||||
bitmap = mddev->bitmap;
|
||||
|
||||
@ -1163,6 +1247,7 @@ read_again:
|
||||
|
||||
disks = conf->raid_disks * 2;
|
||||
retry_write:
|
||||
r1_bio->start_next_window = start_next_window;
|
||||
blocked_rdev = NULL;
|
||||
rcu_read_lock();
|
||||
max_sectors = r1_bio->sectors;
|
||||
@ -1231,14 +1316,24 @@ read_again:
|
||||
if (unlikely(blocked_rdev)) {
|
||||
/* Wait for this device to become unblocked */
|
||||
int j;
|
||||
sector_t old = start_next_window;
|
||||
|
||||
for (j = 0; j < i; j++)
|
||||
if (r1_bio->bios[j])
|
||||
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
|
||||
r1_bio->state = 0;
|
||||
allow_barrier(conf);
|
||||
allow_barrier(conf, start_next_window, bio->bi_sector);
|
||||
md_wait_for_blocked_rdev(blocked_rdev, mddev);
|
||||
wait_barrier(conf);
|
||||
start_next_window = wait_barrier(conf, bio);
|
||||
/*
|
||||
* We must make sure the multi r1bios of bio have
|
||||
* the same value of bi_phys_segments
|
||||
*/
|
||||
if (bio->bi_phys_segments && old &&
|
||||
old != start_next_window)
|
||||
/* Wait for the former r1bio(s) to complete */
|
||||
wait_event(conf->wait_barrier,
|
||||
bio->bi_phys_segments == 1);
|
||||
goto retry_write;
|
||||
}
|
||||
|
||||
@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
|
||||
|
||||
static void close_sync(struct r1conf *conf)
|
||||
{
|
||||
wait_barrier(conf);
|
||||
allow_barrier(conf);
|
||||
wait_barrier(conf, NULL);
|
||||
allow_barrier(conf, 0, 0);
|
||||
|
||||
mempool_destroy(conf->r1buf_pool);
|
||||
conf->r1buf_pool = NULL;
|
||||
|
||||
conf->next_resync = 0;
|
||||
conf->start_next_window = MaxSector;
|
||||
}
|
||||
|
||||
static int raid1_spare_active(struct mddev *mddev)
|
||||
@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
|
||||
conf->pending_count = 0;
|
||||
conf->recovery_disabled = mddev->recovery_disabled - 1;
|
||||
|
||||
conf->start_next_window = MaxSector;
|
||||
conf->current_window_requests = conf->next_window_requests = 0;
|
||||
|
||||
err = -EIO;
|
||||
for (i = 0; i < conf->raid_disks * 2; i++) {
|
||||
|
||||
@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
|
||||
atomic_read(&bitmap->behind_writes) == 0);
|
||||
}
|
||||
|
||||
raise_barrier(conf);
|
||||
lower_barrier(conf);
|
||||
freeze_array(conf, 0);
|
||||
unfreeze_array(conf);
|
||||
|
||||
md_unregister_thread(&mddev->thread);
|
||||
if (conf->r1bio_pool)
|
||||
@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
|
||||
wake_up(&conf->wait_barrier);
|
||||
break;
|
||||
case 1:
|
||||
raise_barrier(conf);
|
||||
freeze_array(conf, 0);
|
||||
break;
|
||||
case 0:
|
||||
lower_barrier(conf);
|
||||
unfreeze_array(conf);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
|
||||
mddev->new_chunk_sectors = 0;
|
||||
conf = setup_conf(mddev);
|
||||
if (!IS_ERR(conf))
|
||||
conf->barrier = 1;
|
||||
/* Array must appear to be quiesced */
|
||||
conf->array_frozen = 1;
|
||||
return conf;
|
||||
}
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
@ -41,6 +41,19 @@ struct r1conf {
|
||||
*/
|
||||
sector_t next_resync;
|
||||
|
||||
/* When raid1 starts resync, we divide array into four partitions
|
||||
* |---------|--------------|---------------------|-------------|
|
||||
* next_resync start_next_window end_window
|
||||
* start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
|
||||
* end_window = start_next_window + NEXT_NORMALIO_DISTANCE
|
||||
* current_window_requests means the count of normalIO between
|
||||
* start_next_window and end_window.
|
||||
* next_window_requests means the count of normalIO after end_window.
|
||||
* */
|
||||
sector_t start_next_window;
|
||||
int current_window_requests;
|
||||
int next_window_requests;
|
||||
|
||||
spinlock_t device_lock;
|
||||
|
||||
/* list of 'struct r1bio' that need to be processed by raid1d,
|
||||
@ -65,6 +78,7 @@ struct r1conf {
|
||||
int nr_waiting;
|
||||
int nr_queued;
|
||||
int barrier;
|
||||
int array_frozen;
|
||||
|
||||
/* Set to 1 if a full sync is needed, (fresh device added).
|
||||
* Cleared when a sync completes.
|
||||
@ -111,6 +125,7 @@ struct r1bio {
|
||||
* in this BehindIO request
|
||||
*/
|
||||
sector_t sector;
|
||||
sector_t start_next_window;
|
||||
int sectors;
|
||||
unsigned long state;
|
||||
struct mddev *mddev;
|
||||
|
@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
wait_event(mddev->sb_wait, mddev->flags == 0 ||
|
||||
kthread_should_stop());
|
||||
test_bit(MD_RECOVERY_INTR, &mddev->recovery));
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
|
||||
allow_barrier(conf);
|
||||
return sectors_done;
|
||||
}
|
||||
conf->reshape_safe = mddev->reshape_position;
|
||||
allow_barrier(conf);
|
||||
}
|
||||
|
@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
|
||||
return &conf->stripe_hashtbl[hash];
|
||||
}
|
||||
|
||||
static inline int stripe_hash_locks_hash(sector_t sect)
|
||||
{
|
||||
return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
|
||||
}
|
||||
|
||||
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
|
||||
{
|
||||
spin_lock_irq(conf->hash_locks + hash);
|
||||
spin_lock(&conf->device_lock);
|
||||
}
|
||||
|
||||
static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
|
||||
{
|
||||
spin_unlock(&conf->device_lock);
|
||||
spin_unlock_irq(conf->hash_locks + hash);
|
||||
}
|
||||
|
||||
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
|
||||
{
|
||||
int i;
|
||||
local_irq_disable();
|
||||
spin_lock(conf->hash_locks);
|
||||
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
|
||||
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
|
||||
spin_lock(&conf->device_lock);
|
||||
}
|
||||
|
||||
static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
|
||||
{
|
||||
int i;
|
||||
spin_unlock(&conf->device_lock);
|
||||
for (i = NR_STRIPE_HASH_LOCKS; i; i--)
|
||||
spin_unlock(conf->hash_locks + i - 1);
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
|
||||
* order without overlap. There may be several bio's per stripe+device, and
|
||||
* a bio could span several devices.
|
||||
@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
|
||||
}
|
||||
}
|
||||
|
||||
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
|
||||
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
struct list_head *temp_inactive_list)
|
||||
{
|
||||
BUG_ON(!list_empty(&sh->lru));
|
||||
BUG_ON(atomic_read(&conf->active_stripes)==0);
|
||||
@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
|
||||
< IO_THRESHOLD)
|
||||
md_wakeup_thread(conf->mddev->thread);
|
||||
atomic_dec(&conf->active_stripes);
|
||||
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
|
||||
list_add_tail(&sh->lru, &conf->inactive_list);
|
||||
wake_up(&conf->wait_for_stripe);
|
||||
if (conf->retry_read_aligned)
|
||||
md_wakeup_thread(conf->mddev->thread);
|
||||
}
|
||||
if (!test_bit(STRIPE_EXPANDING, &sh->state))
|
||||
list_add_tail(&sh->lru, temp_inactive_list);
|
||||
}
|
||||
}
|
||||
|
||||
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
|
||||
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
struct list_head *temp_inactive_list)
|
||||
{
|
||||
if (atomic_dec_and_test(&sh->count))
|
||||
do_release_stripe(conf, sh);
|
||||
do_release_stripe(conf, sh, temp_inactive_list);
|
||||
}
|
||||
|
||||
/*
|
||||
* @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
|
||||
*
|
||||
* Be careful: Only one task can add/delete stripes from temp_inactive_list at
|
||||
* any given time. Adding stripes only takes device lock, while deleting stripes
|
||||
* only takes hash lock.
|
||||
*/
|
||||
static void release_inactive_stripe_list(struct r5conf *conf,
|
||||
struct list_head *temp_inactive_list,
|
||||
int hash)
|
||||
{
|
||||
int size;
|
||||
bool do_wakeup = false;
|
||||
unsigned long flags;
|
||||
|
||||
if (hash == NR_STRIPE_HASH_LOCKS) {
|
||||
size = NR_STRIPE_HASH_LOCKS;
|
||||
hash = NR_STRIPE_HASH_LOCKS - 1;
|
||||
} else
|
||||
size = 1;
|
||||
while (size) {
|
||||
struct list_head *list = &temp_inactive_list[size - 1];
|
||||
|
||||
/*
|
||||
* We don't hold any lock here yet, get_active_stripe() might
|
||||
* remove stripes from the list
|
||||
*/
|
||||
if (!list_empty_careful(list)) {
|
||||
spin_lock_irqsave(conf->hash_locks + hash, flags);
|
||||
if (list_empty(conf->inactive_list + hash) &&
|
||||
!list_empty(list))
|
||||
atomic_dec(&conf->empty_inactive_list_nr);
|
||||
list_splice_tail_init(list, conf->inactive_list + hash);
|
||||
do_wakeup = true;
|
||||
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
|
||||
}
|
||||
size--;
|
||||
hash--;
|
||||
}
|
||||
|
||||
if (do_wakeup) {
|
||||
wake_up(&conf->wait_for_stripe);
|
||||
if (conf->retry_read_aligned)
|
||||
md_wakeup_thread(conf->mddev->thread);
|
||||
}
|
||||
}
|
||||
|
||||
/* should hold conf->device_lock already */
|
||||
static int release_stripe_list(struct r5conf *conf)
|
||||
static int release_stripe_list(struct r5conf *conf,
|
||||
struct list_head *temp_inactive_list)
|
||||
{
|
||||
struct stripe_head *sh;
|
||||
int count = 0;
|
||||
@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
|
||||
head = llist_del_all(&conf->released_stripes);
|
||||
head = llist_reverse_order(head);
|
||||
while (head) {
|
||||
int hash;
|
||||
|
||||
sh = llist_entry(head, struct stripe_head, release_list);
|
||||
head = llist_next(head);
|
||||
/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
|
||||
@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
|
||||
* again, the count is always > 1. This is true for
|
||||
* STRIPE_ON_UNPLUG_LIST bit too.
|
||||
*/
|
||||
__release_stripe(conf, sh);
|
||||
hash = sh->hash_lock_index;
|
||||
__release_stripe(conf, sh, &temp_inactive_list[hash]);
|
||||
count++;
|
||||
}
|
||||
|
||||
@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
|
||||
{
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
unsigned long flags;
|
||||
struct list_head list;
|
||||
int hash;
|
||||
bool wakeup;
|
||||
|
||||
if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
|
||||
if (unlikely(!conf->mddev->thread) ||
|
||||
test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
|
||||
goto slow_path;
|
||||
wakeup = llist_add(&sh->release_list, &conf->released_stripes);
|
||||
if (wakeup)
|
||||
@ -336,8 +424,11 @@ slow_path:
|
||||
local_irq_save(flags);
|
||||
/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
|
||||
if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
|
||||
do_release_stripe(conf, sh);
|
||||
INIT_LIST_HEAD(&list);
|
||||
hash = sh->hash_lock_index;
|
||||
do_release_stripe(conf, sh, &list);
|
||||
spin_unlock(&conf->device_lock);
|
||||
release_inactive_stripe_list(conf, &list, hash);
|
||||
}
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
|
||||
|
||||
|
||||
/* find an idle stripe, make sure it is unhashed, and return it. */
|
||||
static struct stripe_head *get_free_stripe(struct r5conf *conf)
|
||||
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
|
||||
{
|
||||
struct stripe_head *sh = NULL;
|
||||
struct list_head *first;
|
||||
|
||||
if (list_empty(&conf->inactive_list))
|
||||
if (list_empty(conf->inactive_list + hash))
|
||||
goto out;
|
||||
first = conf->inactive_list.next;
|
||||
first = (conf->inactive_list + hash)->next;
|
||||
sh = list_entry(first, struct stripe_head, lru);
|
||||
list_del_init(first);
|
||||
remove_hash(sh);
|
||||
atomic_inc(&conf->active_stripes);
|
||||
BUG_ON(hash != sh->hash_lock_index);
|
||||
if (list_empty(conf->inactive_list + hash))
|
||||
atomic_inc(&conf->empty_inactive_list_nr);
|
||||
out:
|
||||
return sh;
|
||||
}
|
||||
@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
|
||||
static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
|
||||
{
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
int i;
|
||||
int i, seq;
|
||||
|
||||
BUG_ON(atomic_read(&sh->count) != 0);
|
||||
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
|
||||
@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
|
||||
(unsigned long long)sh->sector);
|
||||
|
||||
remove_hash(sh);
|
||||
|
||||
retry:
|
||||
seq = read_seqcount_begin(&conf->gen_lock);
|
||||
sh->generation = conf->generation - previous;
|
||||
sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
|
||||
sh->sector = sector;
|
||||
@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
|
||||
dev->flags = 0;
|
||||
raid5_build_block(sh, i, previous);
|
||||
}
|
||||
if (read_seqcount_retry(&conf->gen_lock, seq))
|
||||
goto retry;
|
||||
insert_hash(conf, sh);
|
||||
sh->cpu = smp_processor_id();
|
||||
}
|
||||
@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
|
||||
int previous, int noblock, int noquiesce)
|
||||
{
|
||||
struct stripe_head *sh;
|
||||
int hash = stripe_hash_locks_hash(sector);
|
||||
|
||||
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
|
||||
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
spin_lock_irq(conf->hash_locks + hash);
|
||||
|
||||
do {
|
||||
wait_event_lock_irq(conf->wait_for_stripe,
|
||||
conf->quiesce == 0 || noquiesce,
|
||||
conf->device_lock);
|
||||
*(conf->hash_locks + hash));
|
||||
sh = __find_stripe(conf, sector, conf->generation - previous);
|
||||
if (!sh) {
|
||||
if (!conf->inactive_blocked)
|
||||
sh = get_free_stripe(conf);
|
||||
sh = get_free_stripe(conf, hash);
|
||||
if (noblock && sh == NULL)
|
||||
break;
|
||||
if (!sh) {
|
||||
conf->inactive_blocked = 1;
|
||||
wait_event_lock_irq(conf->wait_for_stripe,
|
||||
!list_empty(&conf->inactive_list) &&
|
||||
(atomic_read(&conf->active_stripes)
|
||||
< (conf->max_nr_stripes *3/4)
|
||||
|| !conf->inactive_blocked),
|
||||
conf->device_lock);
|
||||
wait_event_lock_irq(
|
||||
conf->wait_for_stripe,
|
||||
!list_empty(conf->inactive_list + hash) &&
|
||||
(atomic_read(&conf->active_stripes)
|
||||
< (conf->max_nr_stripes * 3 / 4)
|
||||
|| !conf->inactive_blocked),
|
||||
*(conf->hash_locks + hash));
|
||||
conf->inactive_blocked = 0;
|
||||
} else
|
||||
init_stripe(sh, sector, previous);
|
||||
@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
|
||||
&& !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
|
||||
&& !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
|
||||
} else {
|
||||
spin_lock(&conf->device_lock);
|
||||
if (!test_bit(STRIPE_HANDLE, &sh->state))
|
||||
atomic_inc(&conf->active_stripes);
|
||||
if (list_empty(&sh->lru) &&
|
||||
!test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
|
||||
!test_bit(STRIPE_EXPANDING, &sh->state))
|
||||
BUG();
|
||||
list_del_init(&sh->lru);
|
||||
@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
|
||||
sh->group->stripes_cnt--;
|
||||
sh->group = NULL;
|
||||
}
|
||||
spin_unlock(&conf->device_lock);
|
||||
}
|
||||
}
|
||||
} while (sh == NULL);
|
||||
@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
|
||||
if (sh)
|
||||
atomic_inc(&sh->count);
|
||||
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
spin_unlock_irq(conf->hash_locks + hash);
|
||||
return sh;
|
||||
}
|
||||
|
||||
@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
bi->bi_sector = (sh->sector
|
||||
+ rdev->data_offset);
|
||||
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
|
||||
bi->bi_rw |= REQ_FLUSH;
|
||||
bi->bi_rw |= REQ_NOMERGE;
|
||||
|
||||
bi->bi_vcnt = 1;
|
||||
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
|
||||
@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
static int grow_one_stripe(struct r5conf *conf)
|
||||
static int grow_one_stripe(struct r5conf *conf, int hash)
|
||||
{
|
||||
struct stripe_head *sh;
|
||||
sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
|
||||
@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf)
|
||||
kmem_cache_free(conf->slab_cache, sh);
|
||||
return 0;
|
||||
}
|
||||
sh->hash_lock_index = hash;
|
||||
/* we just created an active stripe so... */
|
||||
atomic_set(&sh->count, 1);
|
||||
atomic_inc(&conf->active_stripes);
|
||||
@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num)
|
||||
{
|
||||
struct kmem_cache *sc;
|
||||
int devs = max(conf->raid_disks, conf->previous_raid_disks);
|
||||
int hash;
|
||||
|
||||
if (conf->mddev->gendisk)
|
||||
sprintf(conf->cache_name[0],
|
||||
@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num)
|
||||
return 1;
|
||||
conf->slab_cache = sc;
|
||||
conf->pool_size = devs;
|
||||
while (num--)
|
||||
if (!grow_one_stripe(conf))
|
||||
hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
|
||||
while (num--) {
|
||||
if (!grow_one_stripe(conf, hash))
|
||||
return 1;
|
||||
conf->max_nr_stripes++;
|
||||
hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
|
||||
int err;
|
||||
struct kmem_cache *sc;
|
||||
int i;
|
||||
int hash, cnt;
|
||||
|
||||
if (newsize <= conf->pool_size)
|
||||
return 0; /* never bother to shrink */
|
||||
@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
|
||||
* OK, we have enough stripes, start collecting inactive
|
||||
* stripes and copying them over
|
||||
*/
|
||||
hash = 0;
|
||||
cnt = 0;
|
||||
list_for_each_entry(nsh, &newstripes, lru) {
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
wait_event_lock_irq(conf->wait_for_stripe,
|
||||
!list_empty(&conf->inactive_list),
|
||||
conf->device_lock);
|
||||
osh = get_free_stripe(conf);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
lock_device_hash_lock(conf, hash);
|
||||
wait_event_cmd(conf->wait_for_stripe,
|
||||
!list_empty(conf->inactive_list + hash),
|
||||
unlock_device_hash_lock(conf, hash),
|
||||
lock_device_hash_lock(conf, hash));
|
||||
osh = get_free_stripe(conf, hash);
|
||||
unlock_device_hash_lock(conf, hash);
|
||||
atomic_set(&nsh->count, 1);
|
||||
for(i=0; i<conf->pool_size; i++)
|
||||
nsh->dev[i].page = osh->dev[i].page;
|
||||
for( ; i<newsize; i++)
|
||||
nsh->dev[i].page = NULL;
|
||||
nsh->hash_lock_index = hash;
|
||||
kmem_cache_free(conf->slab_cache, osh);
|
||||
cnt++;
|
||||
if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
|
||||
!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
|
||||
hash++;
|
||||
cnt = 0;
|
||||
}
|
||||
}
|
||||
kmem_cache_destroy(conf->slab_cache);
|
||||
|
||||
@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
|
||||
return err;
|
||||
}
|
||||
|
||||
static int drop_one_stripe(struct r5conf *conf)
|
||||
static int drop_one_stripe(struct r5conf *conf, int hash)
|
||||
{
|
||||
struct stripe_head *sh;
|
||||
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
sh = get_free_stripe(conf);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
spin_lock_irq(conf->hash_locks + hash);
|
||||
sh = get_free_stripe(conf, hash);
|
||||
spin_unlock_irq(conf->hash_locks + hash);
|
||||
if (!sh)
|
||||
return 0;
|
||||
BUG_ON(atomic_read(&sh->count));
|
||||
@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf)
|
||||
|
||||
static void shrink_stripes(struct r5conf *conf)
|
||||
{
|
||||
while (drop_one_stripe(conf))
|
||||
;
|
||||
int hash;
|
||||
for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
|
||||
while (drop_one_stripe(conf, hash))
|
||||
;
|
||||
|
||||
if (conf->slab_cache)
|
||||
kmem_cache_destroy(conf->slab_cache);
|
||||
@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
|
||||
mdname(conf->mddev), bdn);
|
||||
else
|
||||
retry = 1;
|
||||
if (set_bad && test_bit(In_sync, &rdev->flags)
|
||||
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
|
||||
retry = 1;
|
||||
if (retry)
|
||||
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
|
||||
set_bit(R5_ReadError, &sh->dev[i].flags);
|
||||
@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
|
||||
}
|
||||
}
|
||||
|
||||
static void activate_bit_delay(struct r5conf *conf)
|
||||
static void activate_bit_delay(struct r5conf *conf,
|
||||
struct list_head *temp_inactive_list)
|
||||
{
|
||||
/* device_lock is held */
|
||||
struct list_head head;
|
||||
@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf)
|
||||
list_del_init(&conf->bitmap_list);
|
||||
while (!list_empty(&head)) {
|
||||
struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
|
||||
int hash;
|
||||
list_del_init(&sh->lru);
|
||||
atomic_inc(&sh->count);
|
||||
__release_stripe(conf, sh);
|
||||
hash = sh->hash_lock_index;
|
||||
__release_stripe(conf, sh, &temp_inactive_list[hash]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
|
||||
return 1;
|
||||
if (conf->quiesce)
|
||||
return 1;
|
||||
if (list_empty_careful(&conf->inactive_list))
|
||||
if (atomic_read(&conf->empty_inactive_list_nr))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
|
||||
struct raid5_plug_cb {
|
||||
struct blk_plug_cb cb;
|
||||
struct list_head list;
|
||||
struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
|
||||
};
|
||||
|
||||
static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
|
||||
@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
|
||||
struct mddev *mddev = cb->cb.data;
|
||||
struct r5conf *conf = mddev->private;
|
||||
int cnt = 0;
|
||||
int hash;
|
||||
|
||||
if (cb->list.next && !list_empty(&cb->list)) {
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
|
||||
* STRIPE_ON_RELEASE_LIST could be set here. In that
|
||||
* case, the count is always > 1 here
|
||||
*/
|
||||
__release_stripe(conf, sh);
|
||||
hash = sh->hash_lock_index;
|
||||
__release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
|
||||
cnt++;
|
||||
}
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
}
|
||||
release_inactive_stripe_list(conf, cb->temp_inactive_list,
|
||||
NR_STRIPE_HASH_LOCKS);
|
||||
if (mddev->queue)
|
||||
trace_block_unplug(mddev->queue, cnt, !from_schedule);
|
||||
kfree(cb);
|
||||
@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev,
|
||||
|
||||
cb = container_of(blk_cb, struct raid5_plug_cb, cb);
|
||||
|
||||
if (cb->list.next == NULL)
|
||||
if (cb->list.next == NULL) {
|
||||
int i;
|
||||
INIT_LIST_HEAD(&cb->list);
|
||||
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
|
||||
INIT_LIST_HEAD(cb->temp_inactive_list + i);
|
||||
}
|
||||
|
||||
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
|
||||
list_add_tail(&sh->lru, &cb->list);
|
||||
@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
|
||||
/* Cannot proceed until we've updated the superblock... */
|
||||
wait_event(conf->wait_for_overlap,
|
||||
atomic_read(&conf->reshape_stripes)==0);
|
||||
atomic_read(&conf->reshape_stripes)==0
|
||||
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
|
||||
if (atomic_read(&conf->reshape_stripes) != 0)
|
||||
return 0;
|
||||
mddev->reshape_position = conf->reshape_progress;
|
||||
mddev->curr_resync_completed = sector_nr;
|
||||
conf->reshape_checkpoint = jiffies;
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
wait_event(mddev->sb_wait, mddev->flags == 0 ||
|
||||
kthread_should_stop());
|
||||
test_bit(MD_RECOVERY_INTR, &mddev->recovery));
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
return 0;
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
conf->reshape_safe = mddev->reshape_position;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
>= mddev->resync_max - mddev->curr_resync_completed) {
|
||||
/* Cannot proceed until we've updated the superblock... */
|
||||
wait_event(conf->wait_for_overlap,
|
||||
atomic_read(&conf->reshape_stripes) == 0);
|
||||
atomic_read(&conf->reshape_stripes) == 0
|
||||
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
|
||||
if (atomic_read(&conf->reshape_stripes) != 0)
|
||||
goto ret;
|
||||
mddev->reshape_position = conf->reshape_progress;
|
||||
mddev->curr_resync_completed = sector_nr;
|
||||
conf->reshape_checkpoint = jiffies;
|
||||
@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
md_wakeup_thread(mddev->thread);
|
||||
wait_event(mddev->sb_wait,
|
||||
!test_bit(MD_CHANGE_DEVS, &mddev->flags)
|
||||
|| kthread_should_stop());
|
||||
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
goto ret;
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
conf->reshape_safe = mddev->reshape_position;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
}
|
||||
ret:
|
||||
return reshape_sectors;
|
||||
}
|
||||
|
||||
@ -4954,27 +5101,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
|
||||
}
|
||||
|
||||
static int handle_active_stripes(struct r5conf *conf, int group,
|
||||
struct r5worker *worker)
|
||||
struct r5worker *worker,
|
||||
struct list_head *temp_inactive_list)
|
||||
{
|
||||
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
|
||||
int i, batch_size = 0;
|
||||
int i, batch_size = 0, hash;
|
||||
bool release_inactive = false;
|
||||
|
||||
while (batch_size < MAX_STRIPE_BATCH &&
|
||||
(sh = __get_priority_stripe(conf, group)) != NULL)
|
||||
batch[batch_size++] = sh;
|
||||
|
||||
if (batch_size == 0)
|
||||
return batch_size;
|
||||
if (batch_size == 0) {
|
||||
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
|
||||
if (!list_empty(temp_inactive_list + i))
|
||||
break;
|
||||
if (i == NR_STRIPE_HASH_LOCKS)
|
||||
return batch_size;
|
||||
release_inactive = true;
|
||||
}
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
|
||||
release_inactive_stripe_list(conf, temp_inactive_list,
|
||||
NR_STRIPE_HASH_LOCKS);
|
||||
|
||||
if (release_inactive) {
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < batch_size; i++)
|
||||
handle_stripe(batch[i]);
|
||||
|
||||
cond_resched();
|
||||
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
for (i = 0; i < batch_size; i++)
|
||||
__release_stripe(conf, batch[i]);
|
||||
for (i = 0; i < batch_size; i++) {
|
||||
hash = batch[i]->hash_lock_index;
|
||||
__release_stripe(conf, batch[i], &temp_inactive_list[hash]);
|
||||
}
|
||||
return batch_size;
|
||||
}
|
||||
|
||||
@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work)
|
||||
while (1) {
|
||||
int batch_size, released;
|
||||
|
||||
released = release_stripe_list(conf);
|
||||
released = release_stripe_list(conf, worker->temp_inactive_list);
|
||||
|
||||
batch_size = handle_active_stripes(conf, group_id, worker);
|
||||
batch_size = handle_active_stripes(conf, group_id, worker,
|
||||
worker->temp_inactive_list);
|
||||
worker->working = false;
|
||||
if (!batch_size && !released)
|
||||
break;
|
||||
@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread)
|
||||
struct bio *bio;
|
||||
int batch_size, released;
|
||||
|
||||
released = release_stripe_list(conf);
|
||||
released = release_stripe_list(conf, conf->temp_inactive_list);
|
||||
|
||||
if (
|
||||
!list_empty(&conf->bitmap_list)) {
|
||||
@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread)
|
||||
bitmap_unplug(mddev->bitmap);
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
conf->seq_write = conf->seq_flush;
|
||||
activate_bit_delay(conf);
|
||||
activate_bit_delay(conf, conf->temp_inactive_list);
|
||||
}
|
||||
raid5_activate_delayed(conf);
|
||||
|
||||
@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread)
|
||||
handled++;
|
||||
}
|
||||
|
||||
batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
|
||||
batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
|
||||
conf->temp_inactive_list);
|
||||
if (!batch_size && !released)
|
||||
break;
|
||||
handled += batch_size;
|
||||
@ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
|
||||
{
|
||||
struct r5conf *conf = mddev->private;
|
||||
int err;
|
||||
int hash;
|
||||
|
||||
if (size <= 16 || size > 32768)
|
||||
return -EINVAL;
|
||||
hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
|
||||
while (size < conf->max_nr_stripes) {
|
||||
if (drop_one_stripe(conf))
|
||||
if (drop_one_stripe(conf, hash))
|
||||
conf->max_nr_stripes--;
|
||||
else
|
||||
break;
|
||||
hash--;
|
||||
if (hash < 0)
|
||||
hash = NR_STRIPE_HASH_LOCKS - 1;
|
||||
}
|
||||
err = md_allow_write(mddev);
|
||||
if (err)
|
||||
return err;
|
||||
hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
|
||||
while (size > conf->max_nr_stripes) {
|
||||
if (grow_one_stripe(conf))
|
||||
if (grow_one_stripe(conf, hash))
|
||||
conf->max_nr_stripes++;
|
||||
else break;
|
||||
hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int alloc_thread_groups(struct r5conf *conf, int cnt);
|
||||
static int alloc_thread_groups(struct r5conf *conf, int cnt,
|
||||
int *group_cnt,
|
||||
int *worker_cnt_per_group,
|
||||
struct r5worker_group **worker_groups);
|
||||
static ssize_t
|
||||
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
|
||||
{
|
||||
struct r5conf *conf = mddev->private;
|
||||
unsigned long new;
|
||||
int err;
|
||||
struct r5worker_group *old_groups;
|
||||
int old_group_cnt;
|
||||
struct r5worker_group *new_groups, *old_groups;
|
||||
int group_cnt, worker_cnt_per_group;
|
||||
|
||||
if (len >= PAGE_SIZE)
|
||||
return -EINVAL;
|
||||
@ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
|
||||
mddev_suspend(mddev);
|
||||
|
||||
old_groups = conf->worker_groups;
|
||||
old_group_cnt = conf->worker_cnt_per_group;
|
||||
if (old_groups)
|
||||
flush_workqueue(raid5_wq);
|
||||
|
||||
err = alloc_thread_groups(conf, new,
|
||||
&group_cnt, &worker_cnt_per_group,
|
||||
&new_groups);
|
||||
if (!err) {
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
conf->group_cnt = group_cnt;
|
||||
conf->worker_cnt_per_group = worker_cnt_per_group;
|
||||
conf->worker_groups = new_groups;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
|
||||
conf->worker_groups = NULL;
|
||||
err = alloc_thread_groups(conf, new);
|
||||
if (err) {
|
||||
conf->worker_groups = old_groups;
|
||||
conf->worker_cnt_per_group = old_group_cnt;
|
||||
} else {
|
||||
if (old_groups)
|
||||
kfree(old_groups[0].workers);
|
||||
kfree(old_groups);
|
||||
@ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = {
|
||||
.attrs = raid5_attrs,
|
||||
};
|
||||
|
||||
static int alloc_thread_groups(struct r5conf *conf, int cnt)
|
||||
static int alloc_thread_groups(struct r5conf *conf, int cnt,
|
||||
int *group_cnt,
|
||||
int *worker_cnt_per_group,
|
||||
struct r5worker_group **worker_groups)
|
||||
{
|
||||
int i, j;
|
||||
int i, j, k;
|
||||
ssize_t size;
|
||||
struct r5worker *workers;
|
||||
|
||||
conf->worker_cnt_per_group = cnt;
|
||||
*worker_cnt_per_group = cnt;
|
||||
if (cnt == 0) {
|
||||
conf->worker_groups = NULL;
|
||||
*group_cnt = 0;
|
||||
*worker_groups = NULL;
|
||||
return 0;
|
||||
}
|
||||
conf->group_cnt = num_possible_nodes();
|
||||
*group_cnt = num_possible_nodes();
|
||||
size = sizeof(struct r5worker) * cnt;
|
||||
workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
|
||||
conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
|
||||
conf->group_cnt, GFP_NOIO);
|
||||
if (!conf->worker_groups || !workers) {
|
||||
workers = kzalloc(size * *group_cnt, GFP_NOIO);
|
||||
*worker_groups = kzalloc(sizeof(struct r5worker_group) *
|
||||
*group_cnt, GFP_NOIO);
|
||||
if (!*worker_groups || !workers) {
|
||||
kfree(workers);
|
||||
kfree(conf->worker_groups);
|
||||
conf->worker_groups = NULL;
|
||||
kfree(*worker_groups);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (i = 0; i < conf->group_cnt; i++) {
|
||||
for (i = 0; i < *group_cnt; i++) {
|
||||
struct r5worker_group *group;
|
||||
|
||||
group = &conf->worker_groups[i];
|
||||
group = worker_groups[i];
|
||||
INIT_LIST_HEAD(&group->handle_list);
|
||||
group->conf = conf;
|
||||
group->workers = workers + i * cnt;
|
||||
|
||||
for (j = 0; j < cnt; j++) {
|
||||
group->workers[j].group = group;
|
||||
INIT_WORK(&group->workers[j].work, raid5_do_work);
|
||||
struct r5worker *worker = group->workers + j;
|
||||
worker->group = group;
|
||||
INIT_WORK(&worker->work, raid5_do_work);
|
||||
|
||||
for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
|
||||
INIT_LIST_HEAD(worker->temp_inactive_list + k);
|
||||
}
|
||||
}
|
||||
|
||||
@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
struct md_rdev *rdev;
|
||||
struct disk_info *disk;
|
||||
char pers_name[6];
|
||||
int i;
|
||||
int group_cnt, worker_cnt_per_group;
|
||||
struct r5worker_group *new_group;
|
||||
|
||||
if (mddev->new_level != 5
|
||||
&& mddev->new_level != 4
|
||||
@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
if (conf == NULL)
|
||||
goto abort;
|
||||
/* Don't enable multi-threading by default*/
|
||||
if (alloc_thread_groups(conf, 0))
|
||||
if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
|
||||
&new_group)) {
|
||||
conf->group_cnt = group_cnt;
|
||||
conf->worker_cnt_per_group = worker_cnt_per_group;
|
||||
conf->worker_groups = new_group;
|
||||
} else
|
||||
goto abort;
|
||||
spin_lock_init(&conf->device_lock);
|
||||
seqcount_init(&conf->gen_lock);
|
||||
@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
INIT_LIST_HEAD(&conf->hold_list);
|
||||
INIT_LIST_HEAD(&conf->delayed_list);
|
||||
INIT_LIST_HEAD(&conf->bitmap_list);
|
||||
INIT_LIST_HEAD(&conf->inactive_list);
|
||||
init_llist_head(&conf->released_stripes);
|
||||
atomic_set(&conf->active_stripes, 0);
|
||||
atomic_set(&conf->preread_active_stripes, 0);
|
||||
@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
|
||||
goto abort;
|
||||
|
||||
/* We init hash_locks[0] separately so that it can be used
|
||||
* as the reference lock in the spin_lock_nest_lock() call
|
||||
* in lock_all_device_hash_locks_irq in order to convince
|
||||
* lockdep that we know what we are doing.
|
||||
*/
|
||||
spin_lock_init(conf->hash_locks);
|
||||
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
|
||||
spin_lock_init(conf->hash_locks + i);
|
||||
|
||||
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
|
||||
INIT_LIST_HEAD(conf->inactive_list + i);
|
||||
|
||||
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
|
||||
INIT_LIST_HEAD(conf->temp_inactive_list + i);
|
||||
|
||||
conf->level = mddev->new_level;
|
||||
if (raid5_alloc_percpu(conf) != 0)
|
||||
goto abort;
|
||||
@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
else
|
||||
conf->max_degraded = 1;
|
||||
conf->algorithm = mddev->new_layout;
|
||||
conf->max_nr_stripes = NR_STRIPES;
|
||||
conf->reshape_progress = mddev->reshape_position;
|
||||
if (conf->reshape_progress != MaxSector) {
|
||||
conf->prev_chunk_sectors = mddev->chunk_sectors;
|
||||
@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
|
||||
memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
|
||||
max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
|
||||
if (grow_stripes(conf, conf->max_nr_stripes)) {
|
||||
atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
|
||||
if (grow_stripes(conf, NR_STRIPES)) {
|
||||
printk(KERN_ERR
|
||||
"md/raid:%s: couldn't allocate %dkB for buffers\n",
|
||||
mdname(mddev), memory);
|
||||
@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev)
|
||||
if (!mddev->sync_thread) {
|
||||
mddev->recovery = 0;
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
write_seqcount_begin(&conf->gen_lock);
|
||||
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
|
||||
mddev->new_chunk_sectors =
|
||||
conf->chunk_sectors = conf->prev_chunk_sectors;
|
||||
mddev->new_layout = conf->algorithm = conf->prev_algo;
|
||||
rdev_for_each(rdev, mddev)
|
||||
rdev->new_data_offset = rdev->data_offset;
|
||||
smp_wmb();
|
||||
conf->generation --;
|
||||
conf->reshape_progress = MaxSector;
|
||||
mddev->reshape_position = MaxSector;
|
||||
write_seqcount_end(&conf->gen_lock);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
return -EAGAIN;
|
||||
}
|
||||
@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
|
||||
break;
|
||||
|
||||
case 1: /* stop all writes */
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
lock_all_device_hash_locks_irq(conf);
|
||||
/* '2' tells resync/reshape to pause so that all
|
||||
* active stripes can drain
|
||||
*/
|
||||
conf->quiesce = 2;
|
||||
wait_event_lock_irq(conf->wait_for_stripe,
|
||||
wait_event_cmd(conf->wait_for_stripe,
|
||||
atomic_read(&conf->active_stripes) == 0 &&
|
||||
atomic_read(&conf->active_aligned_reads) == 0,
|
||||
conf->device_lock);
|
||||
unlock_all_device_hash_locks_irq(conf),
|
||||
lock_all_device_hash_locks_irq(conf));
|
||||
conf->quiesce = 1;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
unlock_all_device_hash_locks_irq(conf);
|
||||
/* allow reshape to continue */
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
break;
|
||||
|
||||
case 0: /* re-enable writes */
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
lock_all_device_hash_locks_irq(conf);
|
||||
conf->quiesce = 0;
|
||||
wake_up(&conf->wait_for_stripe);
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
unlock_all_device_hash_locks_irq(conf);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -205,6 +205,7 @@ struct stripe_head {
|
||||
short pd_idx; /* parity disk index */
|
||||
short qd_idx; /* 'Q' disk index for raid6 */
|
||||
short ddf_layout;/* use DDF ordering to calculate Q */
|
||||
short hash_lock_index;
|
||||
unsigned long state; /* state flags */
|
||||
atomic_t count; /* nr of active thread/requests */
|
||||
int bm_seq; /* sequence number for bitmap flushes */
|
||||
@ -367,9 +368,18 @@ struct disk_info {
|
||||
struct md_rdev *rdev, *replacement;
|
||||
};
|
||||
|
||||
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
|
||||
* This is because we sometimes take all the spinlocks
|
||||
* and creating that much locking depth can cause
|
||||
* problems.
|
||||
*/
|
||||
#define NR_STRIPE_HASH_LOCKS 8
|
||||
#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
|
||||
|
||||
struct r5worker {
|
||||
struct work_struct work;
|
||||
struct r5worker_group *group;
|
||||
struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
|
||||
bool working;
|
||||
};
|
||||
|
||||
@ -382,6 +392,8 @@ struct r5worker_group {
|
||||
|
||||
struct r5conf {
|
||||
struct hlist_head *stripe_hashtbl;
|
||||
/* only protect corresponding hash list and inactive_list */
|
||||
spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
|
||||
struct mddev *mddev;
|
||||
int chunk_sectors;
|
||||
int level, algorithm;
|
||||
@ -462,7 +474,8 @@ struct r5conf {
|
||||
* Free stripes pool
|
||||
*/
|
||||
atomic_t active_stripes;
|
||||
struct list_head inactive_list;
|
||||
struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
|
||||
atomic_t empty_inactive_list_nr;
|
||||
struct llist_head released_stripes;
|
||||
wait_queue_head_t wait_for_stripe;
|
||||
wait_queue_head_t wait_for_overlap;
|
||||
@ -477,6 +490,7 @@ struct r5conf {
|
||||
* the new thread here until we fully activate the array.
|
||||
*/
|
||||
struct md_thread *thread;
|
||||
struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
|
||||
struct r5worker_group *worker_groups;
|
||||
int group_cnt;
|
||||
int worker_cnt_per_group;
|
||||
|
@ -278,6 +278,31 @@ do { \
|
||||
__ret; \
|
||||
})
|
||||
|
||||
#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
|
||||
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
|
||||
cmd1; schedule(); cmd2)
|
||||
|
||||
/**
|
||||
* wait_event_cmd - sleep until a condition gets true
|
||||
* @wq: the waitqueue to wait on
|
||||
* @condition: a C expression for the event to wait for
|
||||
* @cmd1: the command will be executed before sleep
|
||||
* @cmd2: the command will be executed after sleep
|
||||
*
|
||||
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
|
||||
* @condition evaluates to true. The @condition is checked each time
|
||||
* the waitqueue @wq is woken up.
|
||||
*
|
||||
* wake_up() has to be called after changing any variable that could
|
||||
* change the result of the wait condition.
|
||||
*/
|
||||
#define wait_event_cmd(wq, condition, cmd1, cmd2) \
|
||||
do { \
|
||||
if (condition) \
|
||||
break; \
|
||||
__wait_event_cmd(wq, condition, cmd1, cmd2); \
|
||||
} while (0)
|
||||
|
||||
#define __wait_event_interruptible(wq, condition) \
|
||||
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
|
||||
schedule())
|
||||
|
@ -16,6 +16,7 @@
|
||||
#define _MD_P_H
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <asm/byteorder.h>
|
||||
|
||||
/*
|
||||
* RAID superblock.
|
||||
|
Loading…
Reference in New Issue
Block a user