md: introduce a new struct for IO serialization

Obviously, IO serialization can degrade performance considerably.
To reduce the degradation, an rb interval tree is added in raid1
to speed up the collision check.

So an rb root is needed in md_rdev. Abstract all the
serialization-related members into a new struct (serial_in_rdev)
and embed it into md_rdev.

Of course, we need to free the struct when it is no longer
needed, so rdev/rdevs_uninit_serial are added accordingly.
They should be called when the memory pool is destroyed or
when memory cannot be allocated.

We also need to call mddev_destroy_serial_pool when
serialize_policy/write-behind is disabled, when the bitmap
is destroyed, or in __md_stop_writes.

Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
This commit is contained in:
Guoqing Jiang 2019-12-23 10:49:00 +01:00 committed by Song Liu
parent 4d26d32fe4
commit 69b00b5bb2
4 changed files with 117 additions and 64 deletions

View File

@ -1789,10 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return; return;
md_bitmap_wait_behind_writes(mddev); md_bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy) { if (!mddev->serialize_policy)
mempool_destroy(mddev->serial_info_pool); mddev_destroy_serial_pool(mddev, NULL, true);
mddev->serial_info_pool = NULL;
}
mutex_lock(&mddev->bitmap_info.mutex); mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock); spin_lock(&mddev->lock);
@ -2478,10 +2476,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev->bitmap_info.max_write_behind = backlog; mddev->bitmap_info.max_write_behind = backlog;
if (!backlog && mddev->serial_info_pool) { if (!backlog && mddev->serial_info_pool) {
/* serial_info_pool is not needed if backlog is zero */ /* serial_info_pool is not needed if backlog is zero */
if (!mddev->serialize_policy) { if (!mddev->serialize_policy)
mempool_destroy(mddev->serial_info_pool); mddev_destroy_serial_pool(mddev, NULL, false);
mddev->serial_info_pool = NULL;
}
} else if (backlog && !mddev->serial_info_pool) { } else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */ /* serial_info_pool is needed since backlog is not zero */
struct md_rdev *rdev; struct md_rdev *rdev;

View File

@ -125,25 +125,59 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max; mddev->sync_speed_max : sysctl_speed_limit_max;
} }
static int rdev_init_serial(struct md_rdev *rdev) static void rdev_uninit_serial(struct md_rdev *rdev)
{ {
spin_lock_init(&rdev->serial_list_lock); if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
INIT_LIST_HEAD(&rdev->serial_list); return;
init_waitqueue_head(&rdev->serial_io_wait);
set_bit(CollisionCheck, &rdev->flags);
return 1; kfree(rdev->serial);
rdev->serial = NULL;
} }
static void rdevs_init_serial(struct mddev *mddev) static void rdevs_uninit_serial(struct mddev *mddev)
{ {
struct md_rdev *rdev; struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
rdev_uninit_serial(rdev);
}
/*
 * Set up the IO serialization state of one rdev.
 *
 * If CollisionCheck is already set on @rdev nothing is done. Otherwise a
 * struct serial_in_rdev is allocated, its lock, rb tree root and wait queue
 * are initialised, it is stored in rdev->serial and CollisionCheck is set.
 *
 * Returns 0 on success (including the already-initialised case) or -ENOMEM
 * when the allocation fails; in the failure case @rdev is left untouched.
 */
static int rdev_init_serial(struct md_rdev *rdev)
{
	struct serial_in_rdev *new_serial;

	if (!test_bit(CollisionCheck, &rdev->flags)) {
		new_serial = kmalloc(sizeof(*new_serial), GFP_KERNEL);
		if (!new_serial)
			return -ENOMEM;

		new_serial->serial_rb = RB_ROOT_CACHED;
		spin_lock_init(&new_serial->serial_lock);
		init_waitqueue_head(&new_serial->serial_io_wait);

		/* Publish the state before flagging the rdev as checked. */
		rdev->serial = new_serial;
		set_bit(CollisionCheck, &rdev->flags);
	}

	return 0;
}
static int rdevs_init_serial(struct mddev *mddev)
{
struct md_rdev *rdev;
int ret = 0;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if (test_bit(CollisionCheck, &rdev->flags)) ret = rdev_init_serial(rdev);
continue; if (ret)
rdev_init_serial(rdev); break;
} }
/* Free all resources if pool is not existed */
if (ret && !mddev->serial_info_pool)
rdevs_uninit_serial(mddev);
return ret;
} }
/* /*
@ -166,6 +200,8 @@ static int rdev_need_serial(struct md_rdev *rdev)
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend) bool is_suspend)
{ {
int ret = 0;
if (rdev && !rdev_need_serial(rdev) && if (rdev && !rdev_need_serial(rdev) &&
!test_bit(CollisionCheck, &rdev->flags)) !test_bit(CollisionCheck, &rdev->flags))
return; return;
@ -174,9 +210,11 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mddev_suspend(mddev); mddev_suspend(mddev);
if (!rdev) if (!rdev)
rdevs_init_serial(mddev); ret = rdevs_init_serial(mddev);
else else
rdev_init_serial(rdev); ret = rdev_init_serial(rdev);
if (ret)
goto abort;
if (mddev->serial_info_pool == NULL) { if (mddev->serial_info_pool == NULL) {
unsigned int noio_flag; unsigned int noio_flag;
@ -186,9 +224,13 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mempool_create_kmalloc_pool(NR_SERIAL_INFOS, mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
sizeof(struct serial_info)); sizeof(struct serial_info));
memalloc_noio_restore(noio_flag); memalloc_noio_restore(noio_flag);
if (!mddev->serial_info_pool) if (!mddev->serial_info_pool) {
rdevs_uninit_serial(mddev);
pr_err("can't alloc memory pool for serialization\n"); pr_err("can't alloc memory pool for serialization\n");
}
} }
abort:
if (!is_suspend) if (!is_suspend)
mddev_resume(mddev); mddev_resume(mddev);
} }
@ -199,8 +241,8 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
* 2. when bitmap is destroyed while policy is not enabled. * 2. when bitmap is destroyed while policy is not enabled.
* 3. for disable policy, the pool is destroyed only when no rdev needs it. * 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/ */
static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend) bool is_suspend)
{ {
if (rdev && !test_bit(CollisionCheck, &rdev->flags)) if (rdev && !test_bit(CollisionCheck, &rdev->flags))
return; return;
@ -213,8 +255,9 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mddev_suspend(mddev); mddev_suspend(mddev);
rdev_for_each(temp, mddev) { rdev_for_each(temp, mddev) {
if (!rdev) { if (!rdev) {
if (!rdev_need_serial(temp)) if (!mddev->serialize_policy ||
clear_bit(CollisionCheck, &temp->flags); !rdev_need_serial(temp))
rdev_uninit_serial(temp);
else else
num++; num++;
} else if (temp != rdev && } else if (temp != rdev &&
@ -223,7 +266,7 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
} }
if (rdev) if (rdev)
clear_bit(CollisionCheck, &rdev->flags); rdev_uninit_serial(rdev);
if (num) if (num)
pr_info("The mempool could be used by other devices\n"); pr_info("The mempool could be used by other devices\n");
@ -6117,8 +6160,9 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1; mddev->in_sync = 1;
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
} }
mempool_destroy(mddev->serial_info_pool); /* disable policy to guarantee rdevs free resources for serialization */
mddev->serial_info_pool = NULL; mddev->serialize_policy = 0;
mddev_destroy_serial_pool(mddev, NULL, true);
} }
void md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev)

View File

@ -32,6 +32,16 @@
* be retried. * be retried.
*/ */
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
/*
 * Per-rdev state used to serialize IO.
 *
 * struct md_rdev refers to it through its ->serial pointer; it is allocated
 * by rdev_init_serial() and freed by rdev_uninit_serial().
 */
struct serial_in_rdev {
/* root of the interval tree of in-flight ranges (struct serial_info nodes) */
struct rb_root_cached serial_rb;
/* taken with irqsave around every lookup/insert/remove on serial_rb */
spinlock_t serial_lock;
/* writers wait here until their range can be added; woken when a range is removed */
wait_queue_head_t serial_io_wait;
};
/* /*
* MD's 'extended' device * MD's 'extended' device
*/ */
@ -110,12 +120,7 @@ struct md_rdev {
* in superblock. * in superblock.
*/ */
/* struct serial_in_rdev *serial; /* used for raid1 io serialization */
* The members for check collision of write IOs.
*/
struct list_head serial_list;
spinlock_t serial_list_lock;
wait_queue_head_t serial_io_wait;
struct work_struct del_work; /* used for delayed sysfs removal */ struct work_struct del_work; /* used for delayed sysfs removal */
@ -266,9 +271,10 @@ enum mddev_sb_flags {
#define NR_SERIAL_INFOS 8 #define NR_SERIAL_INFOS 8
/* record current range of serialize IOs */ /* record current range of serialize IOs */
struct serial_info { struct serial_info {
sector_t lo; struct rb_node node;
sector_t hi; sector_t start; /* start sector of rb node */
struct list_head list; sector_t last; /* end sector of rb node */
sector_t _subtree_last; /* highest sector in subtree of rb node */
}; };
struct mddev { struct mddev {
@ -740,6 +746,8 @@ extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend); bool is_suspend);
extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);

View File

@ -29,6 +29,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include <linux/interval_tree_generic.h>
#include <trace/events/block.h> #include <trace/events/block.h>
@ -50,55 +51,58 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c" #include "raid1-10.c"
/*
 * Instantiate the generic interval tree for struct serial_info.
 *
 * START/LAST pick the interval bounds out of a node, and _subtree_last is
 * the field the tree uses for its per-subtree maximum. The raid1_rb prefix
 * yields the static inline helpers used below: raid1_rb_insert(),
 * raid1_rb_remove(), raid1_rb_iter_first() and raid1_rb_iter_next().
 *
 * NOTE(review): the generic interval tree appears to treat [start, last] as
 * a closed interval -- confirm that callers pass the intended last sector.
 */
#define START(node) ((node)->start)
#define LAST(node) ((node)->last)
INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
START, LAST, static inline, raid1_rb);
static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{ {
struct serial_info *wi, *temp_wi; struct serial_info *si;
unsigned long flags; unsigned long flags;
int ret = 0; int ret = 0;
struct mddev *mddev = rdev->mddev; struct mddev *mddev = rdev->mddev;
struct serial_in_rdev *serial = rdev->serial;
wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
spin_lock_irqsave(&rdev->serial_list_lock, flags);
list_for_each_entry(temp_wi, &rdev->serial_list, list) {
/* collision happened */
if (hi > temp_wi->lo && lo < temp_wi->hi) {
ret = -EBUSY;
break;
}
}
spin_lock_irqsave(&serial->serial_lock, flags);
/* collision happened */
if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
ret = -EBUSY;
if (!ret) { if (!ret) {
wi->lo = lo; si->start = lo;
wi->hi = hi; si->last = hi;
list_add(&wi->list, &rdev->serial_list); raid1_rb_insert(si, &serial->serial_rb);
} else } else
mempool_free(wi, mddev->serial_info_pool); mempool_free(si, mddev->serial_info_pool);
spin_unlock_irqrestore(&rdev->serial_list_lock, flags); spin_unlock_irqrestore(&serial->serial_lock, flags);
return ret; return ret;
} }
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{ {
struct serial_info *wi; struct serial_info *si;
unsigned long flags; unsigned long flags;
int found = 0; int found = 0;
struct mddev *mddev = rdev->mddev; struct mddev *mddev = rdev->mddev;
struct serial_in_rdev *serial = rdev->serial;
spin_lock_irqsave(&rdev->serial_list_lock, flags); spin_lock_irqsave(&serial->serial_lock, flags);
list_for_each_entry(wi, &rdev->serial_list, list) for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
if (hi == wi->hi && lo == wi->lo) { si; si = raid1_rb_iter_next(si, lo, hi)) {
list_del(&wi->list); if (si->start == lo && si->last == hi) {
mempool_free(wi, mddev->serial_info_pool); raid1_rb_remove(si, &serial->serial_rb);
mempool_free(si, mddev->serial_info_pool);
found = 1; found = 1;
break; break;
} }
}
if (!found) if (!found)
WARN(1, "The write IO is not recorded for serialization\n"); WARN(1, "The write IO is not recorded for serialization\n");
spin_unlock_irqrestore(&rdev->serial_list_lock, flags); spin_unlock_irqrestore(&serial->serial_lock, flags);
wake_up(&rdev->serial_io_wait); wake_up(&serial->serial_io_wait);
} }
/* /*
@ -1482,6 +1486,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (i = 0; i < disks; i++) { for (i = 0; i < disks; i++) {
struct bio *mbio = NULL; struct bio *mbio = NULL;
struct md_rdev *rdev = conf->mirrors[i].rdev; struct md_rdev *rdev = conf->mirrors[i].rdev;
struct serial_in_rdev *serial = rdev->serial;
if (!r1_bio->bios[i]) if (!r1_bio->bios[i])
continue; continue;
@ -1510,13 +1515,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (r1_bio->behind_master_bio) { if (r1_bio->behind_master_bio) {
if (test_bit(CollisionCheck, &rdev->flags)) if (test_bit(CollisionCheck, &rdev->flags))
wait_event(rdev->serial_io_wait, wait_event(serial->serial_io_wait,
check_and_add_serial(rdev, lo, hi) check_and_add_serial(rdev, lo, hi)
== 0); == 0);
if (test_bit(WriteMostly, &rdev->flags)) if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining); atomic_inc(&r1_bio->behind_remaining);
} else if (mddev->serialize_policy) } else if (mddev->serialize_policy)
wait_event(rdev->serial_io_wait, wait_event(serial->serial_io_wait,
check_and_add_serial(rdev, lo, hi) == 0); check_and_add_serial(rdev, lo, hi) == 0);
r1_bio->bios[i] = mbio; r1_bio->bios[i] = mbio;