md updates for 4.4

Merge tag 'md/4.4' of git://neil.brown.name/md (commit ac322de6bf)

Pull md updates from Neil Brown:
 "Two major components to this update.

  1) The clustered-raid1 support from SUSE is nearly complete. There are
     a few outstanding issues being worked on. Maybe half a dozen patches
     will bring this to a usable state.

  2) The first stage of journalled-raid5 support from Facebook makes an
     appearance. With a journal device configured (typically NVRAM or
     SSD), the "RAID5 write hole" should be closed - a crash during
     degraded operations cannot result in data corruption.

     The next stage will be to use the journal as a write-behind cache
     so that latency can be reduced and in some cases throughput
     increased by performing more full-stripe writes."

* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
  MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
  MD: set journal disk ->raid_disk
  MD: kick out journal disk if it's not fresh
  raid5-cache: start raid5 readonly if journal is missing
  MD: add new bit to indicate raid array with journal
  raid5-cache: IO error handling
  raid5: journal disk can't be removed
  raid5-cache: add trim support for log
  MD: fix info output for journal disk
  raid5-cache: use bio chaining
  raid5-cache: small log->seq cleanup
  raid5-cache: new helper: r5_reserve_log_entry
  raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
  raid5-cache: take rdev->data_offset into account early on
  raid5-cache: refactor bio allocation
  raid5-cache: clean up r5l_get_meta
  raid5-cache: simplify state machine when caches flushes are not needed
  raid5-cache: factor out a helper to run all stripes for an I/O unit
  raid5-cache: rename flushed_ios to finished_ios
  raid5-cache: free I/O units earlier
  ...
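The journal closes the write hole by making the new data and parity durable in a log before the in-place stripe update begins, so after a crash the log can be replayed forward and a stripe is never left half-updated. Below is a minimal userspace sketch of that write-ahead ordering only; it is not the raid5-cache code, and the file names, record layout and offsets are invented for the example.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Toy log record: which stripe is being rewritten and its new contents. */
struct log_rec {
	uint64_t stripe;      /* stripe number (illustrative) */
	char     data[512];   /* new data block */
	char     parity[512]; /* new parity block */
};

/*
 * Write-ahead update: append the record to the journal and flush it to
 * stable storage *before* touching the array in place.  If the machine
 * crashes between the two steps, replaying the journal reproduces a
 * consistent data+parity pair instead of a torn stripe.
 */
static int journalled_stripe_write(int journal_fd, int array_fd,
				   const struct log_rec *rec, off_t stripe_off)
{
	if (write(journal_fd, rec, sizeof(*rec)) != sizeof(*rec))
		return -1;
	if (fsync(journal_fd))			/* barrier: log is durable */
		return -1;

	if (pwrite(array_fd, rec->data, sizeof(rec->data), stripe_off) < 0)
		return -1;
	if (pwrite(array_fd, rec->parity, sizeof(rec->parity),
		   stripe_off + sizeof(rec->data)) < 0)
		return -1;
	return fsync(array_fd);			/* then the log entry may be retired */
}

int main(void)
{
	/* "array.img" and "journal.img" are placeholder files for the demo. */
	int array_fd = open("array.img", O_RDWR | O_CREAT, 0600);
	int journal_fd = open("journal.img", O_WRONLY | O_CREAT | O_APPEND, 0600);
	struct log_rec rec = { .stripe = 7 };

	memcpy(rec.data, "new data", 8);
	memcpy(rec.parity, "new parity", 10);
	if (array_fd < 0 || journal_fd < 0 ||
	    journalled_stripe_write(journal_fd, array_fd, &rec, 7 * 1024) != 0)
		perror("journalled_stripe_write");
	if (journal_fd >= 0)
		close(journal_fd);
	if (array_fd >= 0)
		close(array_fd);
	return 0;
}

The essential point is the durability barrier on the journal before the first in-place write; roughly speaking, the kernel code gets the same ordering from flush/FUA writes to the journal device rather than fsync().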
drivers/md/Makefile
@@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 md-mod-y	+= md.o bitmap.o
-raid456-y	+= raid5.o
+raid456-y	+= raid5.o raid5-cache.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise

drivers/md/bitmap.c
@ -613,12 +613,10 @@ re_read:
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
-	/* XXX: This is a hack to ensure that we don't use clustering
-	 * in case:
-	 *	- dm-raid is in use and
-	 *	- the nodes written in bitmap_sb is erroneous.
+	/* Setup nodes/clustername only if bitmap version is
+	 * cluster-compatible
 	 */
-	if (!bitmap->mddev->sync_super) {
+	if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
 		nodes = le32_to_cpu(sb->nodes);
 		strlcpy(bitmap->mddev->bitmap_info.cluster_name,
 				sb->cluster_name, 64);
@@ -628,7 +626,7 @@ re_read:
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
 		reason = "bad magic";
 	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
-		 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
+		 le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
 		reason = "unrecognized superblock version";
 	else if (chunksize < 512)
 		reason = "bitmap chunksize too small";
@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
 }
 EXPORT_SYMBOL(bitmap_close_sync);
 
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
 {
 	sector_t s = 0;
 	sector_t blocks;
@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 		bitmap->last_end_sync = jiffies;
 		return;
 	}
-	if (time_before(jiffies, (bitmap->last_end_sync
+	if (!force && time_before(jiffies, (bitmap->last_end_sync
 			+ bitmap->mddev->bitmap_info.daemon_sleep)))
 		return;
 	wait_event(bitmap->mddev->recovery_wait,
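The new force parameter lets a caller bypass the daemon_sleep rate limit when the sync state must be written out immediately. A stand-alone sketch of the same "throttled unless forced" idiom, with illustrative names and timing rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static time_t last_flush;                 /* analogous to bitmap->last_end_sync */
static const double MIN_INTERVAL = 5.0;   /* analogous to daemon_sleep */

/* Flush only if forced or if enough time has passed since the last flush. */
static void cond_flush(bool force)
{
	time_t now = time(NULL);

	if (!force && difftime(now, last_flush) < MIN_INTERVAL)
		return;                   /* rate-limited: skip this call */
	last_flush = now;
	printf("flushing (force=%d)\n", force);
}

int main(void)
{
	cond_flush(false);   /* first call flushes */
	cond_flush(false);   /* too soon: skipped */
	cond_flush(true);    /* forced: flushes regardless of the interval */
	return 0;
}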
drivers/md/bitmap.h
@@ -9,8 +9,10 @@
 #define BITMAP_MAJOR_LO 3
 /* version 4 insists the bitmap is in little-endian order
  * with version 3, it is host-endian which is non-portable
+ * Version 5 is currently set only for clustered devices
  */
 #define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_CLUSTERED 5
 #define BITMAP_MAJOR_HOSTENDIAN 3
 
 /*
@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
 int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
 void bitmap_close_sync(struct bitmap *bitmap);
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
 
 void bitmap_unplug(struct bitmap *bitmap);
 void bitmap_daemon_work(struct mddev *mddev);

drivers/md/md-cluster.c
@ -28,6 +28,7 @@ struct dlm_lock_resource {
|
||||
struct completion completion; /* completion for synchronized locking */
|
||||
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
|
||||
struct mddev *mddev; /* pointing back to mddev. */
|
||||
int mode;
|
||||
};
|
||||
|
||||
struct suspend_info {
|
||||
@ -53,8 +54,8 @@ struct md_cluster_info {
|
||||
dlm_lockspace_t *lockspace;
|
||||
int slot_number;
|
||||
struct completion completion;
|
||||
struct mutex sb_mutex;
|
||||
struct dlm_lock_resource *bitmap_lockres;
|
||||
struct dlm_lock_resource *resync_lockres;
|
||||
struct list_head suspend_list;
|
||||
spinlock_t suspend_lock;
|
||||
struct md_thread *recovery_thread;
|
||||
@@ -79,20 +80,20 @@ enum msg_type {
 };
 
 struct cluster_msg {
-	int type;
-	int slot;
+	__le32 type;
+	__le32 slot;
 	/* TODO: Unionize this for smaller footprint */
-	sector_t low;
-	sector_t high;
+	__le64 low;
+	__le64 high;
 	char uuid[16];
-	int raid_slot;
+	__le32 raid_slot;
 };
 
 static void sync_ast(void *arg)
 {
 	struct dlm_lock_resource *res;
 
-	res = (struct dlm_lock_resource *) arg;
+	res = arg;
 	complete(&res->completion);
 }
 
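Switching the cluster_msg fields from int/sector_t to __le32/__le64 fixes the on-wire format: senders convert with cpu_to_le32()/cpu_to_le64() and receivers convert back with le32_to_cpu()/le64_to_cpu(), so nodes with different CPU endianness agree on the message layout. A userspace analogue using <endian.h> is sketched below; the struct and helper names are invented for the example and are not the kernel API.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Wire format: every multi-byte field is stored little-endian. */
struct wire_msg {
	uint32_t type;       /* little-endian on the wire */
	uint32_t slot;
	uint64_t low;
	uint64_t high;
};

static void pack_msg(struct wire_msg *out, uint32_t type, uint32_t slot,
		     uint64_t low, uint64_t high)
{
	out->type = htole32(type);     /* cpu_to_le32() analogue */
	out->slot = htole32(slot);
	out->low  = htole64(low);
	out->high = htole64(high);
}

static void unpack_msg(const struct wire_msg *in)
{
	/* le32toh()/le64toh() play the role of le32_to_cpu()/le64_to_cpu(). */
	printf("type=%u slot=%u low=%llu high=%llu\n",
	       le32toh(in->type), le32toh(in->slot),
	       (unsigned long long)le64toh(in->low),
	       (unsigned long long)le64toh(in->high));
}

int main(void)
{
	struct wire_msg msg;

	pack_msg(&msg, 2 /* e.g. a RESYNCING-style message */, 0, 0, 1024);
	unpack_msg(&msg);    /* same result on big- and little-endian hosts */
	return 0;
}

The same discipline shows up in the hunks that follow, where every raw msg->slot or msg->raid_slot access gains a le32_to_cpu() wrapper.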
@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
|
||||
if (ret)
|
||||
return ret;
|
||||
wait_for_completion(&res->completion);
|
||||
if (res->lksb.sb_status == 0)
|
||||
res->mode = mode;
|
||||
return res->lksb.sb_status;
|
||||
}
|
||||
|
||||
@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
|
||||
init_completion(&res->completion);
|
||||
res->ls = cinfo->lockspace;
|
||||
res->mddev = mddev;
|
||||
res->mode = DLM_LOCK_IV;
|
||||
namelen = strlen(name);
|
||||
res->name = kzalloc(namelen + 1, GFP_KERNEL);
|
||||
if (!res->name) {
|
||||
@ -191,8 +195,8 @@ retry:
|
||||
kfree(res);
|
||||
}
|
||||
|
||||
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
|
||||
sector_t lo, sector_t hi)
|
||||
static void add_resync_info(struct dlm_lock_resource *lockres,
|
||||
sector_t lo, sector_t hi)
|
||||
{
|
||||
struct resync_info *ri;
|
||||
|
||||
@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc
|
||||
dlm_lock_sync(lockres, DLM_LOCK_CR);
|
||||
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
|
||||
hi = le64_to_cpu(ri.hi);
|
||||
if (ri.hi > 0) {
|
||||
if (hi > 0) {
|
||||
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
|
||||
if (!s)
|
||||
goto out;
|
||||
@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = {
|
||||
*/
|
||||
static void ack_bast(void *arg, int mode)
|
||||
{
|
||||
struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
|
||||
struct dlm_lock_resource *res = arg;
|
||||
struct md_cluster_info *cinfo = res->mddev->cluster_info;
|
||||
|
||||
if (mode == DLM_LOCK_EX)
|
||||
@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
|
||||
|
||||
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
|
||||
if (slot == s->slot) {
|
||||
pr_info("%s:%d Deleting suspend_info: %d\n",
|
||||
__func__, __LINE__, slot);
|
||||
list_del(&s->list);
|
||||
kfree(s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
|
||||
static void remove_suspend_info(struct mddev *mddev, int slot)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
spin_lock_irq(&cinfo->suspend_lock);
|
||||
__remove_suspend_info(cinfo, slot);
|
||||
spin_unlock_irq(&cinfo->suspend_lock);
|
||||
mddev->pers->quiesce(mddev, 2);
|
||||
}
|
||||
|
||||
|
||||
static void process_suspend_info(struct md_cluster_info *cinfo,
|
||||
static void process_suspend_info(struct mddev *mddev,
|
||||
int slot, sector_t lo, sector_t hi)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
struct suspend_info *s;
|
||||
|
||||
if (!hi) {
|
||||
remove_suspend_info(cinfo, slot);
|
||||
remove_suspend_info(mddev, slot);
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
return;
|
||||
}
|
||||
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
|
||||
@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
|
||||
s->slot = slot;
|
||||
s->lo = lo;
|
||||
s->hi = hi;
|
||||
mddev->pers->quiesce(mddev, 1);
|
||||
mddev->pers->quiesce(mddev, 0);
|
||||
spin_lock_irq(&cinfo->suspend_lock);
|
||||
/* Remove existing entry (if exists) before adding */
|
||||
__remove_suspend_info(cinfo, slot);
|
||||
list_add(&s->list, &cinfo->suspend_list);
|
||||
spin_unlock_irq(&cinfo->suspend_lock);
|
||||
mddev->pers->quiesce(mddev, 2);
|
||||
}
|
||||
|
||||
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
|
||||
@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
|
||||
|
||||
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
|
||||
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
|
||||
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
|
||||
snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
|
||||
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
|
||||
init_completion(&cinfo->newdisk_completion);
|
||||
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
|
||||
@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
|
||||
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
|
||||
md_reload_sb(mddev);
|
||||
md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
|
||||
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
|
||||
}
|
||||
|
||||
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
|
||||
{
|
||||
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
|
||||
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
|
||||
le32_to_cpu(msg->raid_slot));
|
||||
|
||||
if (rdev)
|
||||
md_kick_rdev_from_array(rdev);
|
||||
else
|
||||
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
|
||||
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
|
||||
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
|
||||
}
|
||||
|
||||
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
|
||||
{
|
||||
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
|
||||
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
|
||||
le32_to_cpu(msg->raid_slot));
|
||||
|
||||
if (rdev && test_bit(Faulty, &rdev->flags))
|
||||
clear_bit(Faulty, &rdev->flags);
|
||||
else
|
||||
pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
|
||||
pr_warn("%s: %d Could not find disk(%d) which is faulty",
|
||||
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
|
||||
}
|
||||
|
||||
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
|
||||
{
|
||||
switch (msg->type) {
|
||||
if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
|
||||
"node %d received it's own msg\n", le32_to_cpu(msg->slot)))
|
||||
return;
|
||||
switch (le32_to_cpu(msg->type)) {
|
||||
case METADATA_UPDATED:
|
||||
pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
|
||||
__func__, __LINE__, msg->slot);
|
||||
process_metadata_update(mddev, msg);
|
||||
break;
|
||||
case RESYNCING:
|
||||
pr_info("%s: %d Received message: RESYNCING from %d\n",
|
||||
__func__, __LINE__, msg->slot);
|
||||
process_suspend_info(mddev->cluster_info, msg->slot,
|
||||
msg->low, msg->high);
|
||||
process_suspend_info(mddev, le32_to_cpu(msg->slot),
|
||||
le64_to_cpu(msg->low),
|
||||
le64_to_cpu(msg->high));
|
||||
break;
|
||||
case NEWDISK:
|
||||
pr_info("%s: %d Received message: NEWDISK from %d\n",
|
||||
__func__, __LINE__, msg->slot);
|
||||
process_add_new_disk(mddev, msg);
|
||||
break;
|
||||
case REMOVE:
|
||||
pr_info("%s: %d Received REMOVE from %d\n",
|
||||
__func__, __LINE__, msg->slot);
|
||||
process_remove_disk(mddev, msg);
|
||||
break;
|
||||
case RE_ADD:
|
||||
pr_info("%s: %d Received RE_ADD from %d\n",
|
||||
__func__, __LINE__, msg->slot);
|
||||
process_readd_disk(mddev, msg);
|
||||
break;
|
||||
case BITMAP_NEEDS_SYNC:
|
||||
pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
|
||||
__func__, __LINE__, msg->slot);
|
||||
__recover_slot(mddev, msg->slot);
|
||||
__recover_slot(mddev, le32_to_cpu(msg->slot));
|
||||
break;
|
||||
default:
|
||||
pr_warn("%s:%d Received unknown message from %d\n",
|
||||
@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread)
|
||||
/* lock_comm()
|
||||
* Takes the lock on the TOKEN lock resource so no other
|
||||
* node can communicate while the operation is underway.
|
||||
* If called again, and the TOKEN lock is already in EX mode
|
||||
* return success. However, care must be taken that unlock_comm()
|
||||
* is called only once.
|
||||
*/
|
||||
static int lock_comm(struct md_cluster_info *cinfo)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (cinfo->token_lockres->mode == DLM_LOCK_EX)
|
||||
return 0;
|
||||
|
||||
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
|
||||
if (error)
|
||||
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
|
||||
@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
|
||||
|
||||
static void unlock_comm(struct md_cluster_info *cinfo)
|
||||
{
|
||||
WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
|
||||
dlm_unlock_sync(cinfo->token_lockres);
|
||||
}
|
||||
|
||||
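lock_comm()/unlock_comm() implement an "acquire once, release exactly once" protocol on the cluster-wide TOKEN lock: a second lock_comm() while the token is already held in EX mode returns immediately, and unlock_comm() warns if called without the token held. A minimal stand-alone sketch of that pattern, using a plain mutex in place of the DLM lock (none of these names are the kernel's):

#include <assert.h>
#include <pthread.h>

/* Stand-in for the TOKEN lock resource: a mutex plus a "mode" flag. */
struct token {
	pthread_mutex_t lock;
	int held;                /* analogous to mode == DLM_LOCK_EX */
};

/* Acquire the token; calling it again while held is a harmless no-op. */
static int lock_comm(struct token *t)
{
	if (t->held)
		return 0;                    /* already in "EX" mode */
	pthread_mutex_lock(&t->lock);
	t->held = 1;
	return 0;
}

/* Must be called exactly once per successful acquisition. */
static void unlock_comm(struct token *t)
{
	assert(t->held);                     /* mirrors the WARN_ON() */
	t->held = 0;
	pthread_mutex_unlock(&t->lock);
}

int main(void)
{
	struct token t = { PTHREAD_MUTEX_INITIALIZER, 0 };

	lock_comm(&t);
	lock_comm(&t);       /* second call returns early, does not deadlock */
	unlock_comm(&t);     /* single release pairs with the single acquire */
	return 0;
}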
@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes)
|
||||
init_completion(&cinfo->completion);
|
||||
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
|
||||
|
||||
mutex_init(&cinfo->sb_mutex);
|
||||
mddev->cluster_info = cinfo;
|
||||
|
||||
memset(str, 0, 64);
|
||||
@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes)
|
||||
goto err;
|
||||
}
|
||||
|
||||
cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
|
||||
if (!cinfo->resync_lockres)
|
||||
goto err;
|
||||
|
||||
ret = gather_all_resync_info(mddev, nodes);
|
||||
if (ret)
|
||||
goto err;
|
||||
@ -763,6 +778,7 @@ err:
|
||||
lockres_free(cinfo->token_lockres);
|
||||
lockres_free(cinfo->ack_lockres);
|
||||
lockres_free(cinfo->no_new_dev_lockres);
|
||||
lockres_free(cinfo->resync_lockres);
|
||||
lockres_free(cinfo->bitmap_lockres);
|
||||
if (cinfo->lockspace)
|
||||
dlm_release_lockspace(cinfo->lockspace, 2);
|
||||
@ -771,12 +787,32 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void resync_bitmap(struct mddev *mddev)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
struct cluster_msg cmsg = {0};
|
||||
int err;
|
||||
|
||||
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
|
||||
err = sendmsg(cinfo, &cmsg);
|
||||
if (err)
|
||||
pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
|
||||
__func__, __LINE__, err);
|
||||
}
|
||||
|
||||
static int leave(struct mddev *mddev)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
|
||||
if (!cinfo)
|
||||
return 0;
|
||||
|
||||
/* BITMAP_NEEDS_SYNC message should be sent when node
|
||||
* is leaving the cluster with dirty bitmap, also we
|
||||
* can only deliver it when dlm connection is available */
|
||||
if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
|
||||
resync_bitmap(mddev);
|
||||
|
||||
md_unregister_thread(&cinfo->recovery_thread);
|
||||
md_unregister_thread(&cinfo->recv_thread);
|
||||
lockres_free(cinfo->message_lockres);
|
||||
@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev)
|
||||
return cinfo->slot_number - 1;
|
||||
}
|
||||
|
||||
static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
|
||||
add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
|
||||
/* Re-acquire the lock to refresh LVB */
|
||||
dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
|
||||
}
|
||||
|
||||
static int metadata_update_start(struct mddev *mddev)
|
||||
{
|
||||
return lock_comm(mddev->cluster_info);
|
||||
@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
struct cluster_msg cmsg;
|
||||
int ret;
|
||||
struct md_rdev *rdev;
|
||||
int ret = 0;
|
||||
int raid_slot = -1;
|
||||
|
||||
memset(&cmsg, 0, sizeof(cmsg));
|
||||
cmsg.type = cpu_to_le32(METADATA_UPDATED);
|
||||
ret = __sendmsg(cinfo, &cmsg);
|
||||
/* Pick up a good active device number to send.
|
||||
*/
|
||||
rdev_for_each(rdev, mddev)
|
||||
if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
|
||||
raid_slot = rdev->desc_nr;
|
||||
break;
|
||||
}
|
||||
if (raid_slot >= 0) {
|
||||
cmsg.raid_slot = cpu_to_le32(raid_slot);
|
||||
ret = __sendmsg(cinfo, &cmsg);
|
||||
} else
|
||||
pr_warn("md-cluster: No good device id found to send\n");
|
||||
unlock_comm(cinfo);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int metadata_update_cancel(struct mddev *mddev)
|
||||
static void metadata_update_cancel(struct mddev *mddev)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
|
||||
return dlm_unlock_sync(cinfo->token_lockres);
|
||||
unlock_comm(cinfo);
|
||||
}
|
||||
|
||||
static int resync_send(struct mddev *mddev, enum msg_type type,
|
||||
sector_t lo, sector_t hi)
|
||||
static int resync_start(struct mddev *mddev)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
struct cluster_msg cmsg;
|
||||
int slot = cinfo->slot_number - 1;
|
||||
cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
|
||||
return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
|
||||
}
|
||||
|
||||
pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
|
||||
(unsigned long long)lo,
|
||||
(unsigned long long)hi);
|
||||
resync_info_update(mddev, lo, hi);
|
||||
cmsg.type = cpu_to_le32(type);
|
||||
cmsg.slot = cpu_to_le32(slot);
|
||||
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
struct cluster_msg cmsg = {0};
|
||||
|
||||
add_resync_info(cinfo->bitmap_lockres, lo, hi);
|
||||
/* Re-acquire the lock to refresh LVB */
|
||||
dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
|
||||
cmsg.type = cpu_to_le32(RESYNCING);
|
||||
cmsg.low = cpu_to_le64(lo);
|
||||
cmsg.high = cpu_to_le64(hi);
|
||||
|
||||
return sendmsg(cinfo, &cmsg);
|
||||
}
|
||||
|
||||
static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
|
||||
{
|
||||
pr_info("%s:%d\n", __func__, __LINE__);
|
||||
return resync_send(mddev, RESYNCING, lo, hi);
|
||||
}
|
||||
|
||||
static void resync_finish(struct mddev *mddev)
|
||||
static int resync_finish(struct mddev *mddev)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
struct cluster_msg cmsg;
|
||||
int slot = cinfo->slot_number - 1;
|
||||
|
||||
pr_info("%s:%d\n", __func__, __LINE__);
|
||||
resync_send(mddev, RESYNCING, 0, 0);
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
|
||||
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
|
||||
cmsg.slot = cpu_to_le32(slot);
|
||||
sendmsg(cinfo, &cmsg);
|
||||
}
|
||||
cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
|
||||
dlm_unlock_sync(cinfo->resync_lockres);
|
||||
return resync_info_update(mddev, 0, 0);
|
||||
}
|
||||
|
||||
static int area_resyncing(struct mddev *mddev, int direction,
|
||||
@ -896,7 +926,11 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
|
||||
/* add_new_disk() - initiates a disk add
|
||||
* However, if this fails before writing md_update_sb(),
|
||||
* add_new_disk_cancel() must be called to release token lock
|
||||
*/
|
||||
static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
struct cluster_msg cmsg;
|
||||
@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
|
||||
memset(&cmsg, 0, sizeof(cmsg));
|
||||
cmsg.type = cpu_to_le32(NEWDISK);
|
||||
memcpy(cmsg.uuid, uuid, 16);
|
||||
cmsg.raid_slot = rdev->desc_nr;
|
||||
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
|
||||
lock_comm(cinfo);
|
||||
ret = __sendmsg(cinfo, &cmsg);
|
||||
if (ret)
|
||||
@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
|
||||
/* Some node does not "see" the device */
|
||||
if (ret == -EAGAIN)
|
||||
ret = -ENOENT;
|
||||
if (ret)
|
||||
unlock_comm(cinfo);
|
||||
else
|
||||
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int add_new_disk_finish(struct mddev *mddev)
|
||||
static void add_new_disk_cancel(struct mddev *mddev)
|
||||
{
|
||||
struct cluster_msg cmsg;
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
int ret;
|
||||
/* Write sb and inform others */
|
||||
md_update_sb(mddev, 1);
|
||||
cmsg.type = METADATA_UPDATED;
|
||||
ret = __sendmsg(cinfo, &cmsg);
|
||||
unlock_comm(cinfo);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int new_disk_ack(struct mddev *mddev, bool ack)
|
||||
@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
|
||||
|
||||
static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
struct cluster_msg cmsg;
|
||||
struct cluster_msg cmsg = {0};
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
cmsg.type = REMOVE;
|
||||
cmsg.raid_slot = rdev->desc_nr;
|
||||
cmsg.type = cpu_to_le32(REMOVE);
|
||||
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
|
||||
return __sendmsg(cinfo, &cmsg);
|
||||
}
|
||||
|
||||
@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
|
||||
{
|
||||
int sn, err;
|
||||
sector_t lo, hi;
|
||||
struct cluster_msg cmsg;
|
||||
struct cluster_msg cmsg = {0};
|
||||
struct mddev *mddev = rdev->mddev;
|
||||
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||
|
||||
cmsg.type = RE_ADD;
|
||||
cmsg.raid_slot = rdev->desc_nr;
|
||||
cmsg.type = cpu_to_le32(RE_ADD);
|
||||
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
|
||||
err = sendmsg(cinfo, &cmsg);
|
||||
if (err)
|
||||
goto out;
|
||||
@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
|
||||
.join = join,
|
||||
.leave = leave,
|
||||
.slot_number = slot_number,
|
||||
.resync_info_update = resync_info_update,
|
||||
.resync_start = resync_start,
|
||||
.resync_finish = resync_finish,
|
||||
.resync_info_update = resync_info_update,
|
||||
.metadata_update_start = metadata_update_start,
|
||||
.metadata_update_finish = metadata_update_finish,
|
||||
.metadata_update_cancel = metadata_update_cancel,
|
||||
.area_resyncing = area_resyncing,
|
||||
.add_new_disk_start = add_new_disk_start,
|
||||
.add_new_disk_finish = add_new_disk_finish,
|
||||
.add_new_disk = add_new_disk,
|
||||
.add_new_disk_cancel = add_new_disk_cancel,
|
||||
.new_disk_ack = new_disk_ack,
|
||||
.remove_disk = remove_disk,
|
||||
.gather_bitmaps = gather_bitmaps,
|
||||
@ -1022,5 +1051,6 @@ static void cluster_exit(void)
|
||||
|
||||
module_init(cluster_init);
|
||||
module_exit(cluster_exit);
|
||||
MODULE_AUTHOR("SUSE");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Clustering support for MD");
|
||||
|
@ -12,15 +12,15 @@ struct md_cluster_operations {
|
||||
int (*join)(struct mddev *mddev, int nodes);
|
||||
int (*leave)(struct mddev *mddev);
|
||||
int (*slot_number)(struct mddev *mddev);
|
||||
void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
|
||||
int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
|
||||
void (*resync_finish)(struct mddev *mddev);
|
||||
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
|
||||
int (*metadata_update_start)(struct mddev *mddev);
|
||||
int (*metadata_update_finish)(struct mddev *mddev);
|
||||
int (*metadata_update_cancel)(struct mddev *mddev);
|
||||
void (*metadata_update_cancel)(struct mddev *mddev);
|
||||
int (*resync_start)(struct mddev *mddev);
|
||||
int (*resync_finish)(struct mddev *mddev);
|
||||
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
|
||||
int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
|
||||
int (*add_new_disk_finish)(struct mddev *mddev);
|
||||
int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
|
||||
void (*add_new_disk_cancel)(struct mddev *mddev);
|
||||
int (*new_disk_ack)(struct mddev *mddev, bool ack);
|
||||
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
|
||||
int (*gather_bitmaps)(struct md_rdev *rdev);
|
||||
|
drivers/md/md.c (497 lines changed)
@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
++ev1;
|
||||
if (rdev->desc_nr >= 0 &&
|
||||
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
|
||||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
|
||||
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
|
||||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
|
||||
if (ev1 < mddev->events)
|
||||
return -EINVAL;
|
||||
} else if (mddev->bitmap) {
|
||||
@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
int role;
|
||||
if (rdev->desc_nr < 0 ||
|
||||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
|
||||
role = 0xffff;
|
||||
role = MD_DISK_ROLE_SPARE;
|
||||
rdev->desc_nr = -1;
|
||||
} else
|
||||
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
|
||||
switch(role) {
|
||||
case 0xffff: /* spare */
|
||||
case MD_DISK_ROLE_SPARE: /* spare */
|
||||
break;
|
||||
case 0xfffe: /* faulty */
|
||||
case MD_DISK_ROLE_FAULTY: /* faulty */
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
break;
|
||||
case MD_DISK_ROLE_JOURNAL: /* journal device */
|
||||
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
|
||||
/* journal device without journal feature */
|
||||
printk(KERN_WARNING
|
||||
"md: journal device provided without journal feature, ignoring the device\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
set_bit(Journal, &rdev->flags);
|
||||
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
|
||||
if (mddev->recovery_cp == MaxSector)
|
||||
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
|
||||
rdev->raid_disk = mddev->raid_disks;
|
||||
break;
|
||||
default:
|
||||
rdev->saved_raid_disk = role;
|
||||
if ((le32_to_cpu(sb->feature_map) &
|
||||
@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
set_bit(WriteMostly, &rdev->flags);
|
||||
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
|
||||
set_bit(Replacement, &rdev->flags);
|
||||
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
|
||||
set_bit(MD_HAS_JOURNAL, &mddev->flags);
|
||||
} else /* MULTIPATH are always insync */
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
|
||||
@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
|
||||
sb->events = cpu_to_le64(mddev->events);
|
||||
if (mddev->in_sync)
|
||||
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
|
||||
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
|
||||
sb->resync_offset = cpu_to_le64(MaxSector);
|
||||
else
|
||||
sb->resync_offset = cpu_to_le64(0);
|
||||
|
||||
@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
|
||||
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
|
||||
}
|
||||
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
|
||||
!test_bit(In_sync, &rdev->flags)) {
|
||||
sb->feature_map |=
|
||||
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
|
||||
@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
|
||||
sb->feature_map |=
|
||||
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
|
||||
}
|
||||
/* Note: recovery_offset and journal_tail share space */
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
sb->journal_tail = cpu_to_le64(rdev->journal_tail);
|
||||
if (test_bit(Replacement, &rdev->flags))
|
||||
sb->feature_map |=
|
||||
cpu_to_le32(MD_FEATURE_REPLACEMENT);
|
||||
@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
|
||||
}
|
||||
}
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
|
||||
|
||||
if (rdev->badblocks.count == 0)
|
||||
/* Nothing to do for bad blocks*/ ;
|
||||
else if (sb->bblog_offset == 0)
|
||||
@ -1785,18 +1809,23 @@ retry:
|
||||
max_dev = le32_to_cpu(sb->max_dev);
|
||||
|
||||
for (i=0; i<max_dev;i++)
|
||||
sb->dev_roles[i] = cpu_to_le16(0xfffe);
|
||||
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
|
||||
|
||||
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
|
||||
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
|
||||
|
||||
rdev_for_each(rdev2, mddev) {
|
||||
i = rdev2->desc_nr;
|
||||
if (test_bit(Faulty, &rdev2->flags))
|
||||
sb->dev_roles[i] = cpu_to_le16(0xfffe);
|
||||
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
|
||||
else if (test_bit(In_sync, &rdev2->flags))
|
||||
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
|
||||
else if (test_bit(Journal, &rdev2->flags))
|
||||
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
|
||||
else if (rdev2->raid_disk >= 0)
|
||||
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
|
||||
else
|
||||
sb->dev_roles[i] = cpu_to_le16(0xffff);
|
||||
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
|
||||
}
|
||||
|
||||
sb->sb_csum = calc_sb_1_csum(sb);
|
||||
@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
|
||||
struct md_rdev *rdev, *rdev2;
|
||||
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev1)
|
||||
rdev_for_each_rcu(rdev2, mddev2)
|
||||
rdev_for_each_rcu(rdev, mddev1) {
|
||||
if (test_bit(Faulty, &rdev->flags) ||
|
||||
test_bit(Journal, &rdev->flags) ||
|
||||
rdev->raid_disk == -1)
|
||||
continue;
|
||||
rdev_for_each_rcu(rdev2, mddev2) {
|
||||
if (test_bit(Faulty, &rdev2->flags) ||
|
||||
test_bit(Journal, &rdev2->flags) ||
|
||||
rdev2->raid_disk == -1)
|
||||
continue;
|
||||
if (rdev->bdev->bd_contains ==
|
||||
rdev2->bdev->bd_contains) {
|
||||
rcu_read_unlock();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares)
|
||||
}
|
||||
}
|
||||
|
||||
static bool does_sb_need_changing(struct mddev *mddev)
|
||||
{
|
||||
struct md_rdev *rdev;
|
||||
struct mdp_superblock_1 *sb;
|
||||
int role;
|
||||
|
||||
/* Find a good rdev */
|
||||
rdev_for_each(rdev, mddev)
|
||||
if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
|
||||
break;
|
||||
|
||||
/* No good device found. */
|
||||
if (!rdev)
|
||||
return false;
|
||||
|
||||
sb = page_address(rdev->sb_page);
|
||||
/* Check if a device has become faulty or a spare become active */
|
||||
rdev_for_each(rdev, mddev) {
|
||||
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
|
||||
/* Device activated? */
|
||||
if (role == 0xffff && rdev->raid_disk >=0 &&
|
||||
!test_bit(Faulty, &rdev->flags))
|
||||
return true;
|
||||
/* Device turned faulty? */
|
||||
if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Check if any mddev parameters have changed */
|
||||
if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
|
||||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
|
||||
(mddev->layout != le64_to_cpu(sb->layout)) ||
|
||||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
|
||||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void md_update_sb(struct mddev *mddev, int force_change)
|
||||
{
|
||||
struct md_rdev *rdev;
|
||||
int sync_req;
|
||||
int nospares = 0;
|
||||
int any_badblocks_changed = 0;
|
||||
int ret = -1;
|
||||
|
||||
if (mddev->ro) {
|
||||
if (force_change)
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
return;
|
||||
}
|
||||
|
||||
if (mddev_is_clustered(mddev)) {
|
||||
if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
|
||||
force_change = 1;
|
||||
ret = md_cluster_ops->metadata_update_start(mddev);
|
||||
/* Has someone else has updated the sb */
|
||||
if (!does_sb_need_changing(mddev)) {
|
||||
if (ret == 0)
|
||||
md_cluster_ops->metadata_update_cancel(mddev);
|
||||
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
|
||||
return;
|
||||
}
|
||||
}
|
||||
repeat:
|
||||
/* First make sure individual recovery_offsets are correct */
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
mddev->delta_disks >= 0 &&
|
||||
!test_bit(Journal, &rdev->flags) &&
|
||||
!test_bit(In_sync, &rdev->flags) &&
|
||||
mddev->curr_resync_completed > rdev->recovery_offset)
|
||||
rdev->recovery_offset = mddev->curr_resync_completed;
|
||||
@ -2354,6 +2447,9 @@ repeat:
|
||||
clear_bit(BlockedBadBlocks, &rdev->flags);
|
||||
wake_up(&rdev->blocked_wait);
|
||||
}
|
||||
|
||||
if (mddev_is_clustered(mddev) && ret == 0)
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
}
|
||||
EXPORT_SYMBOL(md_update_sb);
|
||||
|
||||
@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page)
|
||||
len += sprintf(page+len, "%sin_sync",sep);
|
||||
sep = ",";
|
||||
}
|
||||
if (test_bit(Journal, &flags)) {
|
||||
len += sprintf(page+len, "%sjournal",sep);
|
||||
sep = ",";
|
||||
}
|
||||
if (test_bit(WriteMostly, &flags)) {
|
||||
len += sprintf(page+len, "%swrite_mostly",sep);
|
||||
sep = ",";
|
||||
@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page)
|
||||
sep = ",";
|
||||
}
|
||||
if (!test_bit(Faulty, &flags) &&
|
||||
!test_bit(Journal, &flags) &&
|
||||
!test_bit(In_sync, &flags)) {
|
||||
len += sprintf(page+len, "%sspare", sep);
|
||||
sep = ",";
|
||||
@ -2488,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
err = -EBUSY;
|
||||
else {
|
||||
struct mddev *mddev = rdev->mddev;
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->remove_disk(mddev, rdev);
|
||||
md_kick_rdev_from_array(rdev);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
if (mddev->pers)
|
||||
md_update_sb(mddev, 1);
|
||||
md_new_event(mddev);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
err = 0;
|
||||
if (mddev_is_clustered(mddev))
|
||||
err = md_cluster_ops->remove_disk(mddev, rdev);
|
||||
|
||||
if (err == 0) {
|
||||
md_kick_rdev_from_array(rdev);
|
||||
if (mddev->pers)
|
||||
md_update_sb(mddev, 1);
|
||||
md_new_event(mddev);
|
||||
}
|
||||
}
|
||||
} else if (cmd_match(buf, "writemostly")) {
|
||||
set_bit(WriteMostly, &rdev->flags);
|
||||
@ -2527,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
err = 0;
|
||||
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
|
||||
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
|
||||
!test_bit(Journal, &rdev->flags)) {
|
||||
if (rdev->mddev->pers == NULL) {
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
rdev->saved_raid_disk = rdev->raid_disk;
|
||||
@ -2546,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
* check if recovery is needed.
|
||||
*/
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
!test_bit(Journal, &rdev->flags) &&
|
||||
!test_bit(Replacement, &rdev->flags))
|
||||
set_bit(WantReplacement, &rdev->flags);
|
||||
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
|
||||
@ -2623,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
|
||||
static ssize_t
|
||||
slot_show(struct md_rdev *rdev, char *page)
|
||||
{
|
||||
if (rdev->raid_disk < 0)
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
return sprintf(page, "journal\n");
|
||||
else if (rdev->raid_disk < 0)
|
||||
return sprintf(page, "none\n");
|
||||
else
|
||||
return sprintf(page, "%d\n", rdev->raid_disk);
|
||||
@ -2635,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
int slot;
|
||||
int err;
|
||||
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
return -EBUSY;
|
||||
if (strncmp(buf, "none", 4)==0)
|
||||
slot = -1;
|
||||
else {
|
||||
@ -2686,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
rdev->saved_raid_disk = -1;
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
clear_bit(Bitmap_sync, &rdev->flags);
|
||||
err = rdev->mddev->pers->
|
||||
hot_add_disk(rdev->mddev, rdev);
|
||||
if (err) {
|
||||
rdev->raid_disk = -1;
|
||||
return err;
|
||||
} else
|
||||
sysfs_notify_dirent_safe(rdev->sysfs_state);
|
||||
if (sysfs_link_rdev(rdev->mddev, rdev))
|
||||
/* failure here is OK */;
|
||||
remove_and_add_spares(rdev->mddev, rdev);
|
||||
if (rdev->raid_disk == -1)
|
||||
return -EBUSY;
|
||||
/* don't wakeup anyone, leave that to userspace. */
|
||||
} else {
|
||||
if (slot >= rdev->mddev->raid_disks &&
|
||||
@ -2839,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
sector_t oldsectors = rdev->sectors;
|
||||
sector_t sectors;
|
||||
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
return -EBUSY;
|
||||
if (strict_blocks_to_sectors(buf, §ors) < 0)
|
||||
return -EINVAL;
|
||||
if (rdev->data_offset != rdev->new_data_offset)
|
||||
@ -3196,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev)
|
||||
md_kick_rdev_from_array(rdev);
|
||||
continue;
|
||||
}
|
||||
/* No device should have a Candidate flag
|
||||
* when reading devices
|
||||
*/
|
||||
if (test_bit(Candidate, &rdev->flags)) {
|
||||
pr_info("md: kicking Cluster Candidate %s from array!\n",
|
||||
bdevname(rdev->bdev, b));
|
||||
md_kick_rdev_from_array(rdev);
|
||||
}
|
||||
}
|
||||
if (mddev->level == LEVEL_MULTIPATH) {
|
||||
rdev->desc_nr = i++;
|
||||
rdev->raid_disk = rdev->desc_nr;
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
|
||||
} else if (rdev->raid_disk >=
|
||||
(mddev->raid_disks - min(0, mddev->delta_disks)) &&
|
||||
!test_bit(Journal, &rdev->flags)) {
|
||||
rdev->raid_disk = -1;
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
}
|
||||
@ -3267,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
|
||||
{
|
||||
unsigned long msec;
|
||||
|
||||
if (mddev_is_clustered(mddev)) {
|
||||
pr_info("md: Safemode is disabled for clustered mode\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
|
||||
return -EINVAL;
|
||||
if (msec == 0)
|
||||
@ -3867,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
break;
|
||||
case clean:
|
||||
if (mddev->pers) {
|
||||
restart_array(mddev);
|
||||
err = restart_array(mddev);
|
||||
if (err)
|
||||
break;
|
||||
spin_lock(&mddev->lock);
|
||||
if (atomic_read(&mddev->writes_pending) == 0) {
|
||||
if (mddev->in_sync == 0) {
|
||||
@ -3885,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
break;
|
||||
case active:
|
||||
if (mddev->pers) {
|
||||
restart_array(mddev);
|
||||
err = restart_array(mddev);
|
||||
if (err)
|
||||
break;
|
||||
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
|
||||
wake_up(&mddev->sb_wait);
|
||||
err = 0;
|
||||
@ -4064,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
if (err)
|
||||
return err;
|
||||
if (mddev->pers) {
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
err = update_size(mddev, sectors);
|
||||
md_update_sb(mddev, 1);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
} else {
|
||||
if (mddev->dev_sectors == 0 ||
|
||||
mddev->dev_sectors > sectors)
|
||||
@ -5181,7 +5282,10 @@ int md_run(struct mddev *mddev)
|
||||
atomic_set(&mddev->max_corr_read_errors,
|
||||
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
|
||||
mddev->safemode = 0;
|
||||
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
|
||||
if (mddev_is_clustered(mddev))
|
||||
mddev->safemode_delay = 0;
|
||||
else
|
||||
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
|
||||
mddev->in_sync = 1;
|
||||
smp_wmb();
|
||||
spin_lock(&mddev->lock);
|
||||
@ -5224,6 +5328,9 @@ static int do_md_run(struct mddev *mddev)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_allow_write(mddev);
|
||||
|
||||
md_wakeup_thread(mddev->thread);
|
||||
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
|
||||
|
||||
@ -5246,6 +5353,25 @@ static int restart_array(struct mddev *mddev)
|
||||
return -EINVAL;
|
||||
if (!mddev->ro)
|
||||
return -EBUSY;
|
||||
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
|
||||
struct md_rdev *rdev;
|
||||
bool has_journal = false;
|
||||
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev) {
|
||||
if (test_bit(Journal, &rdev->flags) &&
|
||||
!test_bit(Faulty, &rdev->flags)) {
|
||||
has_journal = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
/* Don't restart rw with journal missing/faulty */
|
||||
if (!has_journal)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
mddev->safemode = 0;
|
||||
mddev->ro = 0;
|
||||
set_disk_ro(disk, 0);
|
||||
@ -5307,8 +5433,6 @@ static void md_clean(struct mddev *mddev)
|
||||
|
||||
static void __md_stop_writes(struct mddev *mddev)
|
||||
{
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
flush_workqueue(md_misc_wq);
|
||||
if (mddev->sync_thread) {
|
||||
@ -5322,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev)
|
||||
md_super_wait(mddev);
|
||||
|
||||
if (mddev->ro == 0 &&
|
||||
(!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
|
||||
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
|
||||
(mddev->flags & MD_UPDATE_SB_FLAGS))) {
|
||||
/* mark array as shutdown cleanly */
|
||||
mddev->in_sync = 1;
|
||||
if (!mddev_is_clustered(mddev))
|
||||
mddev->in_sync = 1;
|
||||
md_update_sb(mddev, 1);
|
||||
}
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
}
|
||||
|
||||
void md_stop_writes(struct mddev *mddev)
|
||||
@ -5789,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
|
||||
info.state |= (1<<MD_DISK_ACTIVE);
|
||||
info.state |= (1<<MD_DISK_SYNC);
|
||||
}
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
info.state |= (1<<MD_DISK_JOURNAL);
|
||||
if (test_bit(WriteMostly, &rdev->flags))
|
||||
info.state |= (1<<MD_DISK_WRITEMOSTLY);
|
||||
} else {
|
||||
@ -5903,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
|
||||
else
|
||||
clear_bit(WriteMostly, &rdev->flags);
|
||||
|
||||
if (info->state & (1<<MD_DISK_JOURNAL))
|
||||
set_bit(Journal, &rdev->flags);
|
||||
/*
|
||||
* check whether the device shows up in other nodes
|
||||
*/
|
||||
if (mddev_is_clustered(mddev)) {
|
||||
if (info->state & (1 << MD_DISK_CANDIDATE)) {
|
||||
/* Through --cluster-confirm */
|
||||
if (info->state & (1 << MD_DISK_CANDIDATE))
|
||||
set_bit(Candidate, &rdev->flags);
|
||||
err = md_cluster_ops->new_disk_ack(mddev, true);
|
||||
if (err) {
|
||||
export_rdev(rdev);
|
||||
return err;
|
||||
}
|
||||
} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
|
||||
else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
|
||||
/* --add initiated by this node */
|
||||
err = md_cluster_ops->add_new_disk_start(mddev, rdev);
|
||||
err = md_cluster_ops->add_new_disk(mddev, rdev);
|
||||
if (err) {
|
||||
md_cluster_ops->add_new_disk_finish(mddev);
|
||||
export_rdev(rdev);
|
||||
return err;
|
||||
}
|
||||
@ -5928,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
|
||||
|
||||
rdev->raid_disk = -1;
|
||||
err = bind_rdev_to_array(rdev, mddev);
|
||||
|
||||
if (err)
|
||||
export_rdev(rdev);
|
||||
else
|
||||
|
||||
if (mddev_is_clustered(mddev)) {
|
||||
if (info->state & (1 << MD_DISK_CANDIDATE))
|
||||
md_cluster_ops->new_disk_ack(mddev, (err == 0));
|
||||
else {
|
||||
if (err)
|
||||
md_cluster_ops->add_new_disk_cancel(mddev);
|
||||
else
|
||||
err = add_bound_rdev(rdev);
|
||||
}
|
||||
|
||||
} else if (!err)
|
||||
err = add_bound_rdev(rdev);
|
||||
if (mddev_is_clustered(mddev) &&
|
||||
(info->state & (1 << MD_DISK_CLUSTER_ADD)))
|
||||
md_cluster_ops->add_new_disk_finish(mddev);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -5990,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
|
||||
{
|
||||
char b[BDEVNAME_SIZE];
|
||||
struct md_rdev *rdev;
|
||||
int ret = -1;
|
||||
|
||||
rdev = find_rdev(mddev, dev);
|
||||
if (!rdev)
|
||||
return -ENXIO;
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
ret = md_cluster_ops->metadata_update_start(mddev);
|
||||
|
||||
if (rdev->raid_disk < 0)
|
||||
goto kick_rdev;
|
||||
|
||||
clear_bit(Blocked, &rdev->flags);
|
||||
remove_and_add_spares(mddev, rdev);
|
||||
@ -6004,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
|
||||
if (rdev->raid_disk >= 0)
|
||||
goto busy;
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
kick_rdev:
|
||||
if (mddev_is_clustered(mddev) && ret == 0)
|
||||
md_cluster_ops->remove_disk(mddev, rdev);
|
||||
|
||||
md_kick_rdev_from_array(rdev);
|
||||
md_update_sb(mddev, 1);
|
||||
md_new_event(mddev);
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
|
||||
return 0;
|
||||
busy:
|
||||
if (mddev_is_clustered(mddev))
|
||||
if (mddev_is_clustered(mddev) && ret == 0)
|
||||
md_cluster_ops->metadata_update_cancel(mddev);
|
||||
|
||||
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
|
||||
bdevname(rdev->bdev,b), mdname(mddev));
|
||||
return -EBUSY;
|
||||
@ -6068,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
|
||||
goto abort_export;
|
||||
}
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
rdev->desc_nr = -1;
|
||||
rdev->saved_raid_disk = -1;
|
||||
err = bind_rdev_to_array(rdev, mddev);
|
||||
if (err)
|
||||
goto abort_clustered;
|
||||
goto abort_export;
|
||||
|
||||
/*
|
||||
* The rest should better be atomic, we can have disk failures
|
||||
@ -6085,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
|
||||
rdev->raid_disk = -1;
|
||||
|
||||
md_update_sb(mddev, 1);
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
/*
|
||||
* Kick recovery, maybe this spare has to be added to the
|
||||
* array immediately.
|
||||
@ -6097,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
|
||||
md_new_event(mddev);
|
||||
return 0;
|
||||
|
||||
abort_clustered:
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_cancel(mddev);
|
||||
abort_export:
|
||||
export_rdev(rdev);
|
||||
return err;
|
||||
@ -6417,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
|
||||
rv = update_size(mddev, (sector_t)info->size * 2);
|
||||
|
||||
@ -6476,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
|
||||
}
|
||||
}
|
||||
md_update_sb(mddev, 1);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
return rv;
|
||||
err:
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_cancel(mddev);
|
||||
return rv;
|
||||
}
|
||||
|
||||
@ -7282,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
|
||||
bdevname(rdev->bdev,b), rdev->desc_nr);
|
||||
if (test_bit(WriteMostly, &rdev->flags))
|
||||
seq_printf(seq, "(W)");
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
seq_printf(seq, "(J)");
|
||||
if (test_bit(Faulty, &rdev->flags)) {
|
||||
seq_printf(seq, "(F)");
|
||||
continue;
|
||||
@ -7594,11 +7716,7 @@ int md_allow_write(struct mddev *mddev)
|
||||
mddev->safemode == 0)
|
||||
mddev->safemode = 1;
|
||||
spin_unlock(&mddev->lock);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
md_update_sb(mddev, 0);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
} else
|
||||
spin_unlock(&mddev->lock);
|
||||
@ -7630,6 +7748,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
struct md_rdev *rdev;
|
||||
char *desc, *action = NULL;
|
||||
struct blk_plug plug;
|
||||
bool cluster_resync_finished = false;
|
||||
|
||||
/* just incase thread restarts... */
|
||||
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
|
||||
@ -7739,6 +7858,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
!test_bit(Journal, &rdev->flags) &&
|
||||
!test_bit(Faulty, &rdev->flags) &&
|
||||
!test_bit(In_sync, &rdev->flags) &&
|
||||
rdev->recovery_offset < j)
|
||||
@ -7799,9 +7919,6 @@ void md_do_sync(struct md_thread *thread)
|
||||
md_new_event(mddev);
|
||||
update_time = jiffies;
|
||||
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->resync_start(mddev, j, max_sectors);
|
||||
|
||||
blk_start_plug(&plug);
|
||||
while (j < max_sectors) {
|
||||
sector_t sectors;
|
||||
@ -7865,8 +7982,6 @@ void md_do_sync(struct md_thread *thread)
|
||||
j = max_sectors;
|
||||
if (j > 2)
|
||||
mddev->curr_resync = j;
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->resync_info_update(mddev, j, max_sectors);
|
||||
mddev->curr_mark_cnt = io_sectors;
|
||||
if (last_check == 0)
|
||||
/* this is the earliest that rebuild will be
|
||||
@ -7937,7 +8052,11 @@ void md_do_sync(struct md_thread *thread)
|
||||
mddev->curr_resync_completed = mddev->curr_resync;
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
}
|
||||
/* tell personality that we are finished */
|
||||
/* tell personality and other nodes that we are finished */
|
||||
if (mddev_is_clustered(mddev)) {
|
||||
md_cluster_ops->resync_finish(mddev);
|
||||
cluster_resync_finished = true;
|
||||
}
|
||||
mddev->pers->sync_request(mddev, max_sectors, &skipped);
|
||||
|
||||
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
|
||||
@ -7965,6 +8084,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
mddev->delta_disks >= 0 &&
|
||||
!test_bit(Journal, &rdev->flags) &&
|
||||
!test_bit(Faulty, &rdev->flags) &&
|
||||
!test_bit(In_sync, &rdev->flags) &&
|
||||
rdev->recovery_offset < mddev->curr_resync)
|
||||
@ -7973,11 +8093,13 @@ void md_do_sync(struct md_thread *thread)
|
||||
}
|
||||
}
|
||||
skip:
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->resync_finish(mddev);
|
||||
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
|
||||
if (mddev_is_clustered(mddev) &&
|
||||
test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
|
||||
!cluster_resync_finished)
|
||||
md_cluster_ops->resync_finish(mddev);
|
||||
|
||||
spin_lock(&mddev->lock);
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
|
||||
/* We completed so min/max setting can be forgotten if used. */
|
||||
@ -8008,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev,
|
||||
rdev->raid_disk >= 0 &&
|
||||
!test_bit(Blocked, &rdev->flags) &&
|
||||
(test_bit(Faulty, &rdev->flags) ||
|
||||
! test_bit(In_sync, &rdev->flags)) &&
|
||||
(!test_bit(In_sync, &rdev->flags) &&
|
||||
!test_bit(Journal, &rdev->flags))) &&
|
||||
atomic_read(&rdev->nr_pending)==0) {
|
||||
if (mddev->pers->hot_remove_disk(
|
||||
mddev, rdev) == 0) {
|
||||
@ -8020,18 +8143,25 @@ static int remove_and_add_spares(struct mddev *mddev,
|
||||
if (removed && mddev->kobj.sd)
|
||||
sysfs_notify(&mddev->kobj, NULL, "degraded");
|
||||
|
||||
if (this)
|
||||
if (this && removed)
|
||||
goto no_add;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (this && this != rdev)
|
||||
continue;
|
||||
if (test_bit(Candidate, &rdev->flags))
|
||||
continue;
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
!test_bit(In_sync, &rdev->flags) &&
|
||||
!test_bit(Journal, &rdev->flags) &&
|
||||
!test_bit(Faulty, &rdev->flags))
|
||||
spares++;
|
||||
if (rdev->raid_disk >= 0)
|
||||
continue;
|
||||
if (test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
continue;
|
||||
if (mddev->ro &&
|
||||
! (rdev->saved_raid_disk >= 0 &&
|
||||
!test_bit(Bitmap_sync, &rdev->flags)))
|
||||
@ -8056,14 +8186,25 @@ no_add:
|
||||
static void md_start_sync(struct work_struct *ws)
|
||||
{
|
||||
struct mddev *mddev = container_of(ws, struct mddev, del_work);
|
||||
int ret = 0;
|
||||
|
||||
if (mddev_is_clustered(mddev)) {
|
||||
ret = md_cluster_ops->resync_start(mddev);
|
||||
if (ret) {
|
||||
mddev->sync_thread = NULL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
mddev->sync_thread = md_register_thread(md_do_sync,
|
||||
mddev,
|
||||
"resync");
|
||||
out:
|
||||
if (!mddev->sync_thread) {
|
||||
printk(KERN_ERR "%s: could not start resync"
|
||||
" thread...\n",
|
||||
mdname(mddev));
|
||||
if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
|
||||
printk(KERN_ERR "%s: could not start resync"
|
||||
" thread...\n",
|
||||
mdname(mddev));
|
||||
/* leave the spares where they are, it shouldn't hurt */
|
||||
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
|
||||
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
||||
@ -8182,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev)
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
}
|
||||
|
||||
if (mddev->flags & MD_UPDATE_SB_FLAGS) {
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
if (mddev->flags & MD_UPDATE_SB_FLAGS)
|
||||
md_update_sb(mddev, 0);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
}
|
||||
|
||||
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
|
||||
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
|
||||
@ -8286,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev)
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
}
|
||||
}
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_start(mddev);
|
||||
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
|
||||
mddev->pers->finish_reshape)
|
||||
mddev->pers->finish_reshape(mddev);
|
||||
@ -8300,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev)
|
||||
rdev->saved_raid_disk = -1;
|
||||
|
||||
md_update_sb(mddev, 1);
|
||||
if (mddev_is_clustered(mddev))
|
||||
md_cluster_ops->metadata_update_finish(mddev);
|
||||
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
|
||||
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
|
||||
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
|
||||
@ -8924,25 +9056,128 @@ err_wq:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void md_reload_sb(struct mddev *mddev)
|
||||
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
struct md_rdev *rdev, *tmp;
|
||||
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
|
||||
struct md_rdev *rdev2;
|
||||
int role, ret;
|
||||
char b[BDEVNAME_SIZE];
|
||||
|
||||
rdev_for_each_safe(rdev, tmp, mddev) {
|
||||
rdev->sb_loaded = 0;
|
||||
ClearPageUptodate(rdev->sb_page);
|
||||
}
|
||||
mddev->raid_disks = 0;
|
||||
analyze_sbs(mddev);
|
||||
rdev_for_each_safe(rdev, tmp, mddev) {
|
||||
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
|
||||
/* since we don't write to faulty devices, we figure out if the
|
||||
* disk is faulty by comparing events
|
||||
*/
|
||||
if (mddev->events > sb->events)
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
/* Check for change of roles in the active devices */
|
||||
rdev_for_each(rdev2, mddev) {
|
||||
if (test_bit(Faulty, &rdev2->flags))
|
||||
continue;
|
||||
|
||||
/* Check if the roles changed */
|
||||
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
|
||||
|
||||
if (test_bit(Candidate, &rdev2->flags)) {
|
||||
if (role == 0xfffe) {
|
||||
pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
|
||||
md_kick_rdev_from_array(rdev2);
|
||||
continue;
|
||||
}
|
||||
else
|
||||
clear_bit(Candidate, &rdev2->flags);
|
||||
}
|
||||
|
||||
if (role != rdev2->raid_disk) {
|
||||
/* got activated */
|
||||
if (rdev2->raid_disk == -1 && role != 0xffff) {
|
||||
rdev2->saved_raid_disk = role;
|
||||
ret = remove_and_add_spares(mddev, rdev2);
|
||||
pr_info("Activated spare: %s\n",
|
||||
bdevname(rdev2->bdev,b));
|
||||
continue;
|
||||
}
|
||||
/* device faulty
|
||||
* We just want to do the minimum to mark the disk
|
||||
* as faulty. The recovery is performed by the
|
||||
* one who initiated the error.
|
||||
*/
|
||||
if ((role == 0xfffe) || (role == 0xfffd)) {
|
||||
md_error(mddev, rdev2);
|
||||
clear_bit(Blocked, &rdev2->flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
|
||||
update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
|
||||
|
||||
/* Finally set the event to be up to date */
|
||||
mddev->events = le64_to_cpu(sb->events);
|
||||
}
|
||||
|
||||
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
int err;
|
||||
struct page *swapout = rdev->sb_page;
|
||||
struct mdp_superblock_1 *sb;
|
||||
|
||||
/* Store the sb page of the rdev in the swapout temporary
|
||||
* variable in case we err in the future
|
||||
*/
|
||||
rdev->sb_page = NULL;
|
||||
alloc_disk_sb(rdev);
|
||||
ClearPageUptodate(rdev->sb_page);
|
||||
rdev->sb_loaded = 0;
|
||||
err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
|
||||
|
||||
if (err < 0) {
|
||||
pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
|
||||
__func__, __LINE__, rdev->desc_nr, err);
|
||||
put_page(rdev->sb_page);
|
||||
rdev->sb_page = swapout;
|
||||
rdev->sb_loaded = 1;
|
||||
return err;
|
||||
}
|
||||
|
||||
sb = page_address(rdev->sb_page);
|
||||
/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
|
||||
* is not set
|
||||
*/
|
||||
|
||||
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
|
||||
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
|
||||
|
||||
/* The other node finished recovery, call spare_active to set
|
||||
* device In_sync and mddev->degraded
|
||||
*/
|
||||
if (rdev->recovery_offset == MaxSector &&
|
||||
!test_bit(In_sync, &rdev->flags) &&
|
||||
mddev->pers->spare_active(mddev))
|
||||
sysfs_notify(&mddev->kobj, NULL, "degraded");
|
||||
|
||||
put_page(swapout);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void md_reload_sb(struct mddev *mddev, int nr)
|
||||
{
|
||||
struct md_rdev *rdev;
|
||||
int err;
|
||||
|
||||
/* Find the rdev */
|
||||
rdev_for_each_rcu(rdev, mddev) {
|
||||
if (rdev->desc_nr == nr)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!rdev || rdev->desc_nr != nr) {
|
||||
pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
|
||||
return;
|
||||
}
|
||||
|
||||
err = read_rdev(mddev, rdev);
|
||||
if (err < 0)
|
||||
return;
|
||||
|
||||
check_sb_changes(mddev, rdev);
|
||||
|
||||
/* Read all rdev's to update recovery_offset */
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
read_rdev(mddev, rdev);
|
||||
}
|
||||
EXPORT_SYMBOL(md_reload_sb);
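
The reload path above is new with clustered md: when another node updates the metadata, this node re-reads one superblock and reconciles device roles from it instead of re-running analyze_sbs() on everything. A condensed sketch of the call chain as it appears in this hunk (the trigger lives in md-cluster.c, which is not part of this excerpt, so that step is an assumption):

/*
 * Illustrative outline only; names are taken from the hunk above.
 *
 *   (md-cluster.c, assumed)  "metadata updated" message carries a device nr
 *     -> md_reload_sb(mddev, nr)   find the rdev whose desc_nr == nr
 *        -> read_rdev()            re-read just that superblock, restoring
 *                                  the old sb_page if the read fails
 *        -> check_sb_changes()     kick Candidates whose add failed, activate
 *                                  spares, mark devices faulty, pick up
 *                                  raid_disks and the new event count
 *        -> read_rdev() on all     refresh recovery_offset on every member
 *           remaining rdevs
 */
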
@ -87,10 +87,16 @@ struct md_rdev {
* array and could again if we did a partial
* resync from the bitmap
*/
sector_t recovery_offset;/* If this device has been partially
union {
sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
sector_t journal_tail; /* If this device is a journal device,
* this is the journal tail (journal
* recovery start point)
*/
};

atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
@ -172,6 +178,11 @@ enum flag_bits {
* This device is seen locally but not
* by the whole cluster
*/
Journal, /* This device is used as journal for
* raid-5/6.
* Usually, this device should be faster
* than other devices in the array
*/
};

#define BB_LEN_MASK (0x00000000000001FFULL)
@ -221,6 +232,8 @@ struct mddev {
#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
* md_ioctl checked on it.
*/
#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */

int suspended;
atomic_t active_io;
@ -658,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev);

extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
extern void md_reload_sb(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
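
The recovery_offset/journal_tail overlay above is disambiguated by the new Journal rdev flag from the same hunk. A minimal, hedged sketch of how a reader of struct md_rdev is expected to pick the right member; the accessor itself is illustrative and not part of the patch:

/* Illustrative accessor: the union is interpreted via the Journal flag. */
static inline sector_t rdev_resync_or_journal_pos(struct md_rdev *rdev)
{
	if (test_bit(Journal, &rdev->flags))
		return rdev->journal_tail;	/* journal recovery start point */
	return rdev->recovery_offset;		/* normal member: resync position */
}
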
@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)

static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
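
For scale, the new cluster window is sixteen normal resync windows. Assuming raid1.c's usual RESYNC_BLOCK_SIZE of 64 KiB and RESYNC_DEPTH of 32 (both defined just above this hunk but not visible in this excerpt), the numbers work out as:

/* Assumed inputs: RESYNC_BLOCK_SIZE = 64 KiB, RESYNC_DEPTH = 32        */
/* RESYNC_WINDOW                 = 64 KiB * 32   =  2 MiB               */
/* RESYNC_WINDOW_SECTORS         =  2 MiB >> 9   =  4096 sectors        */
/* CLUSTER_RESYNC_WINDOW         = 16 * 2 MiB    = 32 MiB               */
/* CLUSTER_RESYNC_WINDOW_SECTORS = 32 MiB >> 9   = 65536 sectors        */
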
@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
if (rdev->raid_disk >= 0)
|
||||
first = last = rdev->raid_disk;
|
||||
|
||||
/*
|
||||
* find the disk ... but prefer rdev->saved_raid_disk
|
||||
* if possible.
|
||||
*/
|
||||
if (rdev->saved_raid_disk >= 0 &&
|
||||
rdev->saved_raid_disk >= first &&
|
||||
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
|
||||
first = last = rdev->saved_raid_disk;
|
||||
|
||||
for (mirror = first; mirror <= last; mirror++) {
|
||||
p = conf->mirrors+mirror;
|
||||
if (!p->rdev) {
|
||||
@ -2495,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
|
||||
|
||||
bitmap_close_sync(mddev->bitmap);
|
||||
close_sync(conf);
|
||||
|
||||
if (mddev_is_clustered(mddev)) {
|
||||
conf->cluster_sync_low = 0;
|
||||
conf->cluster_sync_high = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2515,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
|
||||
return sync_blocks;
|
||||
}
|
||||
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
|
||||
/* we are incrementing sector_nr below. To be safe, we check against
|
||||
* sector_nr + two times RESYNC_SECTORS
|
||||
*/
|
||||
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
|
||||
mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
|
||||
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
|
||||
|
||||
raise_barrier(conf, sector_nr);
|
||||
@ -2706,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
|
||||
bio_full:
|
||||
r1_bio->sectors = nr_sectors;
|
||||
|
||||
if (mddev_is_clustered(mddev) &&
|
||||
conf->cluster_sync_high < sector_nr + nr_sectors) {
|
||||
conf->cluster_sync_low = mddev->curr_resync_completed;
|
||||
conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
|
||||
/* Send resync message */
|
||||
md_cluster_ops->resync_info_update(mddev,
|
||||
conf->cluster_sync_low,
|
||||
conf->cluster_sync_high);
|
||||
}
|
||||
|
||||
/* For a user-requested sync, we read all readable devices and do a
|
||||
* compare
|
||||
*/
|
||||
@ -3020,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = md_allow_write(mddev);
|
||||
if (err)
|
||||
return err;
|
||||
if (!mddev_is_clustered(mddev)) {
|
||||
err = md_allow_write(mddev);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
raid_disks = mddev->raid_disks + mddev->delta_disks;
|
||||
|
||||
|
@ -111,6 +111,13 @@ struct r1conf {
|
||||
* the new thread here until we fully activate the array.
|
||||
*/
|
||||
struct md_thread *thread;
|
||||
|
||||
/* Keep track of cluster resync window to send to other
|
||||
* nodes.
|
||||
*/
|
||||
sector_t cluster_sync_low;
|
||||
sector_t cluster_sync_high;
|
||||
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -3149,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
/* resync. Schedule a read for every block at this virt offset */
|
||||
int count = 0;
|
||||
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
|
||||
|
||||
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
|
||||
&sync_blocks, mddev->degraded) &&
|
||||
|
drivers/md/raid5-cache.c: new file, 1191 lines (diff suppressed because it is too large)
@ -353,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
|
||||
struct list_head *list = &temp_inactive_list[size - 1];
|
||||
|
||||
/*
|
||||
* We don't hold any lock here yet, get_active_stripe() might
|
||||
* We don't hold any lock here yet, raid5_get_active_stripe() might
|
||||
* remove stripes from the list
|
||||
*/
|
||||
if (!list_empty_careful(list)) {
|
||||
@ -413,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
|
||||
return count;
|
||||
}
|
||||
|
||||
static void release_stripe(struct stripe_head *sh)
|
||||
void raid5_release_stripe(struct stripe_head *sh)
|
||||
{
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
unsigned long flags;
|
||||
@ -658,9 +658,9 @@ static int has_failed(struct r5conf *conf)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct stripe_head *
|
||||
get_active_stripe(struct r5conf *conf, sector_t sector,
|
||||
int previous, int noblock, int noquiesce)
|
||||
struct stripe_head *
|
||||
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
|
||||
int previous, int noblock, int noquiesce)
|
||||
{
|
||||
struct stripe_head *sh;
|
||||
int hash = stripe_hash_locks_hash(sector);
|
||||
@ -755,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
|
||||
/* Only freshly new full stripe normal write stripe can be added to a batch list */
|
||||
static bool stripe_can_batch(struct stripe_head *sh)
|
||||
{
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
|
||||
if (conf->log)
|
||||
return false;
|
||||
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
|
||||
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
|
||||
is_full_stripe_write(sh);
|
||||
@ -858,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
|
||||
unlock_out:
|
||||
unlock_two_stripes(head, sh);
|
||||
out:
|
||||
release_stripe(head);
|
||||
raid5_release_stripe(head);
|
||||
}
|
||||
|
||||
/* Determine if 'data_offset' or 'new_data_offset' should be used
|
||||
@ -895,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
|
||||
might_sleep();
|
||||
|
||||
if (r5l_write_stripe(conf->log, sh) == 0)
|
||||
return;
|
||||
for (i = disks; i--; ) {
|
||||
int rw;
|
||||
int replace_only = 0;
|
||||
@ -1208,7 +1214,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
|
||||
return_io(&return_bi);
|
||||
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
|
||||
static void ops_run_biofill(struct stripe_head *sh)
|
||||
@ -1271,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref)
|
||||
if (sh->check_state == check_state_compute_run)
|
||||
sh->check_state = check_state_compute_result;
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
|
||||
/* return a pointer to the address conversion region of the scribble buffer */
|
||||
@ -1697,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
|
||||
}
|
||||
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -1855,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref)
|
||||
|
||||
sh->check_state = check_state_check_result;
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
|
||||
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
@ -2017,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
|
||||
/* we just created an active stripe so... */
|
||||
atomic_inc(&conf->active_stripes);
|
||||
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
conf->max_nr_stripes++;
|
||||
return 1;
|
||||
}
|
||||
@ -2236,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
|
||||
if (!p)
|
||||
err = -ENOMEM;
|
||||
}
|
||||
release_stripe(nsh);
|
||||
raid5_release_stripe(nsh);
|
||||
}
|
||||
/* critical section pass, GFP_NOIO no longer needed */
|
||||
|
||||
@ -2394,7 +2400,7 @@ static void raid5_end_read_request(struct bio * bi)
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
clear_bit(R5_LOCKED, &sh->dev[i].flags);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
|
||||
static void raid5_end_write_request(struct bio *bi)
|
||||
@ -2468,14 +2474,12 @@ static void raid5_end_write_request(struct bio *bi)
|
||||
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
|
||||
clear_bit(R5_LOCKED, &sh->dev[i].flags);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
|
||||
if (sh->batch_head && sh != sh->batch_head)
|
||||
release_stripe(sh->batch_head);
|
||||
raid5_release_stripe(sh->batch_head);
|
||||
}
|
||||
|
||||
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
|
||||
|
||||
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
|
||||
{
|
||||
struct r5dev *dev = &sh->dev[i];
|
||||
@ -2491,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
|
||||
dev->rreq.bi_private = sh;
|
||||
|
||||
dev->flags = 0;
|
||||
dev->sector = compute_blocknr(sh, i, previous);
|
||||
dev->sector = raid5_compute_blocknr(sh, i, previous);
|
||||
}
|
||||
|
||||
static void error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
@ -2524,9 +2528,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
* Input: a 'big' sector number,
|
||||
* Output: index of the data and parity disk, and the sector # in them.
|
||||
*/
|
||||
static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
|
||||
int previous, int *dd_idx,
|
||||
struct stripe_head *sh)
|
||||
sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
|
||||
int previous, int *dd_idx,
|
||||
struct stripe_head *sh)
|
||||
{
|
||||
sector_t stripe, stripe2;
|
||||
sector_t chunk_number;
|
||||
@ -2726,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
|
||||
return new_sector;
|
||||
}
|
||||
|
||||
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
|
||||
sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
|
||||
{
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
int raid_disks = sh->disks;
|
||||
@ -3098,6 +3102,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
if (bi)
|
||||
bitmap_end = 1;
|
||||
|
||||
r5l_stripe_write_finished(sh);
|
||||
|
||||
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
|
||||
@ -3141,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
* the data has not reached the cache yet.
|
||||
*/
|
||||
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
|
||||
s->failed > conf->max_degraded &&
|
||||
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
|
||||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
|
||||
spin_lock_irq(&sh->stripe_lock);
|
||||
@ -3497,6 +3504,9 @@ returnbi:
|
||||
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
|
||||
WARN_ON(dev->page != dev->orig_page);
|
||||
}
|
||||
|
||||
r5l_stripe_write_finished(sh);
|
||||
|
||||
if (!discard_pending &&
|
||||
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
|
||||
int hash;
|
||||
@ -3939,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
|
||||
struct stripe_head *sh2;
|
||||
struct async_submit_ctl submit;
|
||||
|
||||
sector_t bn = compute_blocknr(sh, i, 1);
|
||||
sector_t bn = raid5_compute_blocknr(sh, i, 1);
|
||||
sector_t s = raid5_compute_sector(conf, bn, 0,
|
||||
&dd_idx, NULL);
|
||||
sh2 = get_active_stripe(conf, s, 0, 1, 1);
|
||||
sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
|
||||
if (sh2 == NULL)
|
||||
/* so far only the early blocks of this stripe
|
||||
* have been requested. When later blocks
|
||||
@ -3952,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
|
||||
if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
|
||||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
|
||||
/* must have already done this block */
|
||||
release_stripe(sh2);
|
||||
raid5_release_stripe(sh2);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3973,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
|
||||
set_bit(STRIPE_EXPAND_READY, &sh2->state);
|
||||
set_bit(STRIPE_HANDLE, &sh2->state);
|
||||
}
|
||||
release_stripe(sh2);
|
||||
raid5_release_stripe(sh2);
|
||||
|
||||
}
|
||||
/* done submitting copies, wait for them to complete */
|
||||
@ -4008,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
|
||||
s->failed_num[0] = -1;
|
||||
s->failed_num[1] = -1;
|
||||
s->log_failed = r5l_log_disk_error(conf);
|
||||
|
||||
/* Now to look around and see what can be done */
|
||||
rcu_read_lock();
|
||||
@ -4259,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
|
||||
if (handle_flags == 0 ||
|
||||
sh->state & handle_flags)
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
spin_lock_irq(&head_sh->stripe_lock);
|
||||
head_sh->batch_head = NULL;
|
||||
@ -4320,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
|
||||
analyse_stripe(sh, &s);
|
||||
|
||||
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
|
||||
goto finish;
|
||||
|
||||
if (s.handle_bad_blocks) {
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
goto finish;
|
||||
@ -4348,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
/* check if the array has lost more than max_degraded devices and,
|
||||
* if so, some requests might need to be failed.
|
||||
*/
|
||||
if (s.failed > conf->max_degraded) {
|
||||
if (s.failed > conf->max_degraded || s.log_failed) {
|
||||
sh->check_state = 0;
|
||||
sh->reconstruct_state = 0;
|
||||
break_stripe_batch_list(sh, 0);
|
||||
@ -4506,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
/* Finish reconstruct operations initiated by the expansion process */
|
||||
if (sh->reconstruct_state == reconstruct_state_result) {
|
||||
struct stripe_head *sh_src
|
||||
= get_active_stripe(conf, sh->sector, 1, 1, 1);
|
||||
= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
|
||||
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
|
||||
/* sh cannot be written until sh_src has been read.
|
||||
* so arrange for sh to be delayed a little
|
||||
@ -4516,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
|
||||
&sh_src->state))
|
||||
atomic_inc(&conf->preread_active_stripes);
|
||||
release_stripe(sh_src);
|
||||
raid5_release_stripe(sh_src);
|
||||
goto finish;
|
||||
}
|
||||
if (sh_src)
|
||||
release_stripe(sh_src);
|
||||
raid5_release_stripe(sh_src);
|
||||
|
||||
sh->reconstruct_state = reconstruct_state_idle;
|
||||
clear_bit(STRIPE_EXPANDING, &sh->state);
|
||||
@ -5012,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev,
|
||||
struct raid5_plug_cb *cb;
|
||||
|
||||
if (!blk_cb) {
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -5028,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev,
|
||||
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
|
||||
list_add_tail(&sh->lru, &cb->list);
|
||||
else
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
|
||||
static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
@ -5063,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
DEFINE_WAIT(w);
|
||||
int d;
|
||||
again:
|
||||
sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
|
||||
sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
|
||||
prepare_to_wait(&conf->wait_for_overlap, &w,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
|
||||
if (test_bit(STRIPE_SYNCING, &sh->state)) {
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
schedule();
|
||||
goto again;
|
||||
}
|
||||
@ -5080,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
if (sh->dev[d].towrite || sh->dev[d].toread) {
|
||||
set_bit(R5_Overlap, &sh->dev[d].flags);
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
schedule();
|
||||
goto again;
|
||||
}
|
||||
@ -5136,8 +5150,15 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare;

if (unlikely(bi->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bi);
return;
int ret = r5l_handle_flush_request(conf->log, bi);

if (ret == 0)
return;
if (ret == -ENODEV) {
md_flush_request(mddev, bi);
return;
}
/* ret == -EAGAIN, fallback */
}

md_write_start(mddev, bi);
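
The hunk above routes REQ_FLUSH requests through the journal first. A small caller-side sketch of the contract implied by the three return values; the wrapper function is illustrative only, while r5l_handle_flush_request() and md_flush_request() come from the patch:

/* Illustrative only: how make_request() treats r5l_handle_flush_request(). */
static bool raid5_flush_handled(struct r5conf *conf, struct bio *bi)
{
	int ret = r5l_handle_flush_request(conf->log, bi);

	if (ret == 0)		/* journal owns the flush; bio completes later */
		return true;
	if (ret == -ENODEV) {	/* no journal configured: use the old md path */
		md_flush_request(conf->mddev, bi);
		return true;
	}
	return false;		/* -EAGAIN: fall through to the normal write path */
}
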
@ -5210,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
|
||||
(unsigned long long)new_sector,
|
||||
(unsigned long long)logical_sector);
|
||||
|
||||
sh = get_active_stripe(conf, new_sector, previous,
|
||||
sh = raid5_get_active_stripe(conf, new_sector, previous,
|
||||
(bi->bi_rw&RWA_MASK), 0);
|
||||
if (sh) {
|
||||
if (unlikely(previous)) {
|
||||
@ -5231,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
|
||||
must_retry = 1;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
if (must_retry) {
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
schedule();
|
||||
do_prepare = true;
|
||||
goto retry;
|
||||
@ -5241,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi)
|
||||
/* Might have got the wrong stripe_head
|
||||
* by accident
|
||||
*/
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (rw == WRITE &&
|
||||
logical_sector >= mddev->suspend_lo &&
|
||||
logical_sector < mddev->suspend_hi) {
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
/* As the suspend_* range is controlled by
|
||||
* userspace, we want an interruptible
|
||||
* wait.
|
||||
@ -5271,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
|
||||
* and wait a while
|
||||
*/
|
||||
md_wakeup_thread(mddev->thread);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
schedule();
|
||||
do_prepare = true;
|
||||
goto retry;
|
||||
@ -5458,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
|
||||
int j;
|
||||
int skipped_disk = 0;
|
||||
sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
|
||||
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
|
||||
set_bit(STRIPE_EXPANDING, &sh->state);
|
||||
atomic_inc(&conf->reshape_stripes);
|
||||
/* If any of this stripe is beyond the end of the old
|
||||
@ -5471,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
if (conf->level == 6 &&
|
||||
j == sh->qd_idx)
|
||||
continue;
|
||||
s = compute_blocknr(sh, j, 0);
|
||||
s = raid5_compute_blocknr(sh, j, 0);
|
||||
if (s < raid5_size(mddev, 0, 0)) {
|
||||
skipped_disk = 1;
|
||||
continue;
|
||||
@ -5507,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
if (last_sector >= mddev->dev_sectors)
|
||||
last_sector = mddev->dev_sectors - 1;
|
||||
while (first_sector <= last_sector) {
|
||||
sh = get_active_stripe(conf, first_sector, 1, 0, 1);
|
||||
sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
|
||||
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
first_sector += STRIPE_SECTORS;
|
||||
}
|
||||
/* Now that the sources are clearly marked, we can release
|
||||
@ -5519,7 +5540,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
while (!list_empty(&stripes)) {
|
||||
sh = list_entry(stripes.next, struct stripe_head, lru);
|
||||
list_del_init(&sh->lru);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
}
|
||||
/* If this takes us to the resync_max point where we have to pause,
|
||||
* then we need to write out the superblock.
|
||||
@ -5615,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
|
||||
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
|
||||
}
|
||||
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
|
||||
|
||||
sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
|
||||
sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
|
||||
if (sh == NULL) {
|
||||
sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
|
||||
sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
|
||||
/* make sure we don't swamp the stripe cache if someone else
|
||||
* is trying to get access
|
||||
*/
|
||||
@ -5643,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
|
||||
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
|
||||
return STRIPE_SECTORS;
|
||||
}
|
||||
@ -5682,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
|
||||
/* already done this stripe */
|
||||
continue;
|
||||
|
||||
sh = get_active_stripe(conf, sector, 0, 1, 1);
|
||||
sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
|
||||
|
||||
if (!sh) {
|
||||
/* failed to get a stripe - must wait */
|
||||
@ -5692,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
|
||||
}
|
||||
|
||||
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
raid5_set_bi_processed_stripes(raid_bio, scnt);
|
||||
conf->retry_read_aligned = raid_bio;
|
||||
return handled;
|
||||
@ -5700,7 +5721,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
|
||||
|
||||
set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
|
||||
handle_stripe(sh);
|
||||
release_stripe(sh);
|
||||
raid5_release_stripe(sh);
|
||||
handled++;
|
||||
}
|
||||
remaining = raid5_dec_bi_active_stripes(raid_bio);
|
||||
@ -5730,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group,
|
||||
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
|
||||
if (!list_empty(temp_inactive_list + i))
|
||||
break;
|
||||
if (i == NR_STRIPE_HASH_LOCKS)
|
||||
if (i == NR_STRIPE_HASH_LOCKS) {
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
r5l_flush_stripe_to_raid(conf->log);
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
return batch_size;
|
||||
}
|
||||
release_inactive = true;
|
||||
}
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
@ -5739,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
|
||||
release_inactive_stripe_list(conf, temp_inactive_list,
|
||||
NR_STRIPE_HASH_LOCKS);
|
||||
|
||||
r5l_flush_stripe_to_raid(conf->log);
|
||||
if (release_inactive) {
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
return 0;
|
||||
@ -5746,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
|
||||
|
||||
for (i = 0; i < batch_size; i++)
|
||||
handle_stripe(batch[i]);
|
||||
r5l_write_stripe_run(conf->log);
|
||||
|
||||
cond_resched();
|
||||
|
||||
@ -5879,6 +5906,8 @@ static void raid5d(struct md_thread *thread)
|
||||
mutex_unlock(&conf->cache_size_mutex);
|
||||
}
|
||||
|
||||
r5l_flush_stripe_to_raid(conf->log);
|
||||
|
||||
async_tx_issue_pending_all();
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
@ -6316,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf)
|
||||
|
||||
static void free_conf(struct r5conf *conf)
|
||||
{
|
||||
if (conf->log)
|
||||
r5l_exit_log(conf->log);
|
||||
if (conf->shrinker.seeks)
|
||||
unregister_shrinker(&conf->shrinker);
|
||||
|
||||
free_thread_groups(conf);
|
||||
shrink_stripes(conf);
|
||||
raid5_free_percpu(conf);
|
||||
@ -6530,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
rdev_for_each(rdev, mddev) {
|
||||
raid_disk = rdev->raid_disk;
|
||||
if (raid_disk >= max_disks
|
||||
|| raid_disk < 0)
|
||||
|| raid_disk < 0 || test_bit(Journal, &rdev->flags))
|
||||
continue;
|
||||
disk = conf->disks + raid_disk;
|
||||
|
||||
@ -6650,6 +6682,7 @@ static int run(struct mddev *mddev)
|
||||
int working_disks = 0;
|
||||
int dirty_parity_disks = 0;
|
||||
struct md_rdev *rdev;
|
||||
struct md_rdev *journal_dev = NULL;
|
||||
sector_t reshape_offset = 0;
|
||||
int i;
|
||||
long long min_offset_diff = 0;
|
||||
@ -6662,6 +6695,11 @@ static int run(struct mddev *mddev)
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
long long diff;
|
||||
|
||||
if (test_bit(Journal, &rdev->flags)) {
|
||||
journal_dev = rdev;
|
||||
continue;
|
||||
}
|
||||
if (rdev->raid_disk < 0)
|
||||
continue;
|
||||
diff = (rdev->new_data_offset - rdev->data_offset);
|
||||
@ -6695,6 +6733,12 @@ static int run(struct mddev *mddev)
|
||||
int chunk_sectors;
|
||||
int new_data_disks;
|
||||
|
||||
if (journal_dev) {
|
||||
printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
|
||||
mdname(mddev));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (mddev->new_level != mddev->level) {
|
||||
printk(KERN_ERR "md/raid:%s: unsupported reshape "
|
||||
"required - aborting.\n",
|
||||
@ -6770,6 +6814,13 @@ static int run(struct mddev *mddev)
|
||||
if (IS_ERR(conf))
|
||||
return PTR_ERR(conf);
|
||||
|
||||
if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
|
||||
printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
|
||||
mdname(mddev));
|
||||
mddev->ro = 1;
|
||||
set_disk_ro(mddev->gendisk, 1);
|
||||
}
|
||||
|
||||
conf->min_offset_diff = min_offset_diff;
|
||||
mddev->thread = conf->thread;
|
||||
conf->thread = NULL;
|
||||
@ -6973,6 +7024,14 @@ static int run(struct mddev *mddev)
|
||||
mddev->queue);
|
||||
}
|
||||
|
||||
if (journal_dev) {
|
||||
char b[BDEVNAME_SIZE];
|
||||
|
||||
printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
|
||||
mdname(mddev), bdevname(journal_dev->bdev, b));
|
||||
r5l_init_log(conf, journal_dev);
|
||||
}
|
||||
|
||||
return 0;
|
||||
abort:
|
||||
md_unregister_thread(&mddev->thread);
|
||||
@ -7082,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
struct disk_info *p = conf->disks + number;
|
||||
|
||||
print_raid5_conf(conf);
|
||||
if (test_bit(Journal, &rdev->flags)) {
|
||||
/*
|
||||
* journal disk is not removable, but we need to give a chance to
|
||||
* update superblock of other disks. Otherwise journal disk
|
||||
* will be considered as 'fresh'
|
||||
*/
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
return -EINVAL;
|
||||
}
|
||||
if (rdev == p->rdev)
|
||||
rdevp = &p->rdev;
|
||||
else if (rdev == p->replacement)
|
||||
@ -7144,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
int first = 0;
|
||||
int last = conf->raid_disks - 1;
|
||||
|
||||
if (test_bit(Journal, &rdev->flags))
|
||||
return -EINVAL;
|
||||
if (mddev->recovery_disabled == conf->recovery_disabled)
|
||||
return -EBUSY;
|
||||
|
||||
@ -7205,6 +7275,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
|
||||
sector_t newsize;
|
||||
struct r5conf *conf = mddev->private;
|
||||
|
||||
if (conf->log)
|
||||
return -EINVAL;
|
||||
sectors &= ~((sector_t)conf->chunk_sectors - 1);
|
||||
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
|
||||
if (mddev->external_size &&
|
||||
@ -7256,6 +7328,8 @@ static int check_reshape(struct mddev *mddev)
|
||||
{
|
||||
struct r5conf *conf = mddev->private;
|
||||
|
||||
if (conf->log)
|
||||
return -EINVAL;
|
||||
if (mddev->delta_disks == 0 &&
|
||||
mddev->new_layout == mddev->layout &&
|
||||
mddev->new_chunk_sectors == mddev->chunk_sectors)
|
||||
@ -7532,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
|
||||
unlock_all_device_hash_locks_irq(conf);
|
||||
break;
|
||||
}
|
||||
r5l_quiesce(conf->log, state);
|
||||
}
|
||||
|
||||
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
|
||||
|
@ -223,6 +223,9 @@ struct stripe_head {
struct stripe_head *batch_head; /* protected by stripe lock */
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/

struct r5l_io_unit *log_io;
struct list_head log_list;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@ -244,6 +247,7 @@ struct stripe_head {
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
u32 log_checksum;
} dev[1]; /* allocated with extra space depending of RAID geometry */
};

@ -268,6 +272,7 @@ struct stripe_head_state {
struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
int log_failed;
};

/* Flags for struct r5dev.flags */
@ -340,6 +345,7 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet.
*/
STRIPE_LOG_TRAPPED, /* trapped into log */
};

#define STRIPE_EXPAND_SYNC_FLAGS \
@ -543,6 +549,7 @@ struct r5conf {
struct r5worker_group *worker_groups;
int group_cnt;
int worker_cnt_per_group;
struct r5l_log *log;
};

@ -609,4 +616,21 @@ static inline int algorithm_is_DDF(int layout)

extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
extern void raid5_release_stripe(struct stripe_head *sh);
extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx,
struct stripe_head *sh);
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state);
extern bool r5l_log_disk_error(struct r5conf *conf);
#endif
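
Taken together, the r5l_* hooks declared above make up the first-stage journal write path. The outline below is pieced together from the call sites visible earlier in this diff; raid5-cache.c itself is suppressed above, so the internal ordering is an assumption rather than a quote:

/*
 * Rough ordering of a journalled stripe write (illustrative, not verbatim):
 *
 *  1. ops_run_io(): r5l_write_stripe() queues the stripe's data and parity
 *     to the log instead of the member disks; a return of 0 means the log
 *     now owns the stripe.
 *  2. handle_active_stripes()/raid5d(): r5l_write_stripe_run() submits the
 *     accumulated log I/O, and r5l_flush_stripe_to_raid() later writes
 *     logged stripes to the RAID disks once the log I/O is stable.
 *  3. handle_stripe()/handle_failed_stripe(): r5l_stripe_write_finished()
 *     releases the stripe's log_io so the log space can be reclaimed.
 *  4. r5l_quiesce() and r5l_log_disk_error() hook array quiesce and
 *     journal-failure handling into the existing raid5 paths.
 */
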
@ -89,6 +89,12 @@
* read requests will only be sent here in
* dire need
*/
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */

#define MD_DISK_ROLE_SPARE 0xffff
#define MD_DISK_ROLE_FAULTY 0xfffe
#define MD_DISK_ROLE_JOURNAL 0xfffd
#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
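
These three reserved values are the same literals (0xffff, 0xfffe, 0xfffd) tested against sb->dev_roles[] in check_sb_changes() earlier in this diff. A small hedged helper showing how a superblock reader might classify a role; only the constants come from this header, the function is illustrative:

/* Illustrative decoder for one sb->dev_roles[] entry (CPU byte order). */
static const char *md_role_name(unsigned int role)
{
	switch (role) {
	case MD_DISK_ROLE_SPARE:	/* 0xffff */
		return "spare";
	case MD_DISK_ROLE_FAULTY:	/* 0xfffe */
		return "faulty";
	case MD_DISK_ROLE_JOURNAL:	/* 0xfffd */
		return "journal";
	default:
		/* anything up to MD_DISK_ROLE_MAX is a regular slot number */
		return role <= MD_DISK_ROLE_MAX ? "in-array slot" : "reserved";
	}
}
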
typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */
@ -252,7 +258,10 @@ struct mdp_superblock_1 {
__le64 data_offset; /* sector start of data, often 0 */
__le64 data_size; /* sectors in this device that can be used for data */
__le64 super_offset; /* sector start of this superblock */
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
union {
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
__le64 journal_tail;/* journal tail of journal device (from data_offset) */
};
__le32 dev_number; /* permanent identifier of this device - not role in raid */
__le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
@ -302,6 +311,8 @@ struct mdp_superblock_1 {
#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
* is guided by bitmap.
*/
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@ -310,6 +321,66 @@ struct mdp_superblock_1 {
|MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
)

struct r5l_payload_header {
__le16 type;
__le16 flags;
} __attribute__ ((__packed__));

enum r5l_payload_type {
R5LOG_PAYLOAD_DATA = 0,
R5LOG_PAYLOAD_PARITY = 1,
R5LOG_PAYLOAD_FLUSH = 2,
};

struct r5l_payload_data_parity {
struct r5l_payload_header header;
__le32 size; /* sector. data/parity size. each 4k
* has a checksum */
__le64 location; /* sector. For data, it's raid sector. For
* parity, it's stripe sector */
__le32 checksum[];
} __attribute__ ((__packed__));

enum r5l_payload_data_parity_flag {
R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
/*
* RESHAPED/RESHAPING is only set when there is reshape activity. Note,
* both data/parity of a stripe should have the same flag set
*
* RESHAPED: reshape is running, and this stripe finished reshape
* RESHAPING: reshape is running, and this stripe isn't reshaped
*/
R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
};

struct r5l_payload_flush {
struct r5l_payload_header header;
__le32 size; /* flush_stripes size, bytes */
__le64 flush_stripes[];
} __attribute__ ((__packed__));

enum r5l_payload_flush_flag {
R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
};

struct r5l_meta_block {
__le32 magic;
__le32 checksum;
__u8 version;
__u8 __zero_pading_1;
__le16 __zero_pading_2;
__le32 meta_size; /* whole size of the block */

__le64 seq;
__le64 position; /* sector, start from rdev->data_offset, current position */
struct r5l_payload_header payloads[];
} __attribute__ ((__packed__));

#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
#endif
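
The structures above define the on-disk log format: a meta block followed by data/parity (and, in the future, flush) payloads. A hedged sketch of walking one meta block that has already been read into memory; checksum verification and the exact payload accounting live in raid5-cache.c, which is suppressed in this diff, so those details are assumptions:

/* Illustrative walk of one r5l meta block sitting in 'buf' (length
 * mb->meta_size bytes). Checksum validation is intentionally omitted.
 */
static int r5l_walk_meta_block_sketch(const void *buf)
{
	const struct r5l_meta_block *mb = buf;
	const u8 *p = (const u8 *)mb->payloads;
	const u8 *end = (const u8 *)buf + le32_to_cpu(mb->meta_size);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || mb->version != R5LOG_VERSION)
		return -EINVAL;

	while (p < end) {
		const struct r5l_payload_header *ph =
			(const struct r5l_payload_header *)p;
		const struct r5l_payload_data_parity *pl =
			(const struct r5l_payload_data_parity *)p;

		switch (le16_to_cpu(ph->type)) {
		case R5LOG_PAYLOAD_DATA:
		case R5LOG_PAYLOAD_PARITY:
			/* 'size' is in sectors; one __le32 checksum per 4 KiB */
			p += sizeof(*pl) +
			     (le32_to_cpu(pl->size) >> 3) * sizeof(__le32);
			break;
		default:
			/* R5LOG_PAYLOAD_FLUSH handling is omitted here */
			return -EINVAL;
		}
	}
	return 0;
}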