md updates for 4.4.

Two major components to this update.
 
 1/ the clustered-raid1 support from SUSE is nearly
   complete.  There are a few outstanding issues being
   worked on.  Maybe half a dozen patches will bring
   this to a usable state.
 
 2/ The first stage of journalled-raid5 support from
    Facebook makes an appearance.  With a journal
    device configured (typically NVRAM or SSD), the
    "RAID5 write hole" should be closed - a crash
    during degraded operations cannot result in data
    corruption.
 
    The next stage will be to use the journal as a
    write-behind cache so that latency can be reduced
    and in some cases throughput increased by
    performing more full-stripe writes.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2
 
 iQIcBAABCAAGBQJWNX9RAAoJEDnsnt1WYoG5bYMP/jI0pV3wcbs7mZQAa8S/V0lU
 2l25x4MdwDvqVKMfjIc/C5J08QNgcrgSvhiVPCEOK0w18q395vep9f6gFKbMHhu/
 lWU3PLHGw8XBHp5yEnxrpQkN0pRrNjh5NqIdlVMBNyL6u+RZPS2ZuzxJ8wiNAFg1
 MypNkgoUu6s+nBp4DWWnMGYhBc+szBR+gTYAzGiZ8vqOH9uiSJ2SsGG5aRVUN/af
 oMYvJAf9aA6uj+xSzNlXIaLfWJIrshQYS1jU/W4gTm0DwK9yqbTxvubJaE0SGu/o
 73FGU8tmQ6ELYfsp3D/jmfUkE7weiNEQhdVb/4wy1A/SGc+W7Ju9pxfhm8ra57s0
 /BCkfwWZXEvx1flegXfK1mC6EMpMIcGAD2FQEhmQbW6wTdDwtNyEhIePDVGJwD/F
 rhEThFa+Dg9+xnBGnS6OUK3EpXgml2hAeAC7uA3TVSAnWd/9/Mpim6fZhqrB/v9L
 Ik0tZt+H4nxYaheZjKlKhuXUQYcUWGiMb67bGMem/YAlMa4y9C9qF+9mPXxyjVlI
 hBsd5SfZNz99DyB/bO8BumQeIWlTfzLeFzWW67eQ864LRKO6k0/VIbPZHCfn2oVG
 XvyC2fUhNOIURP3IMxcyHYxOA7Mu6EDsVVDTpuqLVbZQ5IPjDEfQ54yB/BLUvbX/
 Gh2/tKn7Xc25HuLAFEbs
 =TD5o
 -----END PGP SIGNATURE-----

Merge tag 'md/4.4' of git://neil.brown.name/md

Pull md updates from Neil Brown:
 "Two major components to this update.

   1) The clustered-raid1 support from SUSE is nearly complete.  There
      are a few outstanding issues being worked on.  Maybe half a dozen
      patches will bring this to a usable state.

   2) The first stage of journalled-raid5 support from Facebook makes an
      appearance.  With a journal device configured (typically NVRAM or
      SSD), the "RAID5 write hole" should be closed - a crash during
      degraded operations cannot result in data corruption.

      The next stage will be to use the journal as a write-behind cache
      so that latency can be reduced and in some cases throughput
      increased by performing more full-stripe writes.

* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
  MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
  MD: set journal disk ->raid_disk
  MD: kick out journal disk if it's not fresh
  raid5-cache: start raid5 readonly if journal is missing
  MD: add new bit to indicate raid array with journal
  raid5-cache: IO error handling
  raid5: journal disk can't be removed
  raid5-cache: add trim support for log
  MD: fix info output for journal disk
  raid5-cache: use bio chaining
  raid5-cache: small log->seq cleanup
  raid5-cache: new helper: r5_reserve_log_entry
  raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
  raid5-cache: take rdev->data_offset into account early on
  raid5-cache: refactor bio allocation
  raid5-cache: clean up r5l_get_meta
  raid5-cache: simplify state machine when caches flushes are not needed
  raid5-cache: factor out a helper to run all stripes for an I/O unit
  raid5-cache: rename flushed_ios to finished_ios
  raid5-cache: free I/O units earlier
  ...
This commit is contained in:
Linus Torvalds 2015-11-04 21:12:47 -08:00
commit ac322de6bf
14 changed files with 1991 additions and 312 deletions

View File

@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
raid456-y += raid5.o raid5-cache.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise

View File

@ -613,12 +613,10 @@ re_read:
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
/* XXX: This is a hack to ensure that we don't use clustering
* in case:
* - dm-raid is in use and
* - the nodes written in bitmap_sb is erroneous.
/* Setup nodes/clustername only if bitmap version is
* cluster-compatible
*/
if (!bitmap->mddev->sync_super) {
if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strlcpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
@ -628,7 +626,7 @@ re_read:
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic";
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
reason = "unrecognized superblock version";
else if (chunksize < 512)
reason = "bitmap chunksize too small";
@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
}
EXPORT_SYMBOL(bitmap_close_sync);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
{
sector_t s = 0;
sector_t blocks;
@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->last_end_sync = jiffies;
return;
}
if (time_before(jiffies, (bitmap->last_end_sync
if (!force && time_before(jiffies, (bitmap->last_end_sync
+ bitmap->mddev->bitmap_info.daemon_sleep)))
return;
wait_event(bitmap->mddev->recovery_wait,

View File

@ -9,8 +9,10 @@
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
* Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev);

View File

@ -28,6 +28,7 @@ struct dlm_lock_resource {
struct completion completion; /* completion for synchronized locking */
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
struct mddev *mddev; /* pointing back to mddev. */
int mode;
};
struct suspend_info {
@ -53,8 +54,8 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list;
spinlock_t suspend_lock;
struct md_thread *recovery_thread;
@ -79,20 +80,20 @@ enum msg_type {
};
struct cluster_msg {
int type;
int slot;
__le32 type;
__le32 slot;
/* TODO: Unionize this for smaller footprint */
sector_t low;
sector_t high;
__le64 low;
__le64 high;
char uuid[16];
int raid_slot;
__le32 raid_slot;
};
static void sync_ast(void *arg)
{
struct dlm_lock_resource *res;
res = (struct dlm_lock_resource *) arg;
res = arg;
complete(&res->completion);
}
@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
if (ret)
return ret;
wait_for_completion(&res->completion);
if (res->lksb.sb_status == 0)
res->mode = mode;
return res->lksb.sb_status;
}
@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
res->mode = DLM_LOCK_IV;
namelen = strlen(name);
res->name = kzalloc(namelen + 1, GFP_KERNEL);
if (!res->name) {
@ -191,8 +195,8 @@ retry:
kfree(res);
}
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi)
static void add_resync_info(struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi)
{
struct resync_info *ri;
@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc
dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
hi = le64_to_cpu(ri.hi);
if (ri.hi > 0) {
if (hi > 0) {
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
if (!s)
goto out;
@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = {
*/
static void ack_bast(void *arg, int mode)
{
struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
struct dlm_lock_resource *res = arg;
struct md_cluster_info *cinfo = res->mddev->cluster_info;
if (mode == DLM_LOCK_EX)
@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
if (slot == s->slot) {
pr_info("%s:%d Deleting suspend_info: %d\n",
__func__, __LINE__, slot);
list_del(&s->list);
kfree(s);
break;
}
}
static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
static void remove_suspend_info(struct mddev *mddev, int slot)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
}
static void process_suspend_info(struct md_cluster_info *cinfo,
static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct suspend_info *s;
if (!hi) {
remove_suspend_info(cinfo, slot);
remove_suspend_info(mddev, slot);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
s->slot = slot;
s->lo = lo;
s->hi = hi;
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
}
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
md_reload_sb(mddev);
md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
le32_to_cpu(msg->raid_slot));
if (rdev)
md_kick_rdev_from_array(rdev);
else
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
le32_to_cpu(msg->raid_slot));
if (rdev && test_bit(Faulty, &rdev->flags))
clear_bit(Faulty, &rdev->flags);
else
pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
pr_warn("%s: %d Could not find disk(%d) which is faulty",
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
switch (msg->type) {
if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
"node %d received it's own msg\n", le32_to_cpu(msg->slot)))
return;
switch (le32_to_cpu(msg->type)) {
case METADATA_UPDATED:
pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
__func__, __LINE__, msg->slot);
process_metadata_update(mddev, msg);
break;
case RESYNCING:
pr_info("%s: %d Received message: RESYNCING from %d\n",
__func__, __LINE__, msg->slot);
process_suspend_info(mddev->cluster_info, msg->slot,
msg->low, msg->high);
process_suspend_info(mddev, le32_to_cpu(msg->slot),
le64_to_cpu(msg->low),
le64_to_cpu(msg->high));
break;
case NEWDISK:
pr_info("%s: %d Received message: NEWDISK from %d\n",
__func__, __LINE__, msg->slot);
process_add_new_disk(mddev, msg);
break;
case REMOVE:
pr_info("%s: %d Received REMOVE from %d\n",
__func__, __LINE__, msg->slot);
process_remove_disk(mddev, msg);
break;
case RE_ADD:
pr_info("%s: %d Received RE_ADD from %d\n",
__func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
case BITMAP_NEEDS_SYNC:
pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
__func__, __LINE__, msg->slot);
__recover_slot(mddev, msg->slot);
__recover_slot(mddev, le32_to_cpu(msg->slot));
break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread)
/* lock_comm()
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
* If called again, and the TOKEN lock is alread in EX mode
* return success. However, care must be taken that unlock_comm()
* is called only once.
*/
static int lock_comm(struct md_cluster_info *cinfo)
{
int error;
if (cinfo->token_lockres->mode == DLM_LOCK_EX)
return 0;
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
static void unlock_comm(struct md_cluster_info *cinfo)
{
WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
dlm_unlock_sync(cinfo->token_lockres);
}
@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes)
init_completion(&cinfo->completion);
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
if (!cinfo->resync_lockres)
goto err;
ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
@ -763,6 +778,7 @@ err:
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
@ -771,12 +787,32 @@ err:
return ret;
}
static void resync_bitmap(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg = {0};
int err;
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
err = sendmsg(cinfo, &cmsg);
if (err)
pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
__func__, __LINE__, err);
}
static int leave(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
if (!cinfo)
return 0;
/* BITMAP_NEEDS_SYNC message should be sent when node
* is leaving the cluster with dirty bitmap, also we
* can only deliver it when dlm connection is available */
if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
resync_bitmap(mddev);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev)
return cinfo->slot_number - 1;
}
static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
/* Re-acquire the lock to refresh LVB */
dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
}
static int metadata_update_start(struct mddev *mddev)
{
return lock_comm(mddev->cluster_info);
@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int ret;
struct md_rdev *rdev;
int ret = 0;
int raid_slot = -1;
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(METADATA_UPDATED);
ret = __sendmsg(cinfo, &cmsg);
/* Pick up a good active device number to send.
*/
rdev_for_each(rdev, mddev)
if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
raid_slot = rdev->desc_nr;
break;
}
if (raid_slot >= 0) {
cmsg.raid_slot = cpu_to_le32(raid_slot);
ret = __sendmsg(cinfo, &cmsg);
} else
pr_warn("md-cluster: No good device id found to send\n");
unlock_comm(cinfo);
return ret;
}
static int metadata_update_cancel(struct mddev *mddev)
static void metadata_update_cancel(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
return dlm_unlock_sync(cinfo->token_lockres);
unlock_comm(cinfo);
}
static int resync_send(struct mddev *mddev, enum msg_type type,
sector_t lo, sector_t hi)
static int resync_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int slot = cinfo->slot_number - 1;
cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
}
pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
(unsigned long long)lo,
(unsigned long long)hi);
resync_info_update(mddev, lo, hi);
cmsg.type = cpu_to_le32(type);
cmsg.slot = cpu_to_le32(slot);
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg = {0};
add_resync_info(cinfo->bitmap_lockres, lo, hi);
/* Re-acquire the lock to refresh LVB */
dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
cmsg.type = cpu_to_le32(RESYNCING);
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
return sendmsg(cinfo, &cmsg);
}
static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
{
pr_info("%s:%d\n", __func__, __LINE__);
return resync_send(mddev, RESYNCING, lo, hi);
}
static void resync_finish(struct mddev *mddev)
static int resync_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int slot = cinfo->slot_number - 1;
pr_info("%s:%d\n", __func__, __LINE__);
resync_send(mddev, RESYNCING, 0, 0);
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
cmsg.slot = cpu_to_le32(slot);
sendmsg(cinfo, &cmsg);
}
cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
dlm_unlock_sync(cinfo->resync_lockres);
return resync_info_update(mddev, 0, 0);
}
static int area_resyncing(struct mddev *mddev, int direction,
@ -896,7 +926,11 @@ out:
return ret;
}
static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* add_new_disk() - initiates a disk add
* However, if this fails before writing md_update_sb(),
* add_new_disk_cancel() must be called to release token lock
*/
static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
cmsg.raid_slot = rdev->desc_nr;
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* Some node does not "see" the device */
if (ret == -EAGAIN)
ret = -ENOENT;
if (ret)
unlock_comm(cinfo);
else
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
return ret;
}
static int add_new_disk_finish(struct mddev *mddev)
static void add_new_disk_cancel(struct mddev *mddev)
{
struct cluster_msg cmsg;
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret;
/* Write sb and inform others */
md_update_sb(mddev, 1);
cmsg.type = METADATA_UPDATED;
ret = __sendmsg(cinfo, &cmsg);
unlock_comm(cinfo);
return ret;
}
static int new_disk_ack(struct mddev *mddev, bool ack)
@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct cluster_msg cmsg;
struct cluster_msg cmsg = {0};
struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = REMOVE;
cmsg.raid_slot = rdev->desc_nr;
cmsg.type = cpu_to_le32(REMOVE);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
return __sendmsg(cinfo, &cmsg);
}
@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
{
int sn, err;
sector_t lo, hi;
struct cluster_msg cmsg;
struct cluster_msg cmsg = {0};
struct mddev *mddev = rdev->mddev;
struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = RE_ADD;
cmsg.raid_slot = rdev->desc_nr;
cmsg.type = cpu_to_le32(RE_ADD);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
err = sendmsg(cinfo, &cmsg);
if (err)
goto out;
@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
.join = join,
.leave = leave,
.slot_number = slot_number,
.resync_info_update = resync_info_update,
.resync_start = resync_start,
.resync_finish = resync_finish,
.resync_info_update = resync_info_update,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel,
.area_resyncing = area_resyncing,
.add_new_disk_start = add_new_disk_start,
.add_new_disk_finish = add_new_disk_finish,
.add_new_disk = add_new_disk,
.add_new_disk_cancel = add_new_disk_cancel,
.new_disk_ack = new_disk_ack,
.remove_disk = remove_disk,
.gather_bitmaps = gather_bitmaps,
@ -1022,5 +1051,6 @@ static void cluster_exit(void)
module_init(cluster_init);
module_exit(cluster_exit);
MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");

View File

@ -12,15 +12,15 @@ struct md_cluster_operations {
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
void (*resync_finish)(struct mddev *mddev);
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
int (*metadata_update_cancel)(struct mddev *mddev);
void (*metadata_update_cancel)(struct mddev *mddev);
int (*resync_start)(struct mddev *mddev);
int (*resync_finish)(struct mddev *mddev);
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
int (*add_new_disk_finish)(struct mddev *mddev);
int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
void (*add_new_disk_cancel)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
int (*gather_bitmaps)(struct md_rdev *rdev);

View File

@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
++ev1;
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
int role;
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = 0xffff;
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case 0xffff: /* spare */
case MD_DISK_ROLE_SPARE: /* spare */
break;
case 0xfffe: /* faulty */
case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
case MD_DISK_ROLE_JOURNAL: /* journal device */
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
/* journal device without journal feature */
printk(KERN_WARNING
"md: journal device provided without journal feature, ignoring the device\n");
return -EINVAL;
}
set_bit(Journal, &rdev->flags);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
if (mddev->recovery_cp == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
rdev->raid_disk = mddev->raid_disks;
break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
sb->resync_offset = cpu_to_le64(MaxSector);
else
sb->resync_offset = cpu_to_le64(0);
@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
if (rdev->raid_disk >= 0 &&
if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags)) {
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
}
/* Note: recovery_offset and journal_tail share space */
if (test_bit(Journal, &rdev->flags))
sb->journal_tail = cpu_to_le64(rdev->journal_tail);
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
}
}
if (mddev_is_clustered(mddev))
sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0)
@ -1785,18 +1809,23 @@ retry:
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(0xfffe);
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else if (test_bit(Journal, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else
sb->dev_roles[i] = cpu_to_le16(0xffff);
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
}
sb->sb_csum = calc_sb_1_csum(sb);
@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
struct md_rdev *rdev, *rdev2;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev1)
rdev_for_each_rcu(rdev2, mddev2)
rdev_for_each_rcu(rdev, mddev1) {
if (test_bit(Faulty, &rdev->flags) ||
test_bit(Journal, &rdev->flags) ||
rdev->raid_disk == -1)
continue;
rdev_for_each_rcu(rdev2, mddev2) {
if (test_bit(Faulty, &rdev2->flags) ||
test_bit(Journal, &rdev2->flags) ||
rdev2->raid_disk == -1)
continue;
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1;
}
}
}
rcu_read_unlock();
return 0;
}
@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares)
}
}
static bool does_sb_need_changing(struct mddev *mddev)
{
struct md_rdev *rdev;
struct mdp_superblock_1 *sb;
int role;
/* Find a good rdev */
rdev_for_each(rdev, mddev)
if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
break;
/* No good device found. */
if (!rdev)
return false;
sb = page_address(rdev->sb_page);
/* Check if a device has become faulty or a spare become active */
rdev_for_each(rdev, mddev) {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
/* Device activated? */
if (role == 0xffff && rdev->raid_disk >=0 &&
!test_bit(Faulty, &rdev->flags))
return true;
/* Device turned faulty? */
if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
return true;
}
/* Check if any mddev parameters have changed */
if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
(mddev->layout != le64_to_cpu(sb->layout)) ||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
return true;
return false;
}
void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
int nospares = 0;
int any_badblocks_changed = 0;
int ret = -1;
if (mddev->ro) {
if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return;
}
if (mddev_is_clustered(mddev)) {
if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
force_change = 1;
ret = md_cluster_ops->metadata_update_start(mddev);
/* Has someone else has updated the sb */
if (!does_sb_need_changing(mddev)) {
if (ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
return;
}
}
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed;
@ -2354,6 +2447,9 @@ repeat:
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);
@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
if (test_bit(Journal, &flags)) {
len += sprintf(page+len, "%sjournal",sep);
sep = ",";
}
if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page)
sep = ",";
}
if (!test_bit(Faulty, &flags) &&
!test_bit(Journal, &flags) &&
!test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
@ -2488,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = -EBUSY;
else {
struct mddev *mddev = rdev->mddev;
if (mddev_is_clustered(mddev))
md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (mddev->pers)
md_update_sb(mddev, 1);
md_new_event(mddev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
err = 0;
if (mddev_is_clustered(mddev))
err = md_cluster_ops->remove_disk(mddev, rdev);
if (err == 0) {
md_kick_rdev_from_array(rdev);
if (mddev->pers)
md_update_sb(mddev, 1);
md_new_event(mddev);
}
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
@ -2527,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags)) {
if (rdev->mddev->pers == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->saved_raid_disk = rdev->raid_disk;
@ -2546,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
@ -2623,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t
slot_show(struct md_rdev *rdev, char *page)
{
if (rdev->raid_disk < 0)
if (test_bit(Journal, &rdev->flags))
return sprintf(page, "journal\n");
else if (rdev->raid_disk < 0)
return sprintf(page, "none\n");
else
return sprintf(page, "%d\n", rdev->raid_disk);
@ -2635,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
int slot;
int err;
if (test_bit(Journal, &rdev->flags))
return -EBUSY;
if (strncmp(buf, "none", 4)==0)
slot = -1;
else {
@ -2686,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
err = rdev->mddev->pers->
hot_add_disk(rdev->mddev, rdev);
if (err) {
rdev->raid_disk = -1;
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
if (sysfs_link_rdev(rdev->mddev, rdev))
/* failure here is OK */;
remove_and_add_spares(rdev->mddev, rdev);
if (rdev->raid_disk == -1)
return -EBUSY;
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
@ -2839,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
sector_t oldsectors = rdev->sectors;
sector_t sectors;
if (test_bit(Journal, &rdev->flags))
return -EBUSY;
if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL;
if (rdev->data_offset != rdev->new_data_offset)
@ -3196,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev)
md_kick_rdev_from_array(rdev);
continue;
}
/* No device should have a Candidate flag
* when reading devices
*/
if (test_bit(Candidate, &rdev->flags)) {
pr_info("md: kicking Cluster Candidate %s from array!\n",
bdevname(rdev->bdev, b));
md_kick_rdev_from_array(rdev);
}
}
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr;
set_bit(In_sync, &rdev->flags);
} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
} else if (rdev->raid_disk >=
(mddev->raid_disks - min(0, mddev->delta_disks)) &&
!test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
@ -3267,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
{
unsigned long msec;
if (mddev_is_clustered(mddev)) {
pr_info("md: Safemode is disabled for clustered mode\n");
return -EINVAL;
}
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
if (msec == 0)
@ -3867,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case clean:
if (mddev->pers) {
restart_array(mddev);
err = restart_array(mddev);
if (err)
break;
spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
@ -3885,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case active:
if (mddev->pers) {
restart_array(mddev);
err = restart_array(mddev);
if (err)
break;
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
@ -4064,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
if (mddev->pers) {
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
} else {
if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors)
@ -5181,7 +5282,10 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
if (mddev_is_clustered(mddev))
mddev->safemode_delay = 0;
else
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
@ -5224,6 +5328,9 @@ static int do_md_run(struct mddev *mddev)
goto out;
}
if (mddev_is_clustered(mddev))
md_allow_write(mddev);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
@ -5246,6 +5353,25 @@ static int restart_array(struct mddev *mddev)
return -EINVAL;
if (!mddev->ro)
return -EBUSY;
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
struct md_rdev *rdev;
bool has_journal = false;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
if (test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
has_journal = true;
break;
}
}
rcu_read_unlock();
/* Don't restart rw with journal missing/faulty */
if (!has_journal)
return -EINVAL;
}
mddev->safemode = 0;
mddev->ro = 0;
set_disk_ro(disk, 0);
@ -5307,8 +5433,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
@ -5322,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev)
md_super_wait(mddev);
if (mddev->ro == 0 &&
(!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
(mddev->flags & MD_UPDATE_SB_FLAGS))) {
/* mark array as shutdown cleanly */
mddev->in_sync = 1;
if (!mddev_is_clustered(mddev))
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
}
void md_stop_writes(struct mddev *mddev)
@ -5789,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC);
}
if (test_bit(Journal, &rdev->flags))
info.state |= (1<<MD_DISK_JOURNAL);
if (test_bit(WriteMostly, &rdev->flags))
info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else {
@ -5903,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
else
clear_bit(WriteMostly, &rdev->flags);
if (info->state & (1<<MD_DISK_JOURNAL))
set_bit(Journal, &rdev->flags);
/*
* check whether the device shows up in other nodes
*/
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
/* Through --cluster-confirm */
if (info->state & (1 << MD_DISK_CANDIDATE))
set_bit(Candidate, &rdev->flags);
err = md_cluster_ops->new_disk_ack(mddev, true);
if (err) {
export_rdev(rdev);
return err;
}
} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
err = md_cluster_ops->add_new_disk_start(mddev, rdev);
err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) {
md_cluster_ops->add_new_disk_finish(mddev);
export_rdev(rdev);
return err;
}
@ -5928,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev);
else
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE))
md_cluster_ops->new_disk_ack(mddev, (err == 0));
else {
if (err)
md_cluster_ops->add_new_disk_cancel(mddev);
else
err = add_bound_rdev(rdev);
}
} else if (!err)
err = add_bound_rdev(rdev);
if (mddev_is_clustered(mddev) &&
(info->state & (1 << MD_DISK_CLUSTER_ADD)))
md_cluster_ops->add_new_disk_finish(mddev);
return err;
}
@ -5990,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
int ret = -1;
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
ret = md_cluster_ops->metadata_update_start(mddev);
if (rdev->raid_disk < 0)
goto kick_rdev;
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev);
@ -6004,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
if (rdev->raid_disk >= 0)
goto busy;
if (mddev_is_clustered(mddev))
kick_rdev:
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1);
md_new_event(mddev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
return 0;
busy:
if (mddev_is_clustered(mddev))
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
@ -6068,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
goto abort_export;
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
goto abort_clustered;
goto abort_export;
/*
* The rest should better be atomic, we can have disk failures
@ -6085,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
rdev->raid_disk = -1;
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
@ -6097,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_new_event(mddev);
return 0;
abort_clustered:
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_cancel(mddev);
abort_export:
export_rdev(rdev);
return err;
@ -6417,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return rv;
}
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
@ -6476,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
}
}
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
return rv;
err:
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_cancel(mddev);
return rv;
}
@ -7282,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
bdevname(rdev->bdev,b), rdev->desc_nr);
if (test_bit(WriteMostly, &rdev->flags))
seq_printf(seq, "(W)");
if (test_bit(Journal, &rdev->flags))
seq_printf(seq, "(J)");
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
@ -7594,11 +7716,7 @@ int md_allow_write(struct mddev *mddev)
mddev->safemode == 0)
mddev->safemode = 1;
spin_unlock(&mddev->lock);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
spin_unlock(&mddev->lock);
@ -7630,6 +7748,7 @@ void md_do_sync(struct md_thread *thread)
struct md_rdev *rdev;
char *desc, *action = NULL;
struct blk_plug plug;
bool cluster_resync_finished = false;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@ -7739,6 +7858,7 @@ void md_do_sync(struct md_thread *thread)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
@ -7799,9 +7919,6 @@ void md_do_sync(struct md_thread *thread)
md_new_event(mddev);
update_time = jiffies;
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_start(mddev, j, max_sectors);
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
@ -7865,8 +7982,6 @@ void md_do_sync(struct md_thread *thread)
j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
@ -7937,7 +8052,11 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
/* tell personality that we are finished */
/* tell personality and other nodes that we are finished */
if (mddev_is_clustered(mddev)) {
md_cluster_ops->resync_finish(mddev);
cluster_resync_finished = true;
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
@ -7965,6 +8084,7 @@ void md_do_sync(struct md_thread *thread)
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
@ -7973,11 +8093,13 @@ void md_do_sync(struct md_thread *thread)
}
}
skip:
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_finish(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!cluster_resync_finished)
md_cluster_ops->resync_finish(mddev);
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
@ -8008,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
! test_bit(In_sync, &rdev->flags)) &&
(!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
@ -8020,18 +8143,25 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded");
if (this)
if (this && removed)
goto no_add;
rdev_for_each(rdev, mddev) {
if (this && this != rdev)
continue;
if (test_bit(Candidate, &rdev->flags))
continue;
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk >= 0)
continue;
if (test_bit(Faulty, &rdev->flags))
continue;
if (test_bit(Journal, &rdev->flags))
continue;
if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
@ -8056,14 +8186,25 @@ no_add:
static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
int ret = 0;
if (mddev_is_clustered(mddev)) {
ret = md_cluster_ops->resync_start(mddev);
if (ret) {
mddev->sync_thread = NULL;
goto out;
}
}
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"resync");
out:
if (!mddev->sync_thread) {
printk(KERN_ERR "%s: could not start resync"
" thread...\n",
mdname(mddev));
if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
printk(KERN_ERR "%s: could not start resync"
" thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@ -8182,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
if (mddev->flags & MD_UPDATE_SB_FLAGS) {
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
md_update_sb(mddev, 0);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
}
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@ -8286,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
@ -8300,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@ -8924,25 +9056,128 @@ err_wq:
return ret;
}
void md_reload_sb(struct mddev *mddev)
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
struct md_rdev *rdev, *tmp;
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
struct md_rdev *rdev2;
int role, ret;
char b[BDEVNAME_SIZE];
rdev_for_each_safe(rdev, tmp, mddev) {
rdev->sb_loaded = 0;
ClearPageUptodate(rdev->sb_page);
}
mddev->raid_disks = 0;
analyze_sbs(mddev);
rdev_for_each_safe(rdev, tmp, mddev) {
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
/* since we don't write to faulty devices, we figure out if the
* disk is faulty by comparing events
*/
if (mddev->events > sb->events)
set_bit(Faulty, &rdev->flags);
/* Check for change of roles in the active devices */
rdev_for_each(rdev2, mddev) {
if (test_bit(Faulty, &rdev2->flags))
continue;
/* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
if (test_bit(Candidate, &rdev2->flags)) {
if (role == 0xfffe) {
pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
md_kick_rdev_from_array(rdev2);
continue;
}
else
clear_bit(Candidate, &rdev2->flags);
}
if (role != rdev2->raid_disk) {
/* got activated */
if (rdev2->raid_disk == -1 && role != 0xffff) {
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %s\n",
bdevname(rdev2->bdev,b));
continue;
}
/* device faulty
* We just want to do the minimum to mark the disk
* as faulty. The recovery is performed by the
* one who initiated the error.
*/
if ((role == 0xfffe) || (role == 0xfffd)) {
md_error(mddev, rdev2);
clear_bit(Blocked, &rdev2->flags);
}
}
}
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
/* Finally set the event to be up to date */
mddev->events = le64_to_cpu(sb->events);
}
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
int err;
struct page *swapout = rdev->sb_page;
struct mdp_superblock_1 *sb;
/* Store the sb page of the rdev in the swapout temporary
* variable in case we err in the future
*/
rdev->sb_page = NULL;
alloc_disk_sb(rdev);
ClearPageUptodate(rdev->sb_page);
rdev->sb_loaded = 0;
err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
if (err < 0) {
pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
__func__, __LINE__, rdev->desc_nr, err);
put_page(rdev->sb_page);
rdev->sb_page = swapout;
rdev->sb_loaded = 1;
return err;
}
sb = page_address(rdev->sb_page);
/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
* is not set
*/
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
/* The other node finished recovery, call spare_active to set
* device In_sync and mddev->degraded
*/
if (rdev->recovery_offset == MaxSector &&
!test_bit(In_sync, &rdev->flags) &&
mddev->pers->spare_active(mddev))
sysfs_notify(&mddev->kobj, NULL, "degraded");
put_page(swapout);
return 0;
}
void md_reload_sb(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
int err;
/* Find the rdev */
rdev_for_each_rcu(rdev, mddev) {
if (rdev->desc_nr == nr)
break;
}
if (!rdev || rdev->desc_nr != nr) {
pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
return;
}
err = read_rdev(mddev, rdev);
if (err < 0)
return;
check_sb_changes(mddev, rdev);
/* Read all rdev's to update recovery_offset */
rdev_for_each_rcu(rdev, mddev)
read_rdev(mddev, rdev);
}
EXPORT_SYMBOL(md_reload_sb);

View File

@ -87,10 +87,16 @@ struct md_rdev {
* array and could again if we did a partial
* resync from the bitmap
*/
sector_t recovery_offset;/* If this device has been partially
union {
sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
sector_t journal_tail; /* If this device is a journal device,
* this is the journal tail (journal
* recovery start point)
*/
};
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
@ -172,6 +178,11 @@ enum flag_bits {
* This device is seen locally but not
* by the whole cluster
*/
Journal, /* This device is used as journal for
* raid-5/6.
* Usually, this device should be faster
* than other devices in the array
*/
};
#define BB_LEN_MASK (0x00000000000001FFULL)
@ -221,6 +232,8 @@ struct mddev {
#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
* md_ioctl checked on it.
*/
#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
int suspended;
atomic_t active_io;
@ -658,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev);
extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
extern void md_reload_sb(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);

View File

@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
/*
* find the disk ... but prefer rdev->saved_raid_disk
* if possible.
*/
if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
first = last = rdev->saved_raid_disk;
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
@ -2495,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bitmap_close_sync(mddev->bitmap);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
}
return 0;
}
@ -2515,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
return sync_blocks;
}
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
/* we are incrementing sector_nr below. To be safe, we check against
* sector_nr + two times RESYNC_SECTORS
*/
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
raise_barrier(conf, sector_nr);
@ -2706,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio_full:
r1_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
/* Send resync message */
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
/* For a user-requested sync, we read all readable devices and do a
* compare
*/
@ -3020,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}
err = md_allow_write(mddev);
if (err)
return err;
if (!mddev_is_clustered(mddev)) {
err = md_allow_write(mddev);
if (err)
return err;
}
raid_disks = mddev->raid_disks + mddev->delta_disks;

View File

@ -111,6 +111,13 @@ struct r1conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
/* Keep track of cluster resync window to send to other
* nodes.
*/
sector_t cluster_sync_low;
sector_t cluster_sync_high;
};
/*

View File

@ -3149,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&

1191
drivers/md/raid5-cache.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -353,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
struct list_head *list = &temp_inactive_list[size - 1];
/*
* We don't hold any lock here yet, get_active_stripe() might
* We don't hold any lock here yet, raid5_get_active_stripe() might
* remove stripes from the list
*/
if (!list_empty_careful(list)) {
@ -413,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
return count;
}
static void release_stripe(struct stripe_head *sh)
void raid5_release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
@ -658,9 +658,9 @@ static int has_failed(struct r5conf *conf)
return 0;
}
static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(sector);
@ -755,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
/* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
if (conf->log)
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh);
@ -858,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
unlock_out:
unlock_two_stripes(head, sh);
out:
release_stripe(head);
raid5_release_stripe(head);
}
/* Determine if 'data_offset' or 'new_data_offset' should be used
@ -895,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
if (r5l_write_stripe(conf->log, sh) == 0)
return;
for (i = disks; i--; ) {
int rw;
int replace_only = 0;
@ -1208,7 +1214,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void ops_run_biofill(struct stripe_head *sh)
@ -1271,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref)
if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
/* return a pointer to the address conversion region of the scribble buffer */
@ -1697,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
}
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void
@ -1855,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref)
sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
@ -2017,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
/* we just created an active stripe so... */
atomic_inc(&conf->active_stripes);
release_stripe(sh);
raid5_release_stripe(sh);
conf->max_nr_stripes++;
return 1;
}
@ -2236,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!p)
err = -ENOMEM;
}
release_stripe(nsh);
raid5_release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
@ -2394,7 +2400,7 @@ static void raid5_end_read_request(struct bio * bi)
rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void raid5_end_write_request(struct bio *bi)
@ -2468,14 +2474,12 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
release_stripe(sh->batch_head);
raid5_release_stripe(sh->batch_head);
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
@ -2491,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->rreq.bi_private = sh;
dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous);
dev->sector = raid5_compute_blocknr(sh, i, previous);
}
static void error(struct mddev *mddev, struct md_rdev *rdev)
@ -2524,9 +2528,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx,
struct stripe_head *sh)
sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx,
struct stripe_head *sh)
{
sector_t stripe, stripe2;
sector_t chunk_number;
@ -2726,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
return new_sector;
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{
struct r5conf *conf = sh->raid_conf;
int raid_disks = sh->disks;
@ -3098,6 +3102,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
r5l_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@ -3141,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* the data has not reached the cache yet.
*/
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
s->failed > conf->max_degraded &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
spin_lock_irq(&sh->stripe_lock);
@ -3497,6 +3504,9 @@ returnbi:
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
WARN_ON(dev->page != dev->orig_page);
}
r5l_stripe_write_finished(sh);
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
int hash;
@ -3939,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
struct stripe_head *sh2;
struct async_submit_ctl submit;
sector_t bn = compute_blocknr(sh, i, 1);
sector_t bn = raid5_compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL);
sh2 = get_active_stripe(conf, s, 0, 1, 1);
sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
@ -3952,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
/* must have already done this block */
release_stripe(sh2);
raid5_release_stripe(sh2);
continue;
}
@ -3973,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
set_bit(STRIPE_EXPAND_READY, &sh2->state);
set_bit(STRIPE_HANDLE, &sh2->state);
}
release_stripe(sh2);
raid5_release_stripe(sh2);
}
/* done submitting copies, wait for them to complete */
@ -4008,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */
rcu_read_lock();
@ -4259,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
if (handle_flags == 0 ||
sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
spin_lock_irq(&head_sh->stripe_lock);
head_sh->batch_head = NULL;
@ -4320,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh)
analyse_stripe(sh, &s);
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
if (s.handle_bad_blocks) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
@ -4348,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh)
/* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
*/
if (s.failed > conf->max_degraded) {
if (s.failed > conf->max_degraded || s.log_failed) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
@ -4506,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src
= get_active_stripe(conf, sh->sector, 1, 1, 1);
= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little
@ -4516,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
&sh_src->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe(sh_src);
raid5_release_stripe(sh_src);
goto finish;
}
if (sh_src)
release_stripe(sh_src);
raid5_release_stripe(sh_src);
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
@ -5012,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev,
struct raid5_plug_cb *cb;
if (!blk_cb) {
release_stripe(sh);
raid5_release_stripe(sh);
return;
}
@ -5028,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev,
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list);
else
release_stripe(sh);
raid5_release_stripe(sh);
}
static void make_discard_request(struct mddev *mddev, struct bio *bi)
@ -5063,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
DEFINE_WAIT(w);
int d;
again:
sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
goto again;
}
@ -5080,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
if (sh->dev[d].towrite || sh->dev[d].toread) {
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
goto again;
}
@ -5136,8 +5150,15 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bi);
return;
int ret = r5l_handle_flush_request(conf->log, bi);
if (ret == 0)
return;
if (ret == -ENODEV) {
md_flush_request(mddev, bi);
return;
}
/* ret == -EAGAIN, fallback */
}
md_write_start(mddev, bi);
@ -5210,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
sh = get_active_stripe(conf, new_sector, previous,
sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK), 0);
if (sh) {
if (unlikely(previous)) {
@ -5231,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
if (must_retry) {
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@ -5241,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi)
/* Might have got the wrong stripe_head
* by accident
*/
release_stripe(sh);
raid5_release_stripe(sh);
goto retry;
}
if (rw == WRITE &&
logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
release_stripe(sh);
raid5_release_stripe(sh);
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
@ -5271,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* and wait a while
*/
md_wakeup_thread(mddev->thread);
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@ -5458,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped_disk = 0;
sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
@ -5471,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (conf->level == 6 &&
j == sh->qd_idx)
continue;
s = compute_blocknr(sh, j, 0);
s = raid5_compute_blocknr(sh, j, 0);
if (s < raid5_size(mddev, 0, 0)) {
skipped_disk = 1;
continue;
@ -5507,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
sh = get_active_stripe(conf, first_sector, 1, 0, 1);
sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
/* Now that the sources are clearly marked, we can release
@ -5519,7 +5540,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
while (!list_empty(&stripes)) {
sh = list_entry(stripes.next, struct stripe_head, lru);
list_del_init(&sh->lru);
release_stripe(sh);
raid5_release_stripe(sh);
}
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
@ -5615,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
if (sh == NULL) {
sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
@ -5643,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
return STRIPE_SECTORS;
}
@ -5682,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
sh = get_active_stripe(conf, sector, 0, 1, 1);
sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) {
/* failed to get a stripe - must wait */
@ -5692,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
release_stripe(sh);
raid5_release_stripe(sh);
raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
@ -5700,7 +5721,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
handle_stripe(sh);
release_stripe(sh);
raid5_release_stripe(sh);
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
@ -5730,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
if (!list_empty(temp_inactive_list + i))
break;
if (i == NR_STRIPE_HASH_LOCKS)
if (i == NR_STRIPE_HASH_LOCKS) {
spin_unlock_irq(&conf->device_lock);
r5l_flush_stripe_to_raid(conf->log);
spin_lock_irq(&conf->device_lock);
return batch_size;
}
release_inactive = true;
}
spin_unlock_irq(&conf->device_lock);
@ -5739,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
release_inactive_stripe_list(conf, temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
r5l_flush_stripe_to_raid(conf->log);
if (release_inactive) {
spin_lock_irq(&conf->device_lock);
return 0;
@ -5746,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
r5l_write_stripe_run(conf->log);
cond_resched();
@ -5879,6 +5906,8 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex);
}
r5l_flush_stripe_to_raid(conf->log);
async_tx_issue_pending_all();
blk_finish_plug(&plug);
@ -6316,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf)
static void free_conf(struct r5conf *conf)
{
if (conf->log)
r5l_exit_log(conf->log);
if (conf->shrinker.seeks)
unregister_shrinker(&conf->shrinker);
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
@ -6530,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
|| raid_disk < 0)
|| raid_disk < 0 || test_bit(Journal, &rdev->flags))
continue;
disk = conf->disks + raid_disk;
@ -6650,6 +6682,7 @@ static int run(struct mddev *mddev)
int working_disks = 0;
int dirty_parity_disks = 0;
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
int i;
long long min_offset_diff = 0;
@ -6662,6 +6695,11 @@ static int run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
if (test_bit(Journal, &rdev->flags)) {
journal_dev = rdev;
continue;
}
if (rdev->raid_disk < 0)
continue;
diff = (rdev->new_data_offset - rdev->data_offset);
@ -6695,6 +6733,12 @@ static int run(struct mddev *mddev)
int chunk_sectors;
int new_data_disks;
if (journal_dev) {
printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape "
"required - aborting.\n",
@ -6770,6 +6814,13 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
mdname(mddev));
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
}
conf->min_offset_diff = min_offset_diff;
mddev->thread = conf->thread;
conf->thread = NULL;
@ -6973,6 +7024,14 @@ static int run(struct mddev *mddev)
mddev->queue);
}
if (journal_dev) {
char b[BDEVNAME_SIZE];
printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
mdname(mddev), bdevname(journal_dev->bdev, b));
r5l_init_log(conf, journal_dev);
}
return 0;
abort:
md_unregister_thread(&mddev->thread);
@ -7082,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct disk_info *p = conf->disks + number;
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags)) {
/*
* journal disk is not removable, but we need give a chance to
* update superblock of other disks. Otherwise journal disk
* will be considered as 'fresh'
*/
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return -EINVAL;
}
if (rdev == p->rdev)
rdevp = &p->rdev;
else if (rdev == p->replacement)
@ -7144,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int first = 0;
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags))
return -EINVAL;
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@ -7205,6 +7275,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
if (conf->log)
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size &&
@ -7256,6 +7328,8 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
if (conf->log)
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
mddev->new_chunk_sectors == mddev->chunk_sectors)
@ -7532,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf);
break;
}
r5l_quiesce(conf->log, state);
}
static void *raid45_takeover_raid0(struct mddev *mddev, int level)

View File

@ -223,6 +223,9 @@ struct stripe_head {
struct stripe_head *batch_head; /* protected by stripe lock */
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/
struct r5l_io_unit *log_io;
struct list_head log_list;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@ -244,6 +247,7 @@ struct stripe_head {
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
u32 log_checksum;
} dev[1]; /* allocated with extra space depending of RAID geometry */
};
@ -268,6 +272,7 @@ struct stripe_head_state {
struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
int log_failed;
};
/* Flags for struct r5dev.flags */
@ -340,6 +345,7 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet.
*/
STRIPE_LOG_TRAPPED, /* trapped into log */
};
#define STRIPE_EXPAND_SYNC_FLAGS \
@ -543,6 +549,7 @@ struct r5conf {
struct r5worker_group *worker_groups;
int group_cnt;
int worker_cnt_per_group;
struct r5l_log *log;
};
@ -609,4 +616,21 @@ static inline int algorithm_is_DDF(int layout)
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
extern void raid5_release_stripe(struct stripe_head *sh);
extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx,
struct stripe_head *sh);
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state);
extern bool r5l_log_disk_error(struct r5conf *conf);
#endif

View File

@ -89,6 +89,12 @@
* read requests will only be sent here in
* dire need
*/
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
#define MD_DISK_ROLE_SPARE 0xffff
#define MD_DISK_ROLE_FAULTY 0xfffe
#define MD_DISK_ROLE_JOURNAL 0xfffd
#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */
@ -252,7 +258,10 @@ struct mdp_superblock_1 {
__le64 data_offset; /* sector start of data, often 0 */
__le64 data_size; /* sectors in this device that can be used for data */
__le64 super_offset; /* sector start of this superblock */
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
union {
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
__le64 journal_tail;/* journal tail of journal device (from data_offset) */
};
__le32 dev_number; /* permanent identifier of this device - not role in raid */
__le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
@ -302,6 +311,8 @@ struct mdp_superblock_1 {
#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
* is guided by bitmap.
*/
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@ -310,6 +321,66 @@ struct mdp_superblock_1 {
|MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
)
struct r5l_payload_header {
__le16 type;
__le16 flags;
} __attribute__ ((__packed__));
enum r5l_payload_type {
R5LOG_PAYLOAD_DATA = 0,
R5LOG_PAYLOAD_PARITY = 1,
R5LOG_PAYLOAD_FLUSH = 2,
};
struct r5l_payload_data_parity {
struct r5l_payload_header header;
__le32 size; /* sector. data/parity size. each 4k
* has a checksum */
__le64 location; /* sector. For data, it's raid sector. For
* parity, it's stripe sector */
__le32 checksum[];
} __attribute__ ((__packed__));
enum r5l_payload_data_parity_flag {
R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
/*
* RESHAPED/RESHAPING is only set when there is reshape activity. Note,
* both data/parity of a stripe should have the same flag set
*
* RESHAPED: reshape is running, and this stripe finished reshape
* RESHAPING: reshape is running, and this stripe isn't reshaped
*/
R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
};
struct r5l_payload_flush {
struct r5l_payload_header header;
__le32 size; /* flush_stripes size, bytes */
__le64 flush_stripes[];
} __attribute__ ((__packed__));
enum r5l_payload_flush_flag {
R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
};
struct r5l_meta_block {
__le32 magic;
__le32 checksum;
__u8 version;
__u8 __zero_pading_1;
__le16 __zero_pading_2;
__le32 meta_size; /* whole size of the block */
__le64 seq;
__le64 position; /* sector, start from rdev->data_offset, current position */
struct r5l_payload_header payloads[];
} __attribute__ ((__packed__));
#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
#endif