mirror of
https://github.com/torvalds/linux.git
synced 2024-11-15 00:21:59 +00:00
dd0fc66fb3
- added typedef unsigned int __nocast gfp_t; - replaced __nocast uses for gfp flags with gfp_t - it gives exactly the same warnings as far as sparse is concerned, doesn't change generated code (from gcc point of view we replaced unsigned int with typedef) and documents what's going on far better. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
1277 lines
28 KiB
C
1277 lines
28 KiB
C
/*
|
|
* Copyright (C) 2003 Sistina Software Limited.
|
|
*
|
|
* This file is released under the GPL.
|
|
*/
|
|
|
|
#include "dm.h"
|
|
#include "dm-bio-list.h"
|
|
#include "dm-io.h"
|
|
#include "dm-log.h"
|
|
#include "kcopyd.h"
|
|
|
|
#include <linux/ctype.h>
|
|
#include <linux/init.h>
|
|
#include <linux/mempool.h>
|
|
#include <linux/module.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/time.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/workqueue.h>
|
|
|
|
static struct workqueue_struct *_kmirrord_wq;
|
|
static struct work_struct _kmirrord_work;
|
|
|
|
static inline void wake(void)
|
|
{
|
|
queue_work(_kmirrord_wq, &_kmirrord_work);
|
|
}
|
|
|
|
/*-----------------------------------------------------------------
|
|
* Region hash
|
|
*
|
|
* The mirror splits itself up into discrete regions. Each
|
|
* region can be in one of three states: clean, dirty,
|
|
* nosync. There is no need to put clean regions in the hash.
|
|
*
|
|
* In addition to being present in the hash table a region _may_
|
|
* be present on one of three lists.
|
|
*
|
|
* clean_regions: Regions on this list have no io pending to
|
|
* them, they are in sync, we are no longer interested in them,
|
|
* they are dull. rh_update_states() will remove them from the
|
|
* hash table.
|
|
*
|
|
* quiesced_regions: These regions have been spun down, ready
|
|
* for recovery. rh_recovery_start() will remove regions from
|
|
* this list and hand them to kmirrord, which will schedule the
|
|
* recovery io with kcopyd.
|
|
*
|
|
* recovered_regions: Regions that kcopyd has successfully
|
|
* recovered. rh_update_states() will now schedule any delayed
|
|
* io, up the recovery_count, and remove the region from the
|
|
* hash.
|
|
*
|
|
* There are 2 locks:
|
|
* A rw spin lock 'hash_lock' protects just the hash table,
|
|
* this is never held in write mode from interrupt context,
|
|
* which I believe means that we only have to disable irqs when
|
|
* doing a write lock.
|
|
*
|
|
* An ordinary spin lock 'region_lock' that protects the three
|
|
* lists in the region_hash, with the 'state', 'list' and
|
|
* 'delayed_bios' fields of the regions. This is used from irq
|
|
* context, so all other uses will have to suspend local irqs.
|
|
*---------------------------------------------------------------*/
|
|
struct mirror_set;
|
|
struct region_hash {
|
|
struct mirror_set *ms;
|
|
uint32_t region_size;
|
|
unsigned region_shift;
|
|
|
|
/* holds persistent region state */
|
|
struct dirty_log *log;
|
|
|
|
/* hash table */
|
|
rwlock_t hash_lock;
|
|
mempool_t *region_pool;
|
|
unsigned int mask;
|
|
unsigned int nr_buckets;
|
|
struct list_head *buckets;
|
|
|
|
spinlock_t region_lock;
|
|
struct semaphore recovery_count;
|
|
struct list_head clean_regions;
|
|
struct list_head quiesced_regions;
|
|
struct list_head recovered_regions;
|
|
};
|
|
|
|
/* Values stored in struct region's 'state' field. */
enum {
	RH_CLEAN,	/* no writes pending, mirrors agree */
	RH_DIRTY,	/* writes pending */
	RH_NOSYNC,	/* dirty log reports the region out of sync */
	RH_RECOVERING	/* picked for recovery; new writes are delayed */
};
|
|
|
|
struct region {
|
|
struct region_hash *rh; /* FIXME: can we get rid of this ? */
|
|
region_t key;
|
|
int state;
|
|
|
|
struct list_head hash_list;
|
|
struct list_head list;
|
|
|
|
atomic_t pending;
|
|
struct bio_list delayed_bios;
|
|
};
|
|
|
|
/*
|
|
* Conversion fns
|
|
*/
|
|
static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
|
|
{
|
|
return bio->bi_sector >> rh->region_shift;
|
|
}
|
|
|
|
/* First sector of @region. */
static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
{
	region_t start = region << rh->region_shift;

	return start;
}
|
|
|
|
/* FIXME move this */
|
|
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
|
|
|
|
/* mempool allocation callback: one struct region from kmalloc. */
static void *region_alloc(gfp_t gfp_mask, void *pool_data)
{
	struct region *reg = kmalloc(sizeof(*reg), gfp_mask);

	return reg;
}
|
|
|
|
/* mempool free callback, the counterpart of region_alloc(). */
static void region_free(void *element, void *pool_data)
{
	kfree(element);
}
|
|
|
|
#define MIN_REGIONS 64
|
|
#define MAX_RECOVERY 1
|
|
static int rh_init(struct region_hash *rh, struct mirror_set *ms,
|
|
struct dirty_log *log, uint32_t region_size,
|
|
region_t nr_regions)
|
|
{
|
|
unsigned int nr_buckets, max_buckets;
|
|
size_t i;
|
|
|
|
/*
|
|
* Calculate a suitable number of buckets for our hash
|
|
* table.
|
|
*/
|
|
max_buckets = nr_regions >> 6;
|
|
for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
|
|
;
|
|
nr_buckets >>= 1;
|
|
|
|
rh->ms = ms;
|
|
rh->log = log;
|
|
rh->region_size = region_size;
|
|
rh->region_shift = ffs(region_size) - 1;
|
|
rwlock_init(&rh->hash_lock);
|
|
rh->mask = nr_buckets - 1;
|
|
rh->nr_buckets = nr_buckets;
|
|
|
|
rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
|
|
if (!rh->buckets) {
|
|
DMERR("unable to allocate region hash memory");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
for (i = 0; i < nr_buckets; i++)
|
|
INIT_LIST_HEAD(rh->buckets + i);
|
|
|
|
spin_lock_init(&rh->region_lock);
|
|
sema_init(&rh->recovery_count, 0);
|
|
INIT_LIST_HEAD(&rh->clean_regions);
|
|
INIT_LIST_HEAD(&rh->quiesced_regions);
|
|
INIT_LIST_HEAD(&rh->recovered_regions);
|
|
|
|
rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
|
|
region_free, NULL);
|
|
if (!rh->region_pool) {
|
|
vfree(rh->buckets);
|
|
rh->buckets = NULL;
|
|
return -ENOMEM;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void rh_exit(struct region_hash *rh)
|
|
{
|
|
unsigned int h;
|
|
struct region *reg, *nreg;
|
|
|
|
BUG_ON(!list_empty(&rh->quiesced_regions));
|
|
for (h = 0; h < rh->nr_buckets; h++) {
|
|
list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
|
|
BUG_ON(atomic_read(®->pending));
|
|
mempool_free(reg, rh->region_pool);
|
|
}
|
|
}
|
|
|
|
if (rh->log)
|
|
dm_destroy_dirty_log(rh->log);
|
|
if (rh->region_pool)
|
|
mempool_destroy(rh->region_pool);
|
|
vfree(rh->buckets);
|
|
}
|
|
|
|
#define RH_HASH_MULT 2654435387U
|
|
|
|
static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
|
|
{
|
|
return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
|
|
}
|
|
|
|
/*
 * Find @region in the hash table.  Returns the region, or NULL when it is
 * not present.  Does no locking of its own.
 */
static struct region *__rh_lookup(struct region_hash *rh, region_t region)
{
	struct list_head *bucket = rh->buckets + rh_hash(rh, region);
	struct region *reg;

	list_for_each_entry (reg, bucket, hash_list) {
		if (reg->key == region)
			return reg;
	}

	return NULL;
}
|
|
|
|
static void __rh_insert(struct region_hash *rh, struct region *reg)
|
|
{
|
|
unsigned int h = rh_hash(rh, reg->key);
|
|
list_add(®->hash_list, rh->buckets + h);
|
|
}
|
|
|
|
/*
 * Create a hash entry for @region.
 *
 * Entered and left with hash_lock held for read; the lock is dropped while
 * the new region is allocated and then taken for write to insert it.
 * Because the lock is dropped, another thread may insert the same region
 * first; in that case the existing entry is returned and ours is freed.
 */
static struct region *__rh_alloc(struct region_hash *rh, region_t region)
{
	struct region *reg, *nreg;

	read_unlock(&rh->hash_lock);

	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
	if (rh->log->type->in_sync(rh->log, region, 1))
		nreg->state = RH_CLEAN;
	else
		nreg->state = RH_NOSYNC;
	nreg->rh = rh;
	nreg->key = region;
	INIT_LIST_HEAD(&nreg->list);
	atomic_set(&nreg->pending, 0);
	bio_list_init(&nreg->delayed_bios);

	write_lock_irq(&rh->hash_lock);

	reg = __rh_lookup(rh, region);
	if (!reg) {
		__rh_insert(rh, nreg);
		if (nreg->state == RH_CLEAN) {
			spin_lock(&rh->region_lock);
			list_add(&nreg->list, &rh->clean_regions);
			spin_unlock(&rh->region_lock);
		}
		reg = nreg;
	} else {
		/* we lost the race */
		mempool_free(nreg, rh->region_pool);
	}

	write_unlock_irq(&rh->hash_lock);
	read_lock(&rh->hash_lock);

	return reg;
}
|
|
|
|
/* Look @region up in the hash, creating an entry if there is none. */
static inline struct region *__rh_find(struct region_hash *rh, region_t region)
{
	struct region *reg = __rh_lookup(rh, region);

	return reg ? reg : __rh_alloc(rh, region);
}
|
|
|
|
/*
 * Report the RH_* state of @region.
 *
 * A region present in the hash reports its recorded state.  Otherwise the
 * dirty log is asked; @may_block is passed through to the log's in_sync().
 */
static int rh_state(struct region_hash *rh, region_t region, int may_block)
{
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	if (reg)
		return reg->state;

	/*
	 * Not in the hash, so fall back to the dirty log.  Anything other
	 * than a definite "in sync" answer, including an error such as
	 * -EWOULDBLOCK, is treated as RH_NOSYNC.
	 */
	if (rh->log->type->in_sync(rh->log, region, may_block) == 1)
		return RH_CLEAN;

	return RH_NOSYNC;
}
|
|
|
|
static inline int rh_in_sync(struct region_hash *rh,
|
|
region_t region, int may_block)
|
|
{
|
|
int state = rh_state(rh, region, may_block);
|
|
return state == RH_CLEAN || state == RH_DIRTY;
|
|
}
|
|
|
|
static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
|
|
{
|
|
struct bio *bio;
|
|
|
|
while ((bio = bio_list_pop(bio_list))) {
|
|
queue_bio(ms, bio, WRITE);
|
|
}
|
|
}
|
|
|
|
static void rh_update_states(struct region_hash *rh)
|
|
{
|
|
struct region *reg, *next;
|
|
|
|
LIST_HEAD(clean);
|
|
LIST_HEAD(recovered);
|
|
|
|
/*
|
|
* Quickly grab the lists.
|
|
*/
|
|
write_lock_irq(&rh->hash_lock);
|
|
spin_lock(&rh->region_lock);
|
|
if (!list_empty(&rh->clean_regions)) {
|
|
list_splice(&rh->clean_regions, &clean);
|
|
INIT_LIST_HEAD(&rh->clean_regions);
|
|
|
|
list_for_each_entry (reg, &clean, list) {
|
|
rh->log->type->clear_region(rh->log, reg->key);
|
|
list_del(®->hash_list);
|
|
}
|
|
}
|
|
|
|
if (!list_empty(&rh->recovered_regions)) {
|
|
list_splice(&rh->recovered_regions, &recovered);
|
|
INIT_LIST_HEAD(&rh->recovered_regions);
|
|
|
|
list_for_each_entry (reg, &recovered, list)
|
|
list_del(®->hash_list);
|
|
}
|
|
spin_unlock(&rh->region_lock);
|
|
write_unlock_irq(&rh->hash_lock);
|
|
|
|
/*
|
|
* All the regions on the recovered and clean lists have
|
|
* now been pulled out of the system, so no need to do
|
|
* any more locking.
|
|
*/
|
|
list_for_each_entry_safe (reg, next, &recovered, list) {
|
|
rh->log->type->clear_region(rh->log, reg->key);
|
|
rh->log->type->complete_resync_work(rh->log, reg->key, 1);
|
|
dispatch_bios(rh->ms, ®->delayed_bios);
|
|
up(&rh->recovery_count);
|
|
mempool_free(reg, rh->region_pool);
|
|
}
|
|
|
|
if (!list_empty(&recovered))
|
|
rh->log->type->flush(rh->log);
|
|
|
|
list_for_each_entry_safe (reg, next, &clean, list)
|
|
mempool_free(reg, rh->region_pool);
|
|
}
|
|
|
|
/*
 * Account for one more write to @region.
 *
 * Finds (or creates) the region, bumps its pending count and, if it was
 * clean, marks it in the dirty log, flips it to RH_DIRTY and removes it
 * from the clean list.
 */
static void rh_inc(struct region_hash *rh, region_t region)
{
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);

	atomic_inc(&reg->pending);

	spin_lock_irq(&rh->region_lock);
	if (reg->state == RH_CLEAN) {
		rh->log->type->mark_region(rh->log, reg->key);

		reg->state = RH_DIRTY;
		list_del_init(&reg->list);	/* take off the clean list */
	}
	spin_unlock_irq(&rh->region_lock);

	read_unlock(&rh->hash_lock);
}
|
|
|
|
static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
|
|
{
|
|
struct bio *bio;
|
|
|
|
for (bio = bios->head; bio; bio = bio->bi_next)
|
|
rh_inc(rh, bio_to_region(rh, bio));
|
|
}
|
|
|
|
/*
 * Account for the completion of one write to @region.
 *
 * When the pending count drops to zero the region is put on the quiesced
 * list if it is being recovered, or otherwise marked RH_CLEAN and put on
 * the clean list; kmirrord is then woken to process it.
 */
static void rh_dec(struct region_hash *rh, region_t region)
{
	unsigned long flags;
	struct region *reg;
	int should_wake = 0;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	if (atomic_dec_and_test(&reg->pending)) {
		spin_lock_irqsave(&rh->region_lock, flags);
		if (atomic_read(&reg->pending)) {	/* check race */
			spin_unlock_irqrestore(&rh->region_lock, flags);
			return;
		}
		if (reg->state == RH_RECOVERING) {
			list_add_tail(&reg->list, &rh->quiesced_regions);
		} else {
			reg->state = RH_CLEAN;
			list_add(&reg->list, &rh->clean_regions);
		}
		spin_unlock_irqrestore(&rh->region_lock, flags);
		should_wake = 1;
	}

	if (should_wake)
		wake();
}
|
|
|
|
/*
|
|
* Starts quiescing a region in preparation for recovery.
|
|
*/
|
|
static int __rh_recovery_prepare(struct region_hash *rh)
|
|
{
|
|
int r;
|
|
struct region *reg;
|
|
region_t region;
|
|
|
|
/*
|
|
* Ask the dirty log what's next.
|
|
*/
|
|
r = rh->log->type->get_resync_work(rh->log, ®ion);
|
|
if (r <= 0)
|
|
return r;
|
|
|
|
/*
|
|
* Get this region, and start it quiescing by setting the
|
|
* recovering flag.
|
|
*/
|
|
read_lock(&rh->hash_lock);
|
|
reg = __rh_find(rh, region);
|
|
read_unlock(&rh->hash_lock);
|
|
|
|
spin_lock_irq(&rh->region_lock);
|
|
reg->state = RH_RECOVERING;
|
|
|
|
/* Already quiesced ? */
|
|
if (atomic_read(®->pending))
|
|
list_del_init(®->list);
|
|
|
|
else {
|
|
list_del_init(®->list);
|
|
list_add(®->list, &rh->quiesced_regions);
|
|
}
|
|
spin_unlock_irq(&rh->region_lock);
|
|
|
|
return 1;
|
|
}
|
|
|
|
static void rh_recovery_prepare(struct region_hash *rh)
|
|
{
|
|
while (!down_trylock(&rh->recovery_count))
|
|
if (__rh_recovery_prepare(rh) <= 0) {
|
|
up(&rh->recovery_count);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Returns any quiesced regions.
|
|
*/
|
|
static struct region *rh_recovery_start(struct region_hash *rh)
|
|
{
|
|
struct region *reg = NULL;
|
|
|
|
spin_lock_irq(&rh->region_lock);
|
|
if (!list_empty(&rh->quiesced_regions)) {
|
|
reg = list_entry(rh->quiesced_regions.next,
|
|
struct region, list);
|
|
list_del_init(®->list); /* remove from the quiesced list */
|
|
}
|
|
spin_unlock_irq(&rh->region_lock);
|
|
|
|
return reg;
|
|
}
|
|
|
|
/* FIXME: success ignored for now */
|
|
static void rh_recovery_end(struct region *reg, int success)
|
|
{
|
|
struct region_hash *rh = reg->rh;
|
|
|
|
spin_lock_irq(&rh->region_lock);
|
|
list_add(®->list, ®->rh->recovered_regions);
|
|
spin_unlock_irq(&rh->region_lock);
|
|
|
|
wake();
|
|
}
|
|
|
|
static void rh_flush(struct region_hash *rh)
|
|
{
|
|
rh->log->type->flush(rh->log);
|
|
}
|
|
|
|
static void rh_delay(struct region_hash *rh, struct bio *bio)
|
|
{
|
|
struct region *reg;
|
|
|
|
read_lock(&rh->hash_lock);
|
|
reg = __rh_find(rh, bio_to_region(rh, bio));
|
|
bio_list_add(®->delayed_bios, bio);
|
|
read_unlock(&rh->hash_lock);
|
|
}
|
|
|
|
static void rh_stop_recovery(struct region_hash *rh)
|
|
{
|
|
int i;
|
|
|
|
/* wait for any recovering regions */
|
|
for (i = 0; i < MAX_RECOVERY; i++)
|
|
down(&rh->recovery_count);
|
|
}
|
|
|
|
static void rh_start_recovery(struct region_hash *rh)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < MAX_RECOVERY; i++)
|
|
up(&rh->recovery_count);
|
|
|
|
wake();
|
|
}
|
|
|
|
/*-----------------------------------------------------------------
|
|
* Mirror set structures.
|
|
*---------------------------------------------------------------*/
|
|
struct mirror {
|
|
atomic_t error_count;
|
|
struct dm_dev *dev;
|
|
sector_t offset;
|
|
};
|
|
|
|
struct mirror_set {
|
|
struct dm_target *ti;
|
|
struct list_head list;
|
|
struct region_hash rh;
|
|
struct kcopyd_client *kcopyd_client;
|
|
|
|
spinlock_t lock; /* protects the next two lists */
|
|
struct bio_list reads;
|
|
struct bio_list writes;
|
|
|
|
/* recovery */
|
|
region_t nr_regions;
|
|
int in_sync;
|
|
|
|
unsigned int nr_mirrors;
|
|
struct mirror mirror[0];
|
|
};
|
|
|
|
/*
|
|
* Every mirror should look like this one.
|
|
*/
|
|
#define DEFAULT_MIRROR 0
|
|
|
|
/*
|
|
* This is yucky. We squirrel the mirror_set struct away inside
|
|
* bi_next for write buffers. This is safe since the bio
|
|
* doesn't get submitted to the lower levels of block layer.
|
|
*/
|
|
static struct mirror_set *bio_get_ms(struct bio *bio)
|
|
{
|
|
return (struct mirror_set *) bio->bi_next;
|
|
}
|
|
|
|
static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
|
|
{
|
|
bio->bi_next = (struct bio *) ms;
|
|
}
|
|
|
|
/*-----------------------------------------------------------------
|
|
* Recovery.
|
|
*
|
|
* When a mirror is first activated we may find that some regions
|
|
* are in the no-sync state. We have to recover these by
|
|
* recopying from the default mirror to all the others.
|
|
*---------------------------------------------------------------*/
|
|
/*
 * kcopyd completion callback for a region copy started by recover().
 *
 * @read_err / @write_err are non-zero on failure, while rh_recovery_end()
 * takes a "success" flag, so the combined error state is negated before
 * it is passed on.
 */
static void recovery_complete(int read_err, unsigned int write_err,
			      void *context)
{
	struct region *reg = (struct region *) context;

	/* FIXME: better error handling */
	rh_recovery_end(reg, !(read_err || write_err));
}
|
|
|
|
static int recover(struct mirror_set *ms, struct region *reg)
|
|
{
|
|
int r;
|
|
unsigned int i;
|
|
struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
|
|
struct mirror *m;
|
|
unsigned long flags = 0;
|
|
|
|
/* fill in the source */
|
|
m = ms->mirror + DEFAULT_MIRROR;
|
|
from.bdev = m->dev->bdev;
|
|
from.sector = m->offset + region_to_sector(reg->rh, reg->key);
|
|
if (reg->key == (ms->nr_regions - 1)) {
|
|
/*
|
|
* The final region may be smaller than
|
|
* region_size.
|
|
*/
|
|
from.count = ms->ti->len & (reg->rh->region_size - 1);
|
|
if (!from.count)
|
|
from.count = reg->rh->region_size;
|
|
} else
|
|
from.count = reg->rh->region_size;
|
|
|
|
/* fill in the destinations */
|
|
for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
|
|
if (i == DEFAULT_MIRROR)
|
|
continue;
|
|
|
|
m = ms->mirror + i;
|
|
dest->bdev = m->dev->bdev;
|
|
dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
|
|
dest->count = from.count;
|
|
dest++;
|
|
}
|
|
|
|
/* hand to kcopyd */
|
|
set_bit(KCOPYD_IGNORE_ERROR, &flags);
|
|
r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
|
|
recovery_complete, reg);
|
|
|
|
return r;
|
|
}
|
|
|
|
static void do_recovery(struct mirror_set *ms)
|
|
{
|
|
int r;
|
|
struct region *reg;
|
|
struct dirty_log *log = ms->rh.log;
|
|
|
|
/*
|
|
* Start quiescing some regions.
|
|
*/
|
|
rh_recovery_prepare(&ms->rh);
|
|
|
|
/*
|
|
* Copy any already quiesced regions.
|
|
*/
|
|
while ((reg = rh_recovery_start(&ms->rh))) {
|
|
r = recover(ms, reg);
|
|
if (r)
|
|
rh_recovery_end(reg, 0);
|
|
}
|
|
|
|
/*
|
|
* Update the in sync flag.
|
|
*/
|
|
if (!ms->in_sync &&
|
|
(log->type->get_sync_count(log) == ms->nr_regions)) {
|
|
/* the sync is complete */
|
|
dm_table_event(ms->ti->table);
|
|
ms->in_sync = 1;
|
|
}
|
|
}
|
|
|
|
/*-----------------------------------------------------------------
|
|
* Reads
|
|
*---------------------------------------------------------------*/
|
|
/* Pick the mirror to read @sector from; currently always the default one. */
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{
	/* FIXME: add read balancing */
	return &ms->mirror[DEFAULT_MIRROR];
}
|
|
|
|
/*
|
|
* remap a buffer to a particular mirror.
|
|
*/
|
|
static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
|
|
{
|
|
bio->bi_bdev = m->dev->bdev;
|
|
bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
|
|
}
|
|
|
|
static void do_reads(struct mirror_set *ms, struct bio_list *reads)
|
|
{
|
|
region_t region;
|
|
struct bio *bio;
|
|
struct mirror *m;
|
|
|
|
while ((bio = bio_list_pop(reads))) {
|
|
region = bio_to_region(&ms->rh, bio);
|
|
|
|
/*
|
|
* We can only read balance if the region is in sync.
|
|
*/
|
|
if (rh_in_sync(&ms->rh, region, 0))
|
|
m = choose_mirror(ms, bio->bi_sector);
|
|
else
|
|
m = ms->mirror + DEFAULT_MIRROR;
|
|
|
|
map_bio(ms, m, bio);
|
|
generic_make_request(bio);
|
|
}
|
|
}
|
|
|
|
/*-----------------------------------------------------------------
|
|
* Writes.
|
|
*
|
|
* We do different things with the write io depending on the
|
|
* state of the region that it's in:
|
|
*
|
|
* SYNC: increment pending, use kcopyd to write to *all* mirrors
|
|
* RECOVERING: delay the io until recovery completes
|
|
* NOSYNC: increment pending, just write to the default mirror
|
|
*---------------------------------------------------------------*/
|
|
static void write_callback(unsigned long error, void *context)
|
|
{
|
|
unsigned int i;
|
|
int uptodate = 1;
|
|
struct bio *bio = (struct bio *) context;
|
|
struct mirror_set *ms;
|
|
|
|
ms = bio_get_ms(bio);
|
|
bio_set_ms(bio, NULL);
|
|
|
|
/*
|
|
* NOTE: We don't decrement the pending count here,
|
|
* instead it is done by the targets endio function.
|
|
* This way we handle both writes to SYNC and NOSYNC
|
|
* regions with the same code.
|
|
*/
|
|
|
|
if (error) {
|
|
/*
|
|
* only error the io if all mirrors failed.
|
|
* FIXME: bogus
|
|
*/
|
|
uptodate = 0;
|
|
for (i = 0; i < ms->nr_mirrors; i++)
|
|
if (!test_bit(i, &error)) {
|
|
uptodate = 1;
|
|
break;
|
|
}
|
|
}
|
|
bio_endio(bio, bio->bi_size, 0);
|
|
}
|
|
|
|
static void do_write(struct mirror_set *ms, struct bio *bio)
|
|
{
|
|
unsigned int i;
|
|
struct io_region io[KCOPYD_MAX_REGIONS+1];
|
|
struct mirror *m;
|
|
|
|
for (i = 0; i < ms->nr_mirrors; i++) {
|
|
m = ms->mirror + i;
|
|
|
|
io[i].bdev = m->dev->bdev;
|
|
io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
|
|
io[i].count = bio->bi_size >> 9;
|
|
}
|
|
|
|
bio_set_ms(bio, ms);
|
|
dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
|
|
bio->bi_io_vec + bio->bi_idx,
|
|
write_callback, bio);
|
|
}
|
|
|
|
static void do_writes(struct mirror_set *ms, struct bio_list *writes)
|
|
{
|
|
int state;
|
|
struct bio *bio;
|
|
struct bio_list sync, nosync, recover, *this_list = NULL;
|
|
|
|
if (!writes->head)
|
|
return;
|
|
|
|
/*
|
|
* Classify each write.
|
|
*/
|
|
bio_list_init(&sync);
|
|
bio_list_init(&nosync);
|
|
bio_list_init(&recover);
|
|
|
|
while ((bio = bio_list_pop(writes))) {
|
|
state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
|
|
switch (state) {
|
|
case RH_CLEAN:
|
|
case RH_DIRTY:
|
|
this_list = &sync;
|
|
break;
|
|
|
|
case RH_NOSYNC:
|
|
this_list = &nosync;
|
|
break;
|
|
|
|
case RH_RECOVERING:
|
|
this_list = &recover;
|
|
break;
|
|
}
|
|
|
|
bio_list_add(this_list, bio);
|
|
}
|
|
|
|
/*
|
|
* Increment the pending counts for any regions that will
|
|
* be written to (writes to recover regions are going to
|
|
* be delayed).
|
|
*/
|
|
rh_inc_pending(&ms->rh, &sync);
|
|
rh_inc_pending(&ms->rh, &nosync);
|
|
rh_flush(&ms->rh);
|
|
|
|
/*
|
|
* Dispatch io.
|
|
*/
|
|
while ((bio = bio_list_pop(&sync)))
|
|
do_write(ms, bio);
|
|
|
|
while ((bio = bio_list_pop(&recover)))
|
|
rh_delay(&ms->rh, bio);
|
|
|
|
while ((bio = bio_list_pop(&nosync))) {
|
|
map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
|
|
generic_make_request(bio);
|
|
}
|
|
}
|
|
|
|
/*-----------------------------------------------------------------
|
|
* kmirrord
|
|
*---------------------------------------------------------------*/
|
|
static LIST_HEAD(_mirror_sets);
|
|
static DECLARE_RWSEM(_mirror_sets_lock);
|
|
|
|
static void do_mirror(struct mirror_set *ms)
|
|
{
|
|
struct bio_list reads, writes;
|
|
|
|
spin_lock(&ms->lock);
|
|
reads = ms->reads;
|
|
writes = ms->writes;
|
|
bio_list_init(&ms->reads);
|
|
bio_list_init(&ms->writes);
|
|
spin_unlock(&ms->lock);
|
|
|
|
rh_update_states(&ms->rh);
|
|
do_recovery(ms);
|
|
do_reads(ms, &reads);
|
|
do_writes(ms, &writes);
|
|
}
|
|
|
|
static void do_work(void *ignored)
|
|
{
|
|
struct mirror_set *ms;
|
|
|
|
down_read(&_mirror_sets_lock);
|
|
list_for_each_entry (ms, &_mirror_sets, list)
|
|
do_mirror(ms);
|
|
up_read(&_mirror_sets_lock);
|
|
}
|
|
|
|
/*-----------------------------------------------------------------
|
|
* Target functions
|
|
*---------------------------------------------------------------*/
|
|
static struct mirror_set *alloc_context(unsigned int nr_mirrors,
|
|
uint32_t region_size,
|
|
struct dm_target *ti,
|
|
struct dirty_log *dl)
|
|
{
|
|
size_t len;
|
|
struct mirror_set *ms = NULL;
|
|
|
|
if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
|
|
return NULL;
|
|
|
|
len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
|
|
|
|
ms = kmalloc(len, GFP_KERNEL);
|
|
if (!ms) {
|
|
ti->error = "dm-mirror: Cannot allocate mirror context";
|
|
return NULL;
|
|
}
|
|
|
|
memset(ms, 0, len);
|
|
spin_lock_init(&ms->lock);
|
|
|
|
ms->ti = ti;
|
|
ms->nr_mirrors = nr_mirrors;
|
|
ms->nr_regions = dm_sector_div_up(ti->len, region_size);
|
|
ms->in_sync = 0;
|
|
|
|
if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
|
|
ti->error = "dm-mirror: Error creating dirty region hash";
|
|
kfree(ms);
|
|
return NULL;
|
|
}
|
|
|
|
return ms;
|
|
}
|
|
|
|
static void free_context(struct mirror_set *ms, struct dm_target *ti,
|
|
unsigned int m)
|
|
{
|
|
while (m--)
|
|
dm_put_device(ti, ms->mirror[m].dev);
|
|
|
|
rh_exit(&ms->rh);
|
|
kfree(ms);
|
|
}
|
|
|
|
/*
 * Validate a region size given in sectors.
 *
 * Returns 1 when @size is a non-zero power of two, a multiple of the page
 * size and no larger than the target; 0 otherwise.  Zero has to be
 * rejected explicitly: it passes the (size & (size - 1)) test, but would
 * later be used as a divisor and fed to ffs().
 */
static inline int _check_region_size(struct dm_target *ti, uint32_t size)
{
	if (!size)
		return 0;

	return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
		 size > ti->len);
}
|
|
|
|
static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
|
|
unsigned int mirror, char **argv)
|
|
{
|
|
sector_t offset;
|
|
|
|
if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
|
|
ti->error = "dm-mirror: Invalid offset";
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (dm_get_device(ti, argv[0], offset, ti->len,
|
|
dm_table_get_mode(ti->table),
|
|
&ms->mirror[mirror].dev)) {
|
|
ti->error = "dm-mirror: Device lookup failure";
|
|
return -ENXIO;
|
|
}
|
|
|
|
ms->mirror[mirror].offset = offset;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int add_mirror_set(struct mirror_set *ms)
|
|
{
|
|
down_write(&_mirror_sets_lock);
|
|
list_add_tail(&ms->list, &_mirror_sets);
|
|
up_write(&_mirror_sets_lock);
|
|
wake();
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void del_mirror_set(struct mirror_set *ms)
|
|
{
|
|
down_write(&_mirror_sets_lock);
|
|
list_del(&ms->list);
|
|
up_write(&_mirror_sets_lock);
|
|
}
|
|
|
|
/*
|
|
* Create dirty log: log_type #log_params <log_params>
|
|
*/
|
|
static struct dirty_log *create_dirty_log(struct dm_target *ti,
|
|
unsigned int argc, char **argv,
|
|
unsigned int *args_used)
|
|
{
|
|
unsigned int param_count;
|
|
struct dirty_log *dl;
|
|
|
|
if (argc < 2) {
|
|
ti->error = "dm-mirror: Insufficient mirror log arguments";
|
|
return NULL;
|
|
}
|
|
|
|
if (sscanf(argv[1], "%u", ¶m_count) != 1) {
|
|
ti->error = "dm-mirror: Invalid mirror log argument count";
|
|
return NULL;
|
|
}
|
|
|
|
*args_used = 2 + param_count;
|
|
|
|
if (argc < *args_used) {
|
|
ti->error = "dm-mirror: Insufficient mirror log arguments";
|
|
return NULL;
|
|
}
|
|
|
|
dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
|
|
if (!dl) {
|
|
ti->error = "dm-mirror: Error creating mirror dirty log";
|
|
return NULL;
|
|
}
|
|
|
|
if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
|
|
ti->error = "dm-mirror: Invalid region size";
|
|
dm_destroy_dirty_log(dl);
|
|
return NULL;
|
|
}
|
|
|
|
return dl;
|
|
}
|
|
|
|
/*
|
|
* Construct a mirror mapping:
|
|
*
|
|
* log_type #log_params <log_params>
|
|
* #mirrors [mirror_path offset]{2,}
|
|
*
|
|
* log_type is "core" or "disk"
|
|
* #log_params is between 1 and 3
|
|
*/
|
|
#define DM_IO_PAGES 64
|
|
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
{
|
|
int r;
|
|
unsigned int nr_mirrors, m, args_used;
|
|
struct mirror_set *ms;
|
|
struct dirty_log *dl;
|
|
|
|
dl = create_dirty_log(ti, argc, argv, &args_used);
|
|
if (!dl)
|
|
return -EINVAL;
|
|
|
|
argv += args_used;
|
|
argc -= args_used;
|
|
|
|
if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
|
|
nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
|
|
ti->error = "dm-mirror: Invalid number of mirrors";
|
|
dm_destroy_dirty_log(dl);
|
|
return -EINVAL;
|
|
}
|
|
|
|
argv++, argc--;
|
|
|
|
if (argc != nr_mirrors * 2) {
|
|
ti->error = "dm-mirror: Wrong number of mirror arguments";
|
|
dm_destroy_dirty_log(dl);
|
|
return -EINVAL;
|
|
}
|
|
|
|
ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
|
|
if (!ms) {
|
|
dm_destroy_dirty_log(dl);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
/* Get the mirror parameter sets */
|
|
for (m = 0; m < nr_mirrors; m++) {
|
|
r = get_mirror(ms, ti, m, argv);
|
|
if (r) {
|
|
free_context(ms, ti, m);
|
|
return r;
|
|
}
|
|
argv += 2;
|
|
argc -= 2;
|
|
}
|
|
|
|
ti->private = ms;
|
|
ti->split_io = ms->rh.region_size;
|
|
|
|
r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
|
|
if (r) {
|
|
free_context(ms, ti, ms->nr_mirrors);
|
|
return r;
|
|
}
|
|
|
|
add_mirror_set(ms);
|
|
return 0;
|
|
}
|
|
|
|
static void mirror_dtr(struct dm_target *ti)
|
|
{
|
|
struct mirror_set *ms = (struct mirror_set *) ti->private;
|
|
|
|
del_mirror_set(ms);
|
|
kcopyd_client_destroy(ms->kcopyd_client);
|
|
free_context(ms, ti, ms->nr_mirrors);
|
|
}
|
|
|
|
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
|
|
{
|
|
int should_wake = 0;
|
|
struct bio_list *bl;
|
|
|
|
bl = (rw == WRITE) ? &ms->writes : &ms->reads;
|
|
spin_lock(&ms->lock);
|
|
should_wake = !(bl->head);
|
|
bio_list_add(bl, bio);
|
|
spin_unlock(&ms->lock);
|
|
|
|
if (should_wake)
|
|
wake();
|
|
}
|
|
|
|
/*
|
|
* Mirror mapping function
|
|
*/
|
|
static int mirror_map(struct dm_target *ti, struct bio *bio,
|
|
union map_info *map_context)
|
|
{
|
|
int r, rw = bio_rw(bio);
|
|
struct mirror *m;
|
|
struct mirror_set *ms = ti->private;
|
|
|
|
map_context->ll = bio->bi_sector >> ms->rh.region_shift;
|
|
|
|
if (rw == WRITE) {
|
|
queue_bio(ms, bio, rw);
|
|
return 0;
|
|
}
|
|
|
|
r = ms->rh.log->type->in_sync(ms->rh.log,
|
|
bio_to_region(&ms->rh, bio), 0);
|
|
if (r < 0 && r != -EWOULDBLOCK)
|
|
return r;
|
|
|
|
if (r == -EWOULDBLOCK) /* FIXME: ugly */
|
|
r = 0;
|
|
|
|
/*
|
|
* We don't want to fast track a recovery just for a read
|
|
* ahead. So we just let it silently fail.
|
|
* FIXME: get rid of this.
|
|
*/
|
|
if (!r && rw == READA)
|
|
return -EIO;
|
|
|
|
if (!r) {
|
|
/* Pass this io over to the daemon */
|
|
queue_bio(ms, bio, rw);
|
|
return 0;
|
|
}
|
|
|
|
m = choose_mirror(ms, bio->bi_sector);
|
|
if (!m)
|
|
return -EIO;
|
|
|
|
map_bio(ms, m, bio);
|
|
return 1;
|
|
}
|
|
|
|
static int mirror_end_io(struct dm_target *ti, struct bio *bio,
|
|
int error, union map_info *map_context)
|
|
{
|
|
int rw = bio_rw(bio);
|
|
struct mirror_set *ms = (struct mirror_set *) ti->private;
|
|
region_t region = map_context->ll;
|
|
|
|
/*
|
|
* We need to dec pending if this was a write.
|
|
*/
|
|
if (rw == WRITE)
|
|
rh_dec(&ms->rh, region);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void mirror_postsuspend(struct dm_target *ti)
|
|
{
|
|
struct mirror_set *ms = (struct mirror_set *) ti->private;
|
|
struct dirty_log *log = ms->rh.log;
|
|
|
|
rh_stop_recovery(&ms->rh);
|
|
if (log->type->suspend && log->type->suspend(log))
|
|
/* FIXME: need better error handling */
|
|
DMWARN("log suspend failed");
|
|
}
|
|
|
|
static void mirror_resume(struct dm_target *ti)
|
|
{
|
|
struct mirror_set *ms = (struct mirror_set *) ti->private;
|
|
struct dirty_log *log = ms->rh.log;
|
|
if (log->type->resume && log->type->resume(log))
|
|
/* FIXME: need better error handling */
|
|
DMWARN("log resume failed");
|
|
rh_start_recovery(&ms->rh);
|
|
}
|
|
|
|
/*
 * Status hook.  The dirty log writes its part first; the mirror part is
 * then appended after it:
 *   INFO:  mirror count, device names, synced/total region counts
 *   TABLE: mirror count, then "device offset" per mirror
 * Always returns 0.
 */
static int mirror_status(struct dm_target *ti, status_type_t type,
			 char *result, unsigned int maxlen)
{
	unsigned int m, sz;
	struct mirror_set *ms = ti->private;
	struct dirty_log *log = ms->rh.log;

	/* sz is the running output length; DMEMIT appends at that offset. */
	sz = log->type->status(log, type, result, maxlen);

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%d ", ms->nr_mirrors);
		for (m = 0; m < ms->nr_mirrors; m++) {
			DMEMIT("%s ", ms->mirror[m].dev->name);
		}

		DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
		       log->type->get_sync_count(log),
		       ms->nr_regions);
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%d ", ms->nr_mirrors);
		for (m = 0; m < ms->nr_mirrors; m++) {
			DMEMIT("%s " SECTOR_FORMAT " ",
			       ms->mirror[m].dev->name, ms->mirror[m].offset);
		}
	}

	return 0;
}
|
|
|
|
static struct target_type mirror_target = {
|
|
.name = "mirror",
|
|
.version = {1, 0, 1},
|
|
.module = THIS_MODULE,
|
|
.ctr = mirror_ctr,
|
|
.dtr = mirror_dtr,
|
|
.map = mirror_map,
|
|
.end_io = mirror_end_io,
|
|
.postsuspend = mirror_postsuspend,
|
|
.resume = mirror_resume,
|
|
.status = mirror_status,
|
|
};
|
|
|
|
/*
 * Module init: set up the dirty log code, create the kmirrord workqueue
 * and register the mirror target.
 *
 * Returns 0 on success or a negative errno.  Everything set up so far is
 * undone on failure.
 */
static int __init dm_mirror_init(void)
{
	int r;

	r = dm_dirty_log_init();
	if (r)
		return r;

	_kmirrord_wq = create_singlethread_workqueue("kmirrord");
	if (!_kmirrord_wq) {
		DMERR("couldn't start kmirrord");
		dm_dirty_log_exit();
		/* r is still 0 here; returning it would report success. */
		return -ENOMEM;
	}
	INIT_WORK(&_kmirrord_work, do_work, NULL);

	r = dm_register_target(&mirror_target);
	if (r < 0) {
		DMERR("%s: Failed to register mirror target",
		      mirror_target.name);
		dm_dirty_log_exit();
		destroy_workqueue(_kmirrord_wq);
	}

	return r;
}
|
|
|
|
/*
 * Module exit: unregister the target (logging a failure), then destroy
 * the kmirrord workqueue and shut down the dirty log code.
 */
static void __exit dm_mirror_exit(void)
{
	int r = dm_unregister_target(&mirror_target);

	if (r < 0)
		DMERR("%s: unregister failed %d", mirror_target.name, r);

	destroy_workqueue(_kmirrord_wq);
	dm_dirty_log_exit();
}
|
|
|
|
/* Module hooks */
|
|
module_init(dm_mirror_init);
|
|
module_exit(dm_mirror_exit);
|
|
|
|
MODULE_DESCRIPTION(DM_NAME " mirror target");
|
|
MODULE_AUTHOR("Joe Thornber");
|
|
MODULE_LICENSE("GPL");
|