bcachefs: Move extent overwrite handling out of core btree code

Ever since the btree code was first written, overwriting existing
extents - including partially overwriting and splitting existing
extents - has been handled as part of the core btree insert path. The modern
transaction and iterator infrastructure didn't exist then, so that was
the only way for it to be done.

This patch moves that handling out of the core btree code, into a pass
that runs at transaction commit time.
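
To make the new flow concrete, here is a toy, self-contained sketch (not
bcachefs code - struct extent, handle_overwrite() and main() are made up
for illustration; only the shape of the logic follows the
extent_handle_overwrites() and extent_update_to_keys() helpers added in
the diff below): an incoming extent first queues ordinary keyed updates
that trim or whiteout whatever it overlaps, and is then itself queued as
a plain key at its end position.

#include <stdio.h>

/*
 * Toy model of the commit-time extent pass.  A sketch only: struct
 * extent, handle_overwrite() and main() are invented here; the real
 * code operates on bkeys via btree iterators.
 */
struct extent { unsigned long long start, end; };	/* [start, end) */

static void handle_overwrite(struct extent old, struct extent ins)
{
	if (old.end <= ins.start || ins.end <= old.start)
		return;				/* no overlap */

	if (old.start < ins.start)		/* front of the old extent survives */
		printf("update: front piece [%llu,%llu)\n", old.start, ins.start);

	if (old.end > ins.end)			/* back of the old extent survives */
		printf("update: back piece  [%llu,%llu)\n", ins.end, old.end);
	else					/* old key overwritten up to its end: whiteout it */
		printf("update: whiteout at pos %llu\n", old.end);
}

int main(void)
{
	struct extent existing[] = { { 0, 8 }, { 8, 16 }, { 16, 32 } };
	struct extent insert = { 4, 20 };
	unsigned i;

	/* pass 1: ordinary keyed updates for everything we overwrite */
	for (i = 0; i < sizeof(existing) / sizeof(existing[0]); i++)
		handle_overwrite(existing[i], insert);

	/* pass 2: the extent itself becomes a plain key at its end position */
	printf("insert: key at pos %llu, size %llu\n",
	       insert.end, insert.end - insert.start);
	return 0;
}

With the inputs above, inserting [4,20) over {[0,8), [8,16), [16,32)}
produces a front piece [0,4), whiteouts at positions 8 and 16, a back
piece [20,32), and finally the new key at position 20 - all ordinary,
non-extent updates by the time they reach the btree insert path.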

This is a significant simplification of the btree code and an overall
reduction in code size; more importantly, it gets us much closer to
making the core btree code completely independent of extents, and is
important prep work for snapshots.

This introduces a new feature bit; the old and new extent update models
are incompatible when the filesystem needs journal replay.
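
Concretely, the gate is enforced at recovery time (a minimal,
self-contained sketch under assumed simplified types - the real check
lives in the bch2_fs_recovery() hunk near the end of this diff, and the
bit number 12 comes from the new x(extents_above_btree_updates, 12)
entry):

#include <stdbool.h>
#include <stdint.h>

#define BCH_FEATURE_extents_above_btree_updates	12

/*
 * Sketch only - can_replay_journal() is a made-up helper: replay is
 * refused unless the filesystem was shut down cleanly or was already
 * written with the new extent update model.
 */
static bool can_replay_journal(bool sb_clean, uint64_t sb_features)
{
	return sb_clean ||
	       (sb_features & (1ULL << BCH_FEATURE_extents_above_btree_updates));
}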

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2019-12-30 14:37:25 -05:00 committed by Kent Overstreet
parent 57b0b3db47
commit e3e464ac6d
15 changed files with 406 additions and 604 deletions

View File

@ -1315,12 +1315,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(inline_data, 8) \
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
x(btree_ptr_v2, 11)
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2))
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,

View File

@ -186,8 +186,16 @@ fsck_err:
return ret;
}
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
u8 *max_stale, bool initial)
static bool pos_in_journal_keys(struct journal_keys *journal_keys,
enum btree_id id, struct bpos pos)
{
struct journal_key *k = journal_key_search(journal_keys, id, pos);
return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
}
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
struct journal_keys *journal_keys, bool initial)
{
struct btree_node_iter iter;
struct bkey unpacked;
@ -201,6 +209,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
for_each_btree_node_key_unpack(b, k, &iter,
&unpacked) {
if (!b->c.level && journal_keys &&
pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p))
continue;
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, max_stale, initial);
@ -212,6 +224,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
struct btree_trans trans;
@ -239,7 +252,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree_node(b));
ret = btree_gc_mark_node(c, b, &max_stale, initial);
ret = btree_gc_mark_node(c, b, &max_stale,
journal_keys, initial);
if (ret)
break;
@ -281,36 +295,6 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
static int mark_journal_key(struct bch_fs *c, enum btree_id id,
struct bkey_i *insert)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u8 max_stale;
int ret = 0;
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
if (ret)
return ret;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
BTREE_ITER_SLOTS, k, ret) {
percpu_down_read(&c->mark_lock);
ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
BTREE_TRIGGER_GC|
BTREE_TRIGGER_NOATOMIC);
percpu_up_read(&c->mark_lock);
if (!ret)
break;
}
return bch2_trans_exit(&trans) ?: ret;
}
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
@ -325,18 +309,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
enum btree_id id = ids[i];
enum btree_node_type type = __btree_node_type(0, id);
int ret = bch2_gc_btree(c, id, initial, metadata_only);
int ret = bch2_gc_btree(c, id, journal_keys,
initial, metadata_only);
if (ret)
return ret;
if (journal_keys && !metadata_only &&
btree_node_type_needs_gc(type)) {
struct journal_key *j;
u8 max_stale;
int ret;
for_each_journal_key(*journal_keys, j)
if (j->btree_id == id) {
ret = mark_journal_key(c, id, j->k);
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
&max_stale, initial);
if (ret)
return ret;
}

View File

@ -708,9 +708,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
unsigned *whiteout_u64s, int write,
bool have_retry)
{
struct bkey_packed *k;
struct bkey prev = KEY(0, 0, 0);
struct bpos prev_data = POS_MIN;
struct bkey_packed *k, *prev = NULL;
bool seen_non_whiteout = false;
unsigned version;
const char *err;
@ -852,15 +850,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!seen_non_whiteout &&
(!bkey_whiteout(k) ||
(bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) {
(prev && bkey_iter_cmp(b, prev, k) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
} else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
bkey_cmp(prev.p, u.k->p) > 0) {
} else if (prev && bkey_iter_cmp(b, prev, k) > 0) {
char buf1[80];
char buf2[80];
struct bkey up = bkey_unpack_key(b, prev);
bch2_bkey_to_text(&PBUF(buf1), &prev);
bch2_bkey_to_text(&PBUF(buf1), &up);
bch2_bkey_to_text(&PBUF(buf2), u.k);
bch2_dump_bset(b, i, 0);
@ -870,10 +868,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
/* XXX: repair this */
}
if (!bkey_deleted(u.k))
prev_data = u.k->p;
prev = *u.k;
prev = k;
k = bkey_next_skip_noops(k, vstruct_last(i));
}

View File

@ -1504,12 +1504,12 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if ((cmp_int(iter->btree_id, i->iter->btree_id) ?:
bkey_cmp(pos, i->k->k.p)) <= 0)
break;
return i < trans->updates + trans->nr_updates &&
return i < trans->updates2 + trans->nr_updates2 &&
iter->btree_id == i->iter->btree_id
? bkey_i_to_s_c(i->k)
: bkey_s_c_null;
@ -1821,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans,
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
void *new_iters, *new_updates;
void *p, *new_iters, *new_updates, *new_updates2;
size_t iters_bytes;
size_t updates_bytes;
@ -1839,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
iters_bytes = sizeof(struct btree_iter) * new_size;
updates_bytes = sizeof(struct btree_insert_entry) * new_size;
new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS);
if (new_iters)
p = kmalloc(iters_bytes +
updates_bytes +
updates_bytes, GFP_NOFS);
if (p)
goto success;
new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
new_size = BTREE_ITER_MAX;
trans->used_mempool = true;
success:
new_updates = new_iters + iters_bytes;
new_iters = p; p += iters_bytes;
new_updates = p; p += updates_bytes;
new_updates2 = p; p += updates_bytes;
memcpy(new_iters, trans->iters,
sizeof(struct btree_iter) * trans->nr_iters);
memcpy(new_updates, trans->updates,
sizeof(struct btree_insert_entry) * trans->nr_updates);
memcpy(new_updates2, trans->updates2,
sizeof(struct btree_insert_entry) * trans->nr_updates2);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
memset(trans->iters, POISON_FREE,
@ -1865,6 +1871,7 @@ success:
trans->iters = new_iters;
trans->updates = new_updates;
trans->updates2 = new_updates2;
trans->size = new_size;
if (trans->iters_live) {
@ -2126,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->need_reset = 0;
trans->nr_updates = 0;
trans->nr_updates2 = 0;
trans->mem_top = 0;
if (trans->fs_usage_deltas) {
@ -2157,6 +2165,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
trans->size = ARRAY_SIZE(trans->iters_onstack);
trans->iters = trans->iters_onstack;
trans->updates = trans->updates_onstack;
trans->updates2 = trans->updates2_onstack;
trans->fs_usage_deltas = NULL;
if (expected_nr_iters > trans->size)
@ -2194,5 +2203,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
sizeof(u8) * nr);
sizeof(struct btree_insert_entry) * nr);
}

View File

@ -283,6 +283,7 @@ struct btree_trans {
u8 nr_iters;
u8 nr_updates;
u8 nr_updates2;
u8 size;
unsigned used_mempool:1;
unsigned error:1;
@ -295,6 +296,7 @@ struct btree_trans {
struct btree_iter *iters;
struct btree_insert_entry *updates;
struct btree_insert_entry *updates2;
/* update path: */
struct journal_res journal_res;
@ -308,6 +310,7 @@ struct btree_trans {
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[2];
struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \

View File

@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
#define trans_for_each_update2(_trans, _i) \
for ((_i) = (_trans)->updates2; \
(_i) < (_trans)->updates2 + (_trans)->nr_updates2; \
(_i)++)
#endif /* _BCACHEFS_BTREE_UPDATE_H */

View File

@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
}
static inline void push_whiteout(struct bch_fs *c, struct btree *b,
struct bkey_packed *k)
struct bpos pos)
{
unsigned u64s = bkeyp_key_u64s(&b->format, k);
struct bkey_packed *dst;
struct bkey_packed k;
BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b));
BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
b->whiteout_u64s += bkeyp_key_u64s(&b->format, k);
dst = unwritten_whiteouts_start(c, b);
memcpy_u64s(dst, k, u64s);
dst->u64s = u64s;
dst->type = KEY_TYPE_deleted;
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey *u = (void *) &k;
bkey_init(u);
u->p = pos;
}
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
bkey_copy(unwritten_whiteouts_start(c, b), &k);
}
/*

View File

@ -23,11 +23,10 @@
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i != trans->updates &&
return i != trans->updates2 &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(iter->trans->c, b));
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_packed(b, k, &insert->k))
@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
push_whiteout(iter->trans->c, b, k);
push_whiteout(iter->trans->c, b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans,
set_btree_node_dirty(b);
}
static void bch2_insert_fixup_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
EBUG_ON(iter->level);
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, l->b));
if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert)))
bch2_btree_journal_key(trans, iter, insert);
}
/**
* btree_insert_key - insert a key one key into a leaf node
*/
@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
EBUG_ON(iter->level);
insert->k.needs_whiteout = false;
if (!btree_node_is_extents(b))
bch2_insert_fixup_key(trans, iter, insert);
else
bch2_insert_fixup_extent(trans, iter, insert);
if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
bch2_btree_journal_key(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct bch_fs *c = trans->c;
BUG_ON(iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0);
BUG_ON(bkey_cmp(insert->k.p, iter->pos));
BUG_ON(debug_check_bkeys(c) &&
!bkey_deleted(&insert->k) &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
}
@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans,
if (unlikely(btree_node_fake(b)))
return BTREE_INSERT_BTREE_NODE_FULL;
/*
* old bch2_extent_sort_fix_overlapping() algorithm won't work with new
* style extent updates:
*/
if (unlikely(btree_node_old_extent_overwrite(b)))
return BTREE_INSERT_BTREE_NODE_FULL;
ret = !btree_node_is_extents(b)
? BTREE_INSERT_OK
: bch2_extent_can_insert(trans, iter, insert, u64s);
: bch2_extent_can_insert(trans, iter, insert);
if (ret)
return ret;
@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
trans_for_each_update(trans, i) {
trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
else if (inject_invalid_keys(c))
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
BUG_ON(!btree_node_intent_locked(i->iter, 0));
ret = bch2_journal_preres_get(&trans->c->journal,
@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
}
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
btree_insert_entry_checks(trans, i->iter, i->k);
bch2_btree_trans_verify_locks(trans);
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
i->iter->l[0].b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
i->iter);
@ -525,14 +516,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
trans->nounlock = false;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
bch2_btree_iter_downgrade(i->iter);
return 0;
@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
static void bch2_trans_update2(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_insert_entry *i, n = (struct btree_insert_entry) {
.iter = iter, .k = insert
};
btree_insert_entry_checks(trans, n.iter, n.k);
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans_for_each_update2(trans, i) {
if (btree_iter_cmp(n.iter, i->iter) == 0) {
*i = n;
return;
}
if (btree_iter_cmp(n.iter, i->iter) <= 0)
break;
}
array_insert_item(trans->updates2, trans->nr_updates2,
i - trans->updates2, n);
}
static int extent_update_to_keys(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert)
{
struct btree_iter *iter;
if (bkey_deleted(&insert->k))
return 0;
iter = bch2_trans_copy_iter(trans, orig_iter);
if (IS_ERR(iter))
return PTR_ERR(iter);
iter->flags |= BTREE_ITER_INTENT;
__bch2_btree_iter_set_pos(iter, insert->k.p, false);
bch2_trans_update2(trans, iter, insert);
bch2_trans_iter_put(trans, iter);
return 0;
}
static int extent_handle_overwrites(struct btree_trans *trans,
enum btree_id btree_id,
struct bpos start, struct bpos end)
{
struct btree_iter *iter = NULL, *update_iter;
struct bkey_i *update;
struct bkey_s_c k;
int ret = 0;
iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
ret = PTR_ERR_OR_ZERO(iter);
if (ret)
return ret;
k = bch2_btree_iter_peek_with_updates(iter);
while (k.k && !(ret = bkey_err(k))) {
if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0)
break;
if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
update_iter = bch2_trans_copy_iter(trans, iter);
if ((ret = PTR_ERR_OR_ZERO(update_iter)))
goto err;
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
bkey_reassemble(update, k);
bch2_cut_back(start, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
}
if (bkey_cmp(k.k->p, end) > 0) {
update_iter = bch2_trans_copy_iter(trans, iter);
if ((ret = PTR_ERR_OR_ZERO(update_iter)))
goto err;
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
bkey_reassemble(update, k);
bch2_cut_front(end, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
} else {
update_iter = bch2_trans_copy_iter(trans, iter);
if ((ret = PTR_ERR_OR_ZERO(update_iter)))
goto err;
update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
update->k = *k.k;
set_bkey_val_u64s(&update->k, 0);
update->k.type = KEY_TYPE_deleted;
update->k.size = 0;
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
}
k = bch2_btree_iter_next_with_updates(iter);
}
err:
if (!IS_ERR_OR_NULL(iter))
bch2_trans_iter_put(trans, iter);
return ret;
}
int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
} while (trans_trigger_run);
/* Turn extents updates into keys: */
trans_for_each_update(trans, i)
if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
struct bpos start = bkey_start_pos(&i->k->k);
while (i + 1 < trans->updates + trans->nr_updates &&
i[0].iter->btree_id == i[1].iter->btree_id &&
!bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)))
i++;
ret = extent_handle_overwrites(trans, i->iter->btree_id,
start, i->k->k.p);
if (ret)
goto out;
}
trans_for_each_update(trans, i) {
if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
ret = extent_update_to_keys(trans, i->iter, i->k);
if (ret)
goto out;
} else {
bch2_trans_update2(trans, i->iter, i->k);
}
}
trans_for_each_update2(trans, i) {
BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
BUG_ON(i->iter->locks_want < 1);
u64s = jset_u64s(i->k->k.u64s);
if (0)
trans->journal_preres_u64s += u64s;
@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k)));
EBUG_ON(bkey_cmp(iter->pos,
(iter->flags & BTREE_ITER_IS_EXTENTS)
? bkey_start_pos(&k->k)
: k->k.p));
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;

View File

@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
struct bch_fs_usage *fs_usage,
unsigned flags)
unsigned flags,
bool is_extents)
{
struct bch_fs *c = trans->c;
struct btree *b = iter->l[0].b;
unsigned offset = 0;
s64 sectors = 0;
s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (btree_node_is_extents(b)
if (is_extents
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
return 0;
if (btree_node_is_extents(b)) {
if (is_extents) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
offset = 0;
@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans,
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
ret = bch2_mark_overwrite(trans, iter, k, insert,
fs_usage, flags);
fs_usage, flags,
btree_node_type_is_extents(iter->btree_id));
if (ret <= 0)
break;

View File

@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *,
int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bkey_i *,
struct bch_fs_usage *, unsigned);
struct bch_fs_usage *, unsigned, bool);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);

View File

@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans,
{
int ret = 0;
/*
* The extent update path requires an _additional_ iterator for each
* extent we're inserting and overwriting:
*/
*nr_iters += 1;
switch (k.k->type) {
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert,
unsigned *u64s)
struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *_k;
struct bkey_s_c k;
struct bkey unpacked;
int sectors;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
KEY_TYPE_discard))) {
struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked);
enum bch_extent_overlap overlap =
bch2_extent_overlap(&insert->k, k.k);
_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
KEY_TYPE_discard);
if (!_k)
return BTREE_INSERT_OK;
if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
k = bkey_disassemble(l->b, _k, &unpacked);
/* Check if we're splitting a compressed extent: */
if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
bkey_cmp(insert->k.p, k.k->p) < 0 &&
(sectors = bch2_bkey_sectors_compressed(k))) {
int flags = trans->flags & BTREE_INSERT_NOFAIL
? BCH_DISK_RESERVATION_NOFAIL : 0;
switch (bch2_disk_reservation_add(trans->c, trans->disk_res,
sectors, flags)) {
case 0:
break;
overlap = bch2_extent_overlap(&insert->k, k.k);
/*
* If we're overwriting an existing extent, we may need to emit
* a whiteout - unless we're inserting a new extent at the same
* position:
*/
if (k.k->needs_whiteout &&
(!bkey_whiteout(&insert->k) ||
bkey_cmp(k.k->p, insert->k.p)))
*u64s += BKEY_U64s;
/*
* If we're partially overwriting an existing extent which has
* been written out to disk, we'll need to emit a new version of
* that extent:
*/
if (bkey_written(l->b, _k) &&
overlap != BCH_EXTENT_OVERLAP_ALL)
*u64s += _k->u64s;
/* And we may be splitting an existing extent: */
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
*u64s += _k->u64s;
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bch2_bkey_sectors_compressed(k))) {
int flags = trans->flags & BTREE_INSERT_NOFAIL
? BCH_DISK_RESERVATION_NOFAIL : 0;
switch (bch2_disk_reservation_add(trans->c,
trans->disk_res,
sectors, flags)) {
case 0:
break;
case -ENOSPC:
return BTREE_INSERT_ENOSPC;
default:
BUG();
}
case -ENOSPC:
return BTREE_INSERT_ENOSPC;
default:
BUG();
}
if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
overlap == BCH_EXTENT_OVERLAP_MIDDLE)
break;
bch2_btree_node_iter_advance(&node_iter, l->b);
}
return BTREE_INSERT_OK;
}
static void verify_extent_nonoverlapping(struct bch_fs *c,
struct btree *b,
struct btree_node_iter *_iter,
struct bkey_i *insert)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct btree_node_iter iter;
struct bkey_packed *k;
struct bkey uk;
if (!expensive_debug_checks(c))
return;
iter = *_iter;
k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
BUG_ON(k &&
(uk = bkey_unpack_key(b, k),
bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
iter = *_iter;
k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
#if 0
BUG_ON(k &&
(uk = bkey_unpack_key(b, k),
bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
#else
if (k &&
(uk = bkey_unpack_key(b, k),
bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
char buf1[100];
char buf2[100];
bch2_bkey_to_text(&PBUF(buf1), &insert->k);
bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_dump_btree_node(b);
panic("insert > next :\n"
"insert %s\n"
"next %s\n",
buf1, buf2);
}
#endif
#endif
}
static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct bkey_packed *k =
bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
if (debug_check_bkeys(c))
bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
bch2_bset_insert(l->b, &l->iter, k, insert, 0);
bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
}
static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
struct bpos pos)
{
struct bkey_packed k;
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey_i tmp;
bkey_init(&tmp.k);
tmp.k.p = pos;
bkey_copy(&k, &tmp);
}
k.needs_whiteout = true;
push_whiteout(c, b, &k);
}
static void
extent_drop(struct bch_fs *c, struct btree_iter *iter,
struct bkey_packed *_k, struct bkey_s k)
{
struct btree_iter_level *l = &iter->l[0];
if (!bkey_whiteout(k.k))
btree_account_key_drop(l->b, _k);
k.k->size = 0;
k.k->type = KEY_TYPE_deleted;
if (!btree_node_old_extent_overwrite(l->b) &&
k.k->needs_whiteout) {
pack_push_whiteout(c, l->b, k.k->p);
k.k->needs_whiteout = false;
}
if (_k >= btree_bset_last(l->b)->start) {
unsigned u64s = _k->u64s;
bch2_bset_delete(l->b, _k, _k->u64s);
bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0);
} else {
extent_save(l->b, _k, k.k);
bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
}
static void
extent_squash(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert,
struct bkey_packed *_k, struct bkey_s k,
enum bch_extent_overlap overlap)
{
struct btree_iter_level *l = &iter->l[0];
struct bkey_on_stack tmp, split;
bkey_on_stack_init(&tmp);
bkey_on_stack_init(&split);
if (!btree_node_old_extent_overwrite(l->b)) {
if (!bkey_whiteout(&insert->k) &&
!bkey_cmp(k.k->p, insert->k.p)) {
insert->k.needs_whiteout = k.k->needs_whiteout;
k.k->needs_whiteout = false;
}
} else {
insert->k.needs_whiteout |= k.k->needs_whiteout;
}
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_front(insert->k.p, tmp.k);
/*
* needs_whiteout was propagated to new version of @k,
* @tmp:
*/
if (!btree_node_old_extent_overwrite(l->b))
k.k->needs_whiteout = false;
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
btree_keys_account_val_delta(l->b, _k,
bch2_cut_front_s(insert->k.p, k));
extent_save(l->b, _k, k.k);
/*
* No need to call bset_fix_invalidated_key, start of
* extent changed but extents are indexed by where they
* end
*/
bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
break;
case BCH_EXTENT_OVERLAP_BACK:
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
/*
* @tmp has different position than @k, needs_whiteout
* should not be propagated:
*/
if (!btree_node_old_extent_overwrite(l->b))
tmp.k->k.needs_whiteout = false;
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
/*
* position of @k is changing, emit a whiteout if
* needs_whiteout is set:
*/
if (!btree_node_old_extent_overwrite(l->b) &&
k.k->needs_whiteout) {
pack_push_whiteout(c, l->b, k.k->p);
k.k->needs_whiteout = false;
}
btree_keys_account_val_delta(l->b, _k,
bch2_cut_back_s(bkey_start_pos(&insert->k), k));
extent_save(l->b, _k, k.k);
bch2_bset_fix_invalidated_key(l->b, _k);
bch2_btree_node_iter_fix(iter, l->b, &l->iter,
_k, _k->u64s, _k->u64s);
}
break;
case BCH_EXTENT_OVERLAP_ALL:
extent_drop(c, iter, _k, k);
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
bkey_on_stack_reassemble(&split, c, k.s_c);
bch2_cut_back(bkey_start_pos(&insert->k), split.k);
if (!btree_node_old_extent_overwrite(l->b))
split.k->k.needs_whiteout = false;
/* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_front(insert->k.p, tmp.k);
if (!btree_node_old_extent_overwrite(l->b))
k.k->needs_whiteout = false;
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
btree_keys_account_val_delta(l->b, _k,
bch2_cut_front_s(insert->k.p, k));
extent_save(l->b, _k, k.k);
bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
extent_bset_insert(c, iter, split.k);
break;
}
bkey_on_stack_exit(&split, c);
bkey_on_stack_exit(&tmp, c);
}
/**
* bch_extent_insert_fixup - insert a new extent and deal with overlaps
*
* this may result in not actually doing the insert, or inserting some subset
* of the insert key. For cmpxchg operations this is where that logic lives.
*
* All subsets of @insert that need to be inserted are inserted using
* bch2_btree_insert_and_journal(). If @b or @res fills up, this function
* returns false, setting @iter->pos for the prefix of @insert that actually got
* inserted.
*
* BSET INVARIANTS: this function is responsible for maintaining all the
* invariants for bsets of extents in memory. things get really hairy with 0
* size extents
*
* within one bset:
*
* bkey_start_pos(bkey_next(k)) >= k
* or bkey_start_offset(bkey_next(k)) >= k->offset
*
* i.e. strict ordering, no overlapping extents.
*
* multiple bsets (i.e. full btree node):
*
* ∀ k, j
*   k.size != 0 ∧ j.size != 0 →
*     ¬ (k > bkey_start_pos(j) ∧ k < j)
*
* i.e. no two overlapping keys _of nonzero size_
*
* We can't realistically maintain this invariant for zero size keys because of
* the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
* there may be another 0 size key between them in another bset, and it will
* thus overlap with the merged key.
*
* In addition, the end of iter->pos indicates how much has been processed.
* If the end of iter->pos is not the same as the end of insert, then
* key insertion needs to continue/be retried.
*/
void bch2_insert_fixup_extent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
bool do_update = !bkey_whiteout(&insert->k);
struct bkey_packed *_k;
struct bkey unpacked;
EBUG_ON(iter->level);
EBUG_ON(!insert->k.size);
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
KEY_TYPE_discard))) {
struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
enum bch_extent_overlap overlap =
bch2_extent_overlap(&insert->k, k.k);
if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
break;
if (!bkey_whiteout(k.k))
do_update = true;
if (!do_update) {
struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
bch2_cut_front(cur_end, insert);
bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
} else {
extent_squash(c, iter, insert, _k, k, overlap);
}
node_iter = l->iter;
if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
overlap == BCH_EXTENT_OVERLAP_MIDDLE)
break;
}
l->iter = node_iter;
bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
if (do_update) {
if (insert->k.type == KEY_TYPE_deleted)
insert->k.type = KEY_TYPE_discard;
if (!bkey_whiteout(&insert->k) ||
btree_node_old_extent_overwrite(l->b))
extent_bset_insert(c, iter, insert);
bch2_btree_journal_key(trans, iter, insert);
}
bch2_cut_front(insert->k.p, insert);
}

View File

@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
struct bkey_i *, unsigned *);
void bch2_insert_fixup_extent(struct btree_trans *,
struct btree_iter *,
struct bkey_i *);
struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */

View File

@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
POS(inode_nr + 1, 0), NULL);
}
static int bch2_fix_overlapping_extent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k, struct bpos cut_at)
{
struct btree_iter *u_iter;
struct bkey_i *u;
int ret;
u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(u);
if (ret)
return ret;
bkey_reassemble(u, k);
bch2_cut_front(cut_at, u);
u_iter = bch2_trans_copy_iter(trans, iter);
ret = PTR_ERR_OR_ZERO(u_iter);
if (ret)
return ret;
/*
* We don't want to go through the
* extent_handle_overwrites path:
*/
__bch2_btree_iter_set_pos(u_iter, u->k.p, false);
/*
* XXX: this is going to leave disk space
* accounting slightly wrong
*/
ret = bch2_trans_update(trans, u_iter, u, 0);
bch2_trans_iter_put(trans, u_iter);
return ret;
}
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
* that i_size an i_sectors are consistent
@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey prev = KEY(0, 0, 0);
u64 i_sectors;
int ret = 0;
@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
char buf1[100];
char buf2[100];
bch2_bkey_to_text(&PBUF(buf1), &prev);
bch2_bkey_to_text(&PBUF(buf2), k.k);
if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) {
ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_fix_overlapping_extent(&trans,
iter, k, prev.p));
if (ret)
goto err;
}
}
prev = *k.k;
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;

View File

@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list)
}
}
/*
* When keys compare equal, oldest compares first:
*/
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->pos, r->pos) ?:
bkey_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->pos, r->pos);
}
static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
{
while (i + 1 < keys->d + keys->nr &&
journal_sort_key_cmp(i, i + 1) > 0) {
swap(i[0], i[1]);
i++;
}
bkey_cmp(l->k->k.p, r->k->k.p);
}
static void journal_keys_free(struct journal_keys *keys)
{
struct journal_key *i;
for_each_journal_key(*keys, i)
if (i->allocated)
kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
keys->nr = 0;
@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
struct journal_replay *p;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys keys = { NULL }, keys_deduped = { NULL };
struct journal_key *i;
struct journal_keys keys = { NULL };
struct journal_key *src, *dst;
size_t nr_keys = 0;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j)
nr_keys++;
keys.journal_seq_base = keys_deduped.journal_seq_base =
keys.journal_seq_base =
le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay,
list)->j.seq);
@ -225,96 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (!keys.d)
goto err;
keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
if (!keys_deduped.d)
goto err;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j) {
if (bkey_deleted(&k->k) &&
btree_node_type_is_extents(entry->btree_id))
continue;
for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.pos = bkey_start_pos(&k->k),
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
};
}
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
i = keys.d;
while (i < keys.d + keys.nr) {
if (i + 1 < keys.d + keys.nr &&
i[0].btree_id == i[1].btree_id &&
!bkey_cmp(i[0].pos, i[1].pos)) {
if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
i++;
} else {
bch2_cut_front(i[1].k->k.p, i[0].k);
i[0].pos = i[1].k->k.p;
journal_keys_sift(&keys, i);
}
continue;
}
src = dst = keys.d;
while (src < keys.d + keys.nr) {
while (src + 1 < keys.d + keys.nr &&
src[0].btree_id == src[1].btree_id &&
!bkey_cmp(src[0].k->k.p, src[1].k->k.p))
src++;
if (i + 1 < keys.d + keys.nr &&
i[0].btree_id == i[1].btree_id &&
bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
} else {
struct bkey_i *split =
kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
if (!split)
goto err;
bkey_copy(split, i[0].k);
bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
.btree_id = i[0].btree_id,
.allocated = true,
.pos = bkey_start_pos(&split->k),
.k = split,
.journal_seq = i[0].journal_seq,
.journal_offset = i[0].journal_offset,
};
bch2_cut_front(i[1].k->k.p, i[0].k);
i[0].pos = i[1].k->k.p;
journal_keys_sift(&keys, i);
continue;
}
} else {
if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
i[1] = i[0];
i++;
continue;
} else {
bch2_cut_front(i[0].k->k.p, i[1].k);
i[1].pos = i[0].k->k.p;
journal_keys_sift(&keys, i + 1);
continue;
}
}
}
keys_deduped.d[keys_deduped.nr++] = *i++;
*dst++ = *src++;
}
kvfree(keys.d);
return keys_deduped;
keys.nr = dst - keys.d;
err:
journal_keys_free(&keys_deduped);
kvfree(keys.d);
return (struct journal_keys) { NULL };
return keys;
}
/* journal replay: */
@ -365,11 +289,6 @@ retry:
atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
split_iter = bch2_trans_copy_iter(&trans, iter);
ret = PTR_ERR_OR_ZERO(split_iter);
if (ret)
goto err;
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
@ -388,12 +307,25 @@ retry:
}
bkey_copy(split, k);
bch2_cut_front(split_iter->pos, split);
bch2_cut_front(iter->pos, split);
bch2_cut_back(atomic_end, split);
split_iter = bch2_trans_copy_iter(&trans, iter);
ret = PTR_ERR_OR_ZERO(split_iter);
if (ret)
goto err;
/*
* It's important that we don't go through the
* extent_handle_overwrites() and extent_update_to_keys() path
* here: journal replay is supposed to treat extents like
* regular keys
*/
__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split, !remark
? BTREE_TRIGGER_NORUN
: BTREE_TRIGGER_NOOVERWRITES);
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
@ -424,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
/*
* iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
* extent_handle_overwrites() and extent_update_to_keys() - but we don't
* want that here, journal replay is supposed to treat extents like
* regular keys:
*/
__bch2_btree_iter_set_pos(iter, k->k.p, false);
ret = bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(trans, iter);
@ -459,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c,
if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
else if (btree_node_type_is_extents(i->btree_id))
else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
ret = bch2_journal_replay_key(c, i->btree_id, i->k);
@ -859,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c)
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
if (!c->sb.clean &&
!(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
ret = -EINVAL;
goto err;
}
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
ret = journal_replay_early(c, clean, &journal_entries);
if (ret)
goto err;

View File

@ -5,8 +5,6 @@
struct journal_keys {
struct journal_key {
enum btree_id btree_id:8;
unsigned allocated:1;
struct bpos pos;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;