linux/fs/bcachefs/btree_journal_iter.c
Kent Overstreet 9dec2a473b bcachefs: Accumulate accounting keys in journal replay
Until accounting keys hit the btree, they are deltas, not new versions
of the existing key; this means we have to teach journal replay to
accumulate them.

Additionally, the journal doesn't track precisely which entries have
been flushed to the btree; it only tracks a range of entries that may
possibly still need to be flushed.

That means we need to compare accounting keys against the version in the
btree and only flush updates that are newer.

There's another wrinkle with the write buffer: if the write buffer
starts flushing accounting keys before journal replay has finished
flushing accounting keys, journal replay will see the version number
from the new updates and updates from the journal will be lost.

To avoid this, journal replay has to flush accounting keys first, and
we'll be adding a flag so that write buffer flush knows to hold
accounting keys until then.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-07-14 19:00:13 -04:00

634 lines
15 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"
#include <linux/sort.h>
/*
* For managing keys we read from the journal: until journal replay works normal
* btree lookups need to be able to find and return keys from the journal where
* they overwrite what's in the btree, so we have a special iterator and
* operations for the regular btree iter code to use:
*/
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
size_t gap_size = keys->size - keys->nr;
if (idx >= keys->gap)
idx += gap_size;
return idx;
}
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
return keys->data + idx_to_pos(keys, idx);
}
static size_t __bch2_journal_key_search(struct journal_keys *keys,
enum btree_id id, unsigned level,
struct bpos pos)
{
size_t l = 0, r = keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
l = m + 1;
else
r = m;
}
BUG_ON(l < keys->nr &&
__journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
BUG_ON(l &&
__journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
return l;
}
static size_t bch2_journal_key_search(struct journal_keys *keys,
enum btree_id id, unsigned level,
struct bpos pos)
{
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
{
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
struct journal_key *k;
BUG_ON(*idx > keys->nr);
search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx &&
__journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
--(*idx);
iters++;
if (iters == 10) {
*idx = 0;
goto search;
}
}
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
if (k->overwritten) {
(*idx)++;
continue;
}
if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
(*idx)++;
iters++;
if (iters == 10) {
*idx = 0;
goto search;
}
}
return NULL;
}
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos)
{
size_t idx = 0;
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
static void journal_iter_verify(struct journal_iter *iter)
{
struct journal_keys *keys = iter->keys;
size_t gap_size = keys->size - keys->nr;
BUG_ON(iter->idx >= keys->gap &&
iter->idx < keys->gap + gap_size);
if (iter->idx < keys->size) {
struct journal_key *k = keys->data + iter->idx;
int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
cmp_int(k->level, iter->level);
BUG_ON(cmp < 0);
}
}
static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
struct journal_key *new_key = &keys->data[keys->gap - 1];
struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
* the iterator so it points at the key we just inserted - if the
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
list_for_each_entry(iter, &c->journal_iters, list) {
journal_iter_verify(iter);
if (iter->idx == gap_end &&
new_key->btree_id == iter->btree_id &&
new_key->level == iter->level)
iter->idx = keys->gap - 1;
journal_iter_verify(iter);
}
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
struct journal_keys *keys = &c->journal_keys;
struct journal_iter *iter;
size_t gap_size = keys->size - keys->nr;
list_for_each_entry(iter, &c->journal_iters, list) {
if (iter->idx > old_gap)
iter->idx -= gap_size;
if (iter->idx >= new_gap)
iter->idx += gap_size;
}
}
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
unsigned level, struct bkey_i *k)
{
struct journal_key n = {
.btree_id = id,
.level = level,
.k = k,
.allocated = true,
/*
* Ensure these keys are done last by journal replay, to unblock
* journal reclaim:
*/
.journal_seq = U32_MAX,
};
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
journal_key_cmp(&n, &keys->data[idx]) == 0) {
if (keys->data[idx].allocated)
kfree(keys->data[idx].k);
keys->data[idx] = n;
return 0;
}
if (idx > keys->gap)
idx -= keys->size - keys->nr;
size_t old_gap = keys->gap;
if (keys->nr == keys->size) {
journal_iters_move_gap(c, old_gap, keys->size);
old_gap = keys->size;
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
};
new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
return -BCH_ERR_ENOMEM_journal_key_insert;
}
/* Since @keys was full, there was no gap: */
memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
kvfree(keys->data);
keys->data = new_keys.data;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
/* And now the gap is at the end: */
keys->gap = keys->nr;
}
journal_iters_move_gap(c, old_gap, idx);
move_gap(keys, idx);
keys->nr++;
keys->data[keys->gap++] = n;
journal_iters_fix(c);
return 0;
}
/*
* Can only be used from the recovery thread while we're still RO - can't be
* used once we've got RW, as journal_keys is at that point used by multiple
* threads:
*/
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
unsigned level, struct bkey_i *k)
{
struct bkey_i *n;
int ret;
n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
if (!n)
return -BCH_ERR_ENOMEM_journal_key_insert;
bkey_copy(n, k);
ret = bch2_journal_key_insert_take(c, id, level, n);
if (ret)
kfree(n);
return ret;
}
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
unsigned level, struct bpos pos)
{
struct bkey_i whiteout;
bkey_init(&whiteout.k);
whiteout.k.p = pos;
return bch2_journal_key_insert(c, id, level, &whiteout);
}
bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
unsigned level, struct bpos pos)
{
struct journal_keys *keys = &trans->c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (!trans->journal_replay_not_finished)
return false;
return (idx < keys->size &&
keys->data[idx].btree_id == btree &&
keys->data[idx].level == level &&
bpos_eq(keys->data[idx].k->k.p, pos) &&
bkey_deleted(&keys->data[idx].k->k));
}
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->size &&
keys->data[idx].btree_id == btree &&
keys->data[idx].level == level &&
bpos_eq(keys->data[idx].k->k.p, pos))
keys->data[idx].overwritten = true;
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
if (iter->idx < iter->keys->size) {
iter->idx++;
if (iter->idx == iter->keys->gap)
iter->idx += iter->keys->size - iter->keys->nr;
}
}
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
journal_iter_verify(iter);
while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;
int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
cmp_int(k->level, iter->level);
if (cmp > 0)
break;
BUG_ON(cmp);
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
}
return bkey_s_c_null;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
{
list_del(&iter->list);
}
static void bch2_journal_iter_init(struct bch_fs *c,
struct journal_iter *iter,
enum btree_id id, unsigned level,
struct bpos pos)
{
iter->btree_id = id;
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
iter->b, &iter->unpacked);
}
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
if (bpos_eq(iter->pos, SPOS_MAX))
iter->at_end = true;
else
iter->pos = bpos_successor(iter->pos);
}
static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
struct btree_and_journal_iter iter = *_iter;
struct bch_fs *c = iter.trans->c;
unsigned level = iter.journal.level;
struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (level > 1 ? 0 : 2)
: (level > 1 ? 1 : 16);
iter.prefetch = false;
bch2_bkey_buf_init(&tmp);
while (nr--) {
bch2_btree_and_journal_iter_advance(&iter);
struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
if (!k.k)
break;
bch2_bkey_buf_reassemble(&tmp, c, k);
bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
}
bch2_bkey_buf_exit(&tmp, c);
}
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
if (iter->prefetch && iter->journal.level)
btree_and_journal_iter_prefetch(iter);
again:
if (iter->at_end)
return bkey_s_c_null;
while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
bpos_lt(btree_k.k->p, iter->pos))
bch2_journal_iter_advance_btree(iter);
if (iter->trans->journal_replay_not_finished)
while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
bpos_lt(journal_k.k->p, iter->pos))
bch2_journal_iter_advance(&iter->journal);
ret = journal_k.k &&
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
? journal_k
: btree_k;
if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
ret = bkey_s_c_null;
if (ret.k) {
iter->pos = ret.k->p;
if (bkey_deleted(ret.k)) {
bch2_btree_and_journal_iter_advance(iter);
goto again;
}
} else {
iter->pos = SPOS_MAX;
iter->at_end = true;
}
return ret;
}
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
bch2_journal_iter_exit(&iter->journal);
}
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
struct btree_and_journal_iter *iter,
struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
iter->pos = b->data->min_key;
iter->at_end = false;
INIT_LIST_HEAD(&iter->journal.list);
if (trans->journal_replay_not_finished) {
bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
list_add(&iter->journal.list, &trans->c->journal_iters);
}
}
/*
* this version is used by btree_gc before filesystem has gone RW and
* multithreaded, so uses the journal_iters list:
*/
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
struct btree_and_journal_iter *iter,
struct btree *b)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
}
/* sort and dedup all keys in the journal: */
void bch2_journal_entries_free(struct bch_fs *c)
{
struct journal_replay **i;
struct genradix_iter iter;
genradix_for_each(&c->journal_entries, iter, i)
kvfree(*i);
genradix_free(&c->journal_entries);
}
/*
* When keys compare equal, oldest compares first:
*/
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return journal_key_cmp(l, r) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
move_gap(keys, keys->nr);
darray_for_each(*keys, i)
if (i->allocated)
kfree(i->k);
kvfree(keys->data);
keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
}
static void __journal_keys_sort(struct journal_keys *keys)
{
sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
struct journal_key *dst = keys->data;
darray_for_each(*keys, src) {
/*
* We don't accumulate accounting keys here because we have to
* compare each individual accounting key against the version in
* the btree during replay:
*/
if (src->k->k.type != KEY_TYPE_accounting &&
src + 1 < &darray_top(*keys) &&
!journal_key_cmp(src, src + 1))
continue;
*dst++ = *src;
}
keys->nr = dst - keys->data;
}
int bch2_journal_keys_sort(struct bch_fs *c)
{
struct genradix_iter iter;
struct journal_replay *i, **_i;
struct journal_keys *keys = &c->journal_keys;
size_t nr_read = 0;
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (journal_replay_ignore(i))
continue;
cond_resched();
for_each_jset_key(k, entry, &i->j) {
struct journal_key n = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
.journal_seq = le64_to_cpu(i->j.seq),
.journal_offset = k->_data - i->j._data,
};
if (darray_push(keys, n)) {
__journal_keys_sort(keys);
if (keys->nr * 8 > keys->size * 7) {
bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
return -BCH_ERR_ENOMEM_journal_keys_sort;
}
BUG_ON(darray_push(keys, n));
}
nr_read++;
}
}
__journal_keys_sort(keys);
keys->gap = keys->nr;
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}
void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
unsigned level_min, unsigned level_max,
struct bpos start, struct bpos end)
{
struct journal_keys *keys = &c->journal_keys;
size_t dst = 0;
move_gap(keys, keys->nr);
darray_for_each(*keys, i)
if (!(i->btree_id == btree &&
i->level >= level_min &&
i->level <= level_max &&
bpos_ge(i->k->k.p, start) &&
bpos_le(i->k->k.p, end)))
keys->data[dst++] = *i;
keys->nr = keys->gap = dst;
}
void bch2_journal_keys_dump(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
struct printbuf buf = PRINTBUF;
pr_info("%zu keys:", keys->nr);
move_gap(keys, keys->nr);
darray_for_each(*keys, i) {
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
}
printbuf_exit(&buf);
}