bcachefs: Don't downgrade locks on transaction restart

We should only be downgrading locks on success - otherwise, our
transaction restarts won't be getting the correct locks and we'll
livelock.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2023-10-27 15:23:46 -04:00
parent 2e7acdfbca
commit be9e782df3
9 changed files with 96 additions and 37 deletions

View File

@ -1523,6 +1523,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
path->alloc_seq++;
btree_path_list_add(trans, pos, path);
trans->paths_sorted = false;
@ -1598,7 +1599,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want)
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
return path;
}

View File

@ -509,7 +509,7 @@ fill:
* path->uptodate yet:
*/
if (!path->locks_want &&
!__bch2_btree_path_upgrade(trans, path, 1)) {
!__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;

View File

@ -431,7 +431,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
bool upgrade)
bool upgrade,
struct get_locks_fail *f)
{
unsigned l = path->level;
int fail_idx = -1;
@ -442,8 +443,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
: bch2_btree_node_relock(trans, path, l)))
fail_idx = l;
: bch2_btree_node_relock(trans, path, l))) {
fail_idx = l;
if (f) {
f->l = l;
f->b = path->l[l].b;
}
}
l++;
} while (l < path->locks_want);
@ -584,7 +591,9 @@ __flatten
/*
 * Try to retake the locks @path wants (relock, not upgrade) and report whether
 * that succeeded. This variant only returns the boolean result of
 * btree_path_get_locks(); it does not restart the transaction itself.
 *
 * @trace_ip is not used in the body shown here.
 */
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
return btree_path_get_locks(trans, path, false);
/* Scratch failure record: filled in on failure by the callee, never read here. */
struct get_locks_fail f;
return btree_path_get_locks(trans, path, false, &f);
}
int __bch2_btree_path_relock(struct btree_trans *trans,
@ -600,22 +609,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
/*
 * Raise @path's locks_want to @new_locks_want and try to upgrade its locks.
 * Only @path is operated on here; no other paths are touched by this function.
 *
 * @f may be NULL. It is forwarded unchanged to btree_path_get_locks(), which
 * records the failing level and node in it when it is non-NULL and a level
 * fails to upgrade.
 *
 * Returns the result of btree_path_get_locks().
 */
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
unsigned new_locks_want,
struct get_locks_fail *f)
{
/* Callers must only ask for strictly more locks than the path already wants. */
EBUG_ON(path->locks_want >= new_locks_want);
/* locks_want is updated before the lock attempt is made. */
path->locks_want = new_locks_want;
return btree_path_get_locks(trans, path, true);
return btree_path_get_locks(trans, path, true, f);
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
unsigned new_locks_want,
struct get_locks_fail *f)
{
struct btree_path *linked;
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
/*
@ -644,7 +655,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true);
btree_path_get_locks(trans, linked, true, NULL);
}
return false;
@ -656,6 +667,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
{
unsigned l;
if (trans->restarted)
return;
EBUG_ON(path->locks_want < new_locks_want);
path->locks_want = new_locks_want;
@ -674,6 +688,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
}
bch2_btree_path_verify_locks(path);
path->downgrade_seq++;
trace_path_downgrade(trans, _RET_IP_, path);
}
/* Btree transaction locking: */
@ -682,6 +699,9 @@ void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
if (trans->restarted)
return;
trans_for_each_path(trans, path)
bch2_btree_path_downgrade(trans, path);
}

View File

@ -355,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
/*
 * Describes where a lock upgrade/relock attempt failed.
 *
 * btree_path_get_locks() writes this only when a level fails to lock and the
 * caller passed a non-NULL pointer; it is left untouched on success. Each
 * failing level overwrites the previous values. Callers that read the struct
 * without knowing a lock attempt failed must initialize it themselves.
 */
struct get_locks_fail {
unsigned l; /* btree level at which the upgrade/relock failed */
struct btree *b; /* path->l[l].b as seen when that level failed */
};
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *, unsigned,
struct get_locks_fail *);
bool __bch2_btree_path_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *, unsigned,
struct get_locks_fail *);
static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
struct get_locks_fail f;
unsigned old_locks_want = path->locks_want;
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (path->locks_want < new_locks_want
? __bch2_btree_path_upgrade(trans, path, new_locks_want)
? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
: path->uptodate == BTREE_ITER_UPTODATE)
return 0;
trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
old_locks_want, new_locks_want);
old_locks_want, new_locks_want, &f);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
}

View File

@ -861,12 +861,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
*/
bch2_journal_res_put(&c->journal, &trans->journal_res);
if (unlikely(ret))
return ret;
bch2_trans_downgrade(trans);
return 0;
return ret;
}
static int journal_reclaim_wait_done(struct bch_fs *c)
@ -1135,6 +1130,8 @@ out:
if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
bch2_trans_downgrade(trans);
bch2_trans_reset_updates(trans);
return ret;

View File

@ -228,6 +228,8 @@ struct btree_path {
u8 sorted_idx;
u8 ref;
u8 intent_ref;
u32 alloc_seq;
u32 downgrade_seq;
/* btree_iter_copy starts here: */
struct bpos pos;

View File

@ -1987,7 +1987,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
out:
if (new_path)
bch2_path_put(trans, new_path, true);
bch2_btree_path_downgrade(trans, iter->path);
bch2_trans_downgrade(trans);
return ret;
err:
bch2_btree_node_free_never_used(as, trans, n);

View File

@ -162,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
/*
* See comment below:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
*/
rewrites_found |= 1U << i;
}
i++;
@ -212,14 +208,8 @@ restart_drop_extra_replicas:
if (!p.ptr.cached &&
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
/*
* Currently, we're dropping unneeded replicas
* instead of marking them as cached, since
* cached data in stripe buckets prevents them
* from being reused:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
*/
goto restart_drop_extra_replicas;
}
}

View File

@ -1043,13 +1043,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
struct get_locks_fail;
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path,
unsigned old_locks_want,
unsigned new_locks_want),
TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
unsigned new_locks_want,
struct get_locks_fail *f),
TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
@ -1057,6 +1060,11 @@ TRACE_EVENT(trans_restart_upgrade,
__field(u8, btree_id )
__field(u8, old_locks_want )
__field(u8, new_locks_want )
__field(u8, level )
__field(u32, path_seq )
__field(u32, node_seq )
__field(u32, path_alloc_seq )
__field(u32, downgrade_seq)
TRACE_BPOS_entries(pos)
),
@ -1066,10 +1074,15 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->btree_id = path->btree_id;
__entry->old_locks_want = old_locks_want;
__entry->new_locks_want = new_locks_want;
__entry->level = f->l;
__entry->path_seq = path->l[f->l].lock_seq;
__entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
__entry->path_alloc_seq = path->alloc_seq;
__entry->downgrade_seq = path->downgrade_seq;
TRACE_BPOS_assign(pos, path->pos)
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
bch2_btree_id_str(__entry->btree_id),
@ -1077,7 +1090,12 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->old_locks_want,
__entry->new_locks_want)
__entry->new_locks_want,
__entry->level,
__entry->path_seq,
__entry->node_seq,
__entry->path_alloc_seq,
__entry->downgrade_seq)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
@ -1238,6 +1256,27 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
/*
 * Tracepoint for a btree path lock downgrade.
 *
 * Records the transaction's function name (trans->fn, copied with strscpy()
 * into a 32-byte buffer) and the caller's instruction pointer. @path is part
 * of the prototype but none of its fields are recorded.
 */
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path),
TP_ARGS(trans, caller_ip, path),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
),
TP_printk("%s %pS",
__entry->trans_fn,
(void *) __entry->caller_ip)
);
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),