diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index a5203fbc089e..17eb0dd657a8 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -624,10 +624,11 @@ struct bch_fs {
 	struct bch_fs_pcpu __percpu	*pcpu;
 
-	struct bch_fs_usage __percpu	*usage[2];
-
 	struct percpu_rw_semaphore	mark_lock;
 
+	struct bch_fs_usage __percpu	*usage[2];
+	struct bch_fs_usage __percpu	*usage_scratch;
+
 	/*
 	 * When we invalidate buckets, we use both the priority and the amount
 	 * of good data to determine which buckets to reuse first - to weight
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 466469a0d852..a725a106f6dc 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -478,33 +478,12 @@ static void bch2_gc_free(struct bch_fs *c)
 		ca->usage[1] = NULL;
 	}
 
+	percpu_down_write(&c->mark_lock);
+
 	free_percpu(c->usage[1]);
 	c->usage[1] = NULL;
-}
 
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when access
- * against any percpu counter is guarded against
- */
-static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
-	u64 *ret;
-	int cpu;
-
-	preempt_disable();
-	ret = this_cpu_ptr(p);
-	preempt_enable();
-
-	for_each_possible_cpu(cpu) {
-		u64 *i = per_cpu_ptr(p, cpu);
-
-		if (i != ret) {
-			acc_u64s(ret, i, nr);
-			memset(i, 0, nr * sizeof(u64));
-		}
-	}
-
-	return ret;
+	percpu_up_write(&c->mark_lock);
 }
 
 static void bch2_gc_done_nocheck(struct bch_fs *c)
@@ -542,24 +521,25 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
 	for_each_member_device(ca, c, i) {
 		unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
 		struct bch_dev_usage *dst = (void *)
-			acc_percpu_u64s((void *) ca->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
 		struct bch_dev_usage *src = (void *)
-			acc_percpu_u64s((void *) ca->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 
 		*dst = *src;
 	}
 
 	{
-		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64);
+		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
+			c->replicas.nr;
 		struct bch_fs_usage *dst = (void *)
-			acc_percpu_u64s((void *) c->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[0], nr);
 		struct bch_fs_usage *src = (void *)
-			acc_percpu_u64s((void *) c->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 		unsigned offset = offsetof(typeof(*dst), s.gc_start);
 
 		memcpy((void *) dst + offset,
 		       (void *) src + offset,
-		       sizeof(*dst) - offset);
+		       nr * sizeof(u64) - offset);
 	}
 }
 
@@ -655,9 +635,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 	for_each_member_device(ca, c, i) {
 		unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
 		struct bch_dev_usage *dst = (void *)
-			acc_percpu_u64s((void *) ca->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
 		struct bch_dev_usage *src = (void *)
-			acc_percpu_u64s((void *) ca->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 		unsigned b;
 
 		for (b = 0; b < BCH_DATA_NR; b++)
@@ -674,12 +654,12 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 	}
 
 	{
-		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64);
+		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
+			c->replicas.nr;
 		struct bch_fs_usage *dst = (void *)
-			acc_percpu_u64s((void *) c->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[0], nr);
 		struct bch_fs_usage *src = (void *)
-			acc_percpu_u64s((void *) c->usage[1], nr);
-		unsigned r, b;
+			bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
 		copy_fs_field(s.hidden,		"hidden");
 		copy_fs_field(s.data,		"data");
@@ -687,20 +667,16 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 		copy_fs_field(s.reserved,	"reserved");
 		copy_fs_field(s.nr_inodes,	"nr_inodes");
 
-		for (r = 0; r < BCH_REPLICAS_MAX; r++) {
-			for (b = 0; b < BCH_DATA_NR; b++)
-				copy_fs_field(replicas[r].data[b],
-					      "replicas[%i].data[%s]",
-					      r, bch2_data_types[b]);
-			copy_fs_field(replicas[r].ec_data,
-				      "replicas[%i].ec_data", r);
-			copy_fs_field(replicas[r].persistent_reserved,
-				      "replicas[%i].persistent_reserved", r);
-		}
+		for (i = 0; i < BCH_REPLICAS_MAX; i++)
+			copy_fs_field(persistent_reserved[i],
+				      "persistent_reserved[%i]", i);
 
-		for (b = 0; b < BCH_DATA_NR; b++)
-			copy_fs_field(buckets[b],
-				      "buckets[%s]", bch2_data_types[b]);
+		for (i = 0; i < c->replicas.nr; i++) {
+			/*
+			 * XXX: print out replicas entry
+			 */
+			copy_fs_field(data[i], "data[%i]", i);
+		}
 	}
 out:
 	percpu_up_write(&c->mark_lock);
@@ -723,9 +699,15 @@ static int bch2_gc_start(struct bch_fs *c)
 	 */
 	gc_pos_set(c, gc_phase(GC_PHASE_START));
 
+	percpu_down_write(&c->mark_lock);
 	BUG_ON(c->usage[1]);
 
-	c->usage[1] = alloc_percpu(struct bch_fs_usage);
+	c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) +
+					 sizeof(u64) * c->replicas.nr,
+					 sizeof(u64),
+					 GFP_KERNEL);
+	percpu_up_write(&c->mark_lock);
+
 	if (!c->usage[1])
 		return -ENOMEM;
 
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 2efe191cdc30..d55778696bcd 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1070,25 +1070,28 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 {
 	struct bch_fs *c = as->c;
 	struct btree *old = btree_node_root(c, b);
-	struct bch_fs_usage stats = { 0 };
+	struct bch_fs_usage *fs_usage;
 
 	__bch2_btree_set_root_inmem(c, b);
 
 	mutex_lock(&c->btree_interior_update_lock);
 	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	fs_usage = bch2_fs_usage_get_scratch(c);
 
 	bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
 		      true, 0,
 		      gc_pos_btree_root(b->btree_id),
-		      &stats, 0, 0);
+		      fs_usage, 0, 0);
 
 	if (old && !btree_node_fake(old))
 		bch2_btree_node_free_index(as, NULL,
 					   bkey_i_to_s_c(&old->key),
-					   &stats);
-	bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+					   fs_usage);
+	bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
 			    gc_pos_btree_root(b->btree_id));
 
+	preempt_enable();
 	percpu_up_read(&c->mark_lock);
 	mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -1161,7 +1164,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 					struct btree_node_iter *node_iter)
 {
 	struct bch_fs *c = as->c;
-	struct bch_fs_usage stats = { 0 };
+	struct bch_fs_usage *fs_usage;
 	struct bkey_packed *k;
 	struct bkey tmp;
 
@@ -1169,10 +1172,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 
 	mutex_lock(&c->btree_interior_update_lock);
 	percpu_down_read(&c->mark_lock);
+	fs_usage = bch2_fs_usage_get_scratch(c);
 
 	bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
 			     true, 0,
-			     gc_pos_btree_node(b), &stats, 0, 0);
+			     gc_pos_btree_node(b), fs_usage, 0, 0);
 
 	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
 	       bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
@@ -1185,9 +1189,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 	if (k && !bkey_cmp_packed(b, k, &insert->k))
 		bch2_btree_node_free_index(as, b,
 					   bkey_disassemble(b, k, &tmp),
-					   &stats);
+					   fs_usage);
 
-	bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+	bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
 			    gc_pos_btree_node(b));
 
 	percpu_up_read(&c->mark_lock);
@@ -1971,7 +1975,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 			bkey_copy(&b->key, &new_key->k_i);
 		}
 	} else {
-		struct bch_fs_usage stats = { 0 };
+		struct bch_fs_usage *fs_usage;
 
 		BUG_ON(btree_node_root(c, b) != b);
 
@@ -1979,15 +1983,16 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 
 		mutex_lock(&c->btree_interior_update_lock);
 		percpu_down_read(&c->mark_lock);
+		fs_usage = bch2_fs_usage_get_scratch(c);
 
 		bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
 			      true, 0,
 			      gc_pos_btree_root(b->btree_id),
-			      &stats, 0, 0);
+			      fs_usage, 0, 0);
 		bch2_btree_node_free_index(as, NULL,
 					   bkey_i_to_s_c(&b->key),
-					   &stats);
-		bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+					   fs_usage);
+		bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
 				    gc_pos_btree_root(b->btree_id));
 
 		percpu_up_read(&c->mark_lock);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 34e5f81b2b5e..cbebc712a1da 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -72,12 +72,11 @@
 #include "ec.h"
 #include "error.h"
 #include "movinggc.h"
+#include "replicas.h"
 #include "trace.h"
 
 #include <linux/preempt.h>
 
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-
 /*
  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
  * wraparound:
@@ -132,9 +131,29 @@ struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
 	return bch2_usage_read_raw(ca->usage[0]);
 }
 
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
 {
-	return bch2_usage_read_raw(c->usage[0]);
+	struct bch_fs_usage *ret;
+	unsigned nr = READ_ONCE(c->replicas.nr);
+retry:
+	ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
+	if (unlikely(!ret))
+		return NULL;
+
+	percpu_down_read(&c->mark_lock);
+
+	if (unlikely(nr < c->replicas.nr)) {
+		nr = c->replicas.nr;
+		percpu_up_read(&c->mark_lock);
+		kfree(ret);
+		goto retry;
+	}
+
+	acc_u64s_percpu((u64 *) ret,
+			(u64 __percpu *) c->usage[0],
+			sizeof(*ret) / sizeof(u64) + nr);
+
+	return ret;
 }
 
 #define RESERVE_FACTOR	6
@@ -149,17 +168,13 @@ static u64 avail_factor(u64 r)
 	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
 }
 
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
-{
-	return fs_usage.s.hidden +
-		fs_usage.s.data +
-		reserve_factor(fs_usage.s.reserved +
-			       fs_usage.s.online_reserved);
-}
-
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
 {
-	return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage));
+	return min(fs_usage.s.hidden +
+		   fs_usage.s.data +
+		   reserve_factor(fs_usage.s.reserved +
+				  fs_usage.s.online_reserved),
+		   c->capacity);
 }
 
 struct bch_fs_usage_short
@@ -208,13 +223,14 @@ static bool bucket_became_unavailable(struct bucket_mark old,
 	       !is_available_bucket(new);
 }
 
-void bch2_fs_usage_apply(struct bch_fs *c,
-			 struct bch_fs_usage *fs_usage,
-			 struct disk_reservation *disk_res,
-			 struct gc_pos gc_pos)
+int bch2_fs_usage_apply(struct bch_fs *c,
+			struct bch_fs_usage *fs_usage,
+			struct disk_reservation *disk_res,
+			struct gc_pos gc_pos)
 {
 	s64 added = fs_usage->s.data + fs_usage->s.reserved;
 	s64 should_not_have_added;
+	int ret = 0;
 
 	percpu_rwsem_assert_held(&c->mark_lock);
 
@@ -227,6 +243,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 		      "disk usage increased without a reservation")) {
 		atomic64_sub(should_not_have_added, &c->sectors_available);
 		added -= should_not_have_added;
+		ret = -1;
 	}
 
 	if (added > 0) {
@@ -237,17 +254,17 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 	preempt_disable();
 	acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
 		 (u64 *) fs_usage,
-		 sizeof(*fs_usage) / sizeof(u64));
+		 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
 
 	if (gc_visited(c, gc_pos)) {
 		BUG_ON(!c->usage[1]);
 		acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
 			 (u64 *) fs_usage,
-			 sizeof(*fs_usage) / sizeof(u64));
+			 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
 	}
 
 	preempt_enable();
 
-	memset(fs_usage, 0, sizeof(*fs_usage));
+	return ret;
 }
 
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -258,7 +275,6 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
 	if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
 		fs_usage->s.hidden	+= size;
 
-	fs_usage->buckets[type]		+= size;
 	dev_usage->buckets[type]	+= nr;
 }
 
@@ -332,6 +348,34 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
 	_old;							\
 })
 
+static inline void update_replicas(struct bch_fs *c,
+				   struct bch_fs_usage *fs_usage,
+				   struct bch_replicas_entry *r,
+				   s64 sectors)
+{
+	int idx = bch2_replicas_entry_idx(c, r);
+
+	BUG_ON(idx < 0);
+	BUG_ON(!sectors);
+
+	if (r->data_type == BCH_DATA_CACHED)
+		fs_usage->s.cached	+= sectors;
+	else
+		fs_usage->s.data	+= sectors;
+	fs_usage->data[idx]		+= sectors;
+}
+
+static inline void update_cached_sectors(struct bch_fs *c,
+					 struct bch_fs_usage *fs_usage,
+					 unsigned dev, s64 sectors)
+{
+	struct bch_replicas_padded r;
+
+	bch2_replicas_entry_cached(&r.e, dev);
+
+	update_replicas(c, fs_usage, &r.e, sectors);
+}
+
 static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 				     size_t b, struct bucket_mark *old,
 				     bool gc)
@@ -350,8 +394,9 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.gen++;
 	}));
 
-	fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
-	fs_usage->s.cached		-= old->cached_sectors;
+	if (old->cached_sectors)
+		update_cached_sectors(c, fs_usage, ca->dev_idx,
+				      -old->cached_sectors);
 }
 
 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -418,11 +463,6 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.data_type	= type;
 		checked_add(new.dirty_sectors, sectors);
 	}));
-
-	if (type == BCH_DATA_BTREE ||
-	    type == BCH_DATA_USER)
-		fs_usage->s.data += sectors;
-	fs_usage->replicas[0].data[type] += sectors;
 }
 
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -550,9 +590,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
 
 static int bch2_mark_stripe_ptr(struct bch_fs *c,
 				struct bch_extent_stripe_ptr p,
+				enum bch_data_type data_type,
+				struct bch_fs_usage *fs_usage,
 				s64 sectors, unsigned flags,
-				s64 *adjusted_disk_sectors,
-				unsigned *redundancy,
 				bool gc)
 {
 	struct stripe *m;
@@ -568,16 +608,15 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 		return -1;
 	}
 
+	BUG_ON(m->r.e.data_type != data_type);
+
 	nr_data = m->nr_blocks - m->nr_redundant;
 
 	parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
 
 	if (sectors < 0)
 		parity_sectors = -parity_sectors;
-
-	*adjusted_disk_sectors += parity_sectors;
-
-	*redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
+	sectors += parity_sectors;
 
 	new = atomic_add_return(sectors, &m->block_sectors[p.block]);
 	old = new - sectors;
@@ -593,6 +632,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 	if (!gc)
 		bch2_stripes_heap_update(c, m, p.idx);
 
+	update_replicas(c, fs_usage, &m->r.e, sectors);
+
 	return 0;
 }
 
@@ -605,58 +646,46 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
-	s64 cached_sectors	= 0;
-	s64 dirty_sectors	= 0;
-	s64 ec_sectors		= 0;
-	unsigned replicas	= 0;
-	unsigned ec_redundancy	= 0;
+	struct bch_replicas_padded r;
+	s64 dirty_sectors	= 0;
 	unsigned i;
 	int ret;
 
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
 	BUG_ON(!sectors);
 
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		s64 disk_sectors = data_type == BCH_DATA_BTREE
 			? sectors
 			: ptr_disk_sectors_delta(p, sectors);
-		s64 adjusted_disk_sectors = disk_sectors;
 
 		bch2_mark_pointer(c, p, disk_sectors, data_type,
 				  fs_usage, journal_seq, flags, gc);
 
-		if (!p.ptr.cached)
+		if (p.ptr.cached) {
+			update_cached_sectors(c, fs_usage, p.ptr.dev,
+					      disk_sectors);
+		} else if (!p.ec_nr) {
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		} else {
 			for (i = 0; i < p.ec_nr; i++) {
 				ret = bch2_mark_stripe_ptr(c, p.ec[i],
-						disk_sectors, flags,
-						&adjusted_disk_sectors,
-						&ec_redundancy, gc);
+						data_type, fs_usage,
+						disk_sectors, flags, gc);
 				if (ret)
 					return ret;
 			}
 
-		if (!p.ptr.cached)
-			replicas++;
-
-		if (p.ptr.cached)
-			cached_sectors += adjusted_disk_sectors;
-		else if (!p.ec_nr)
-			dirty_sectors += adjusted_disk_sectors;
-		else
-			ec_sectors += adjusted_disk_sectors;
+			r.e.nr_required = 0;
+		}
 	}
 
-	replicas	= clamp_t(unsigned, replicas,
-				  1, ARRAY_SIZE(fs_usage->replicas));
-	ec_redundancy	= clamp_t(unsigned, ec_redundancy,
-				  1, ARRAY_SIZE(fs_usage->replicas));
-
-	fs_usage->s.cached	+= cached_sectors;
-	fs_usage->replicas[0].data[BCH_DATA_CACHED] += cached_sectors;
-
-	fs_usage->s.data	+= dirty_sectors;
-	fs_usage->replicas[replicas - 1].data[data_type] += dirty_sectors;
-
-	fs_usage->s.data	+= ec_sectors;
-	fs_usage->replicas[ec_redundancy - 1].ec_data += ec_sectors;
+	if (dirty_sectors)
+		update_replicas(c, fs_usage, &r.e, dirty_sectors);
 
 	return 0;
 }
@@ -724,8 +753,24 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 		m->algorithm	= s.v->algorithm;
 		m->nr_blocks	= s.v->nr_blocks;
 		m->nr_redundant	= s.v->nr_redundant;
+
+		memset(&m->r, 0, sizeof(m->r));
+
+		m->r.e.data_type	= BCH_DATA_USER;
+		m->r.e.nr_devs		= s.v->nr_blocks;
+		m->r.e.nr_required	= s.v->nr_blocks - s.v->nr_redundant;
+
+		for (i = 0; i < s.v->nr_blocks; i++)
+			m->r.e.devs[i]	= s.v->ptrs[i].dev;
 	}
 
+	/*
+	 * XXX: account for stripes somehow here
+	 */
+#if 0
+	update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
+#endif
+
 	if (!gc) {
 		if (inserting)
 			bch2_stripes_heap_insert(c, m, idx);
@@ -773,11 +818,11 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
 		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
 
 		sectors *= replicas;
-		replicas = clamp_t(unsigned, replicas,
-				   1, ARRAY_SIZE(fs_usage->replicas));
+		replicas = clamp_t(unsigned, replicas, 1,
+				   ARRAY_SIZE(fs_usage->persistent_reserved));
 
-		fs_usage->s.reserved					+= sectors;
-		fs_usage->replicas[replicas - 1].persistent_reserved	+= sectors;
+		fs_usage->s.reserved				+= sectors;
+		fs_usage->persistent_reserved[replicas - 1]	+= sectors;
 		break;
 	}
 	default:
@@ -839,20 +884,24 @@ void bch2_mark_update(struct btree_insert *trans,
 	struct btree_iter	*iter = insert->iter;
 	struct btree		*b = iter->l[0].b;
 	struct btree_node_iter	node_iter = iter->l[0].iter;
-	struct bch_fs_usage	fs_usage = { 0 };
+	struct bch_fs_usage	*fs_usage;
 	struct gc_pos		pos = gc_pos_btree_node(b);
 	struct bkey_packed	*_k;
+	u64			disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+	static int warned_disk_usage = 0;
 
 	if (!btree_node_type_needs_gc(iter->btree_id))
 		return;
 
 	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	fs_usage = bch2_fs_usage_get_scratch(c);
 
 	if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
 		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
 			bpos_min(insert->k->k.p, b->key.k.p).offset -
 			bkey_start_offset(&insert->k->k),
-			pos, &fs_usage, trans->journal_res.seq, 0);
+			pos, fs_usage, trans->journal_res.seq, 0);
 
 	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
 						      KEY_TYPE_discard))) {
@@ -885,7 +934,7 @@ void bch2_mark_update(struct btree_insert *trans,
 			BUG_ON(sectors <= 0);
 
 			bch2_mark_key_locked(c, k, true, sectors,
-				pos, &fs_usage, trans->journal_res.seq, 0);
+				pos, fs_usage, trans->journal_res.seq, 0);
 
 			sectors = bkey_start_offset(&insert->k->k) -
 				k.k->p.offset;
@@ -896,13 +945,44 @@ void bch2_mark_update(struct btree_insert *trans,
 		}
 
 		bch2_mark_key_locked(c, k, false, sectors,
-			pos, &fs_usage, trans->journal_res.seq, 0);
+			pos, fs_usage, trans->journal_res.seq, 0);
 
 		bch2_btree_node_iter_advance(&node_iter, b);
 	}
 
-	bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos);
+	if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+	    !warned_disk_usage &&
+	    !xchg(&warned_disk_usage, 1)) {
+		char buf[200];
+
+		pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+
+		pr_err("while inserting");
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+		pr_err("%s", buf);
+		pr_err("overlapping with");
+
+		node_iter = iter->l[0].iter;
+		while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+							      KEY_TYPE_discard))) {
+			struct bkey		unpacked;
+			struct bkey_s_c		k;
+
+			k = bkey_disassemble(b, _k, &unpacked);
+
+			if (btree_node_is_extents(b)
+			    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+			    : bkey_cmp(insert->k->k.p, k.k->p))
+				break;
+
+			bch2_bkey_val_to_text(&PBUF(buf), c, k);
+			pr_err("%s", buf);
+
+			bch2_btree_node_iter_advance(&node_iter, b);
+		}
+	}
 
+	preempt_enable();
 	percpu_up_read(&c->mark_lock);
 }
 
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 88e083325232..107cb48e3929 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -218,7 +218,18 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
 
 /* Filesystem usage: */
 
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
+static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+{
+	struct bch_fs_usage *ret;
+
+	ret = this_cpu_ptr(c->usage_scratch);
+
+	memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64));
+
+	return ret;
+}
+
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
 
 u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 
@@ -254,8 +265,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
 		  bool, s64, struct gc_pos,
 		  struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
-			 struct disk_reservation *, struct gc_pos);
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+			struct disk_reservation *, struct gc_pos);
 
 /* disk reservations: */
 
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 196f07f41728..65b4bb39f88e 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -75,16 +75,18 @@ struct bch_fs_usage {
 		u64		cached;
 		u64		reserved;
 		u64		nr_inodes;
+
+		/* XXX: add stats for compression ratio */
+#if 0
+		u64		uncompressed;
+		u64		compressed;
+#endif
 	} s;
 
 	/* broken out: */
-	struct {
-		u64		data[BCH_DATA_NR];
-		u64		ec_data;
-		u64		persistent_reserved;
-	}			replicas[BCH_REPLICAS_MAX];
 
-	u64			buckets[BCH_DATA_NR];
+	u64			persistent_reserved[BCH_REPLICAS_MAX];
+	u64			data[];
 };
 
 struct bch_fs_usage_short {
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 7f79f020d904..f090b61f23f1 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -394,21 +394,31 @@ static long bch2_ioctl_usage(struct bch_fs *c,
 	}
 
 	{
-		struct bch_fs_usage src = bch2_fs_usage_read(c);
+		struct bch_fs_usage *src;
 		struct bch_ioctl_fs_usage dst = {
 			.capacity		= c->capacity,
-			.used			= bch2_fs_sectors_used(c, src),
-			.online_reserved	= src.s.online_reserved,
 		};
 
+		src = bch2_fs_usage_read(c);
+		if (!src)
+			return -ENOMEM;
+
+		percpu_up_read(&c->mark_lock);
+
+		dst.used		= bch2_fs_sectors_used(c, *src);
+		dst.online_reserved	= src->s.online_reserved;
+
 		for (i = 0; i < BCH_REPLICAS_MAX; i++) {
 			dst.persistent_reserved[i] =
-				src.replicas[i].persistent_reserved;
-
+				src->persistent_reserved[i];
+#if 0
 			for (j = 0; j < BCH_DATA_NR; j++)
 				dst.sectors[j][i] = src.replicas[i].data[j];
+#endif
 		}
 
+		kfree(src);
+
 		ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
 		if (ret)
 			return ret;
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index a3216ca01913..e416dac7ee19 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -6,6 +6,11 @@
 
 #define EC_STRIPE_MAX	16
 
+struct bch_replicas_padded {
+	struct bch_replicas_entry	e;
+	u8				pad[EC_STRIPE_MAX];
+};
+
 struct stripe {
 	size_t			heap_idx;
 
@@ -18,6 +23,8 @@ struct stripe {
 	u8			alive;
 	atomic_t		blocks_nonempty;
 	atomic_t		block_sectors[EC_STRIPE_MAX];
+
+	struct bch_replicas_padded r;
 };
 
 struct ec_stripe_heap_entry {
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 67f6250ef91a..1d96a1773f74 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1669,12 +1669,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 	return ret == BCH_MERGE_MERGE;
 }
 
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+				unsigned nr_replicas)
 {
 	struct btree_iter iter;
 	struct bpos end = pos;
 	struct bkey_s_c k;
-	int ret = 0;
+	bool ret = true;
 
 	end.offset += size;
 
@@ -1683,8 +1684,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
 			break;
 
-		if (!bch2_extent_is_fully_allocated(k)) {
-			ret = -ENOSPC;
+		if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
+			ret = false;
 			break;
 		}
 	}
@@ -1693,6 +1694,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 	return ret;
 }
 
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+	unsigned ret = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+
+		extent_for_each_ptr_decode(e, p, entry)
+			ret += !p.ptr.cached &&
+				p.crc.compression_type == BCH_COMPRESSION_NONE;
+		break;
+	}
+	case KEY_TYPE_reservation:
+		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+		break;
+	}
+
+	return ret;
+}
+
 /* KEY_TYPE_reservation: */
 
 const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 57eb35699545..17cae891bccb 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -572,6 +572,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
 	BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 
 #endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 7cb4942cacf7..26d5cad7e6a5 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -263,18 +263,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 	}
 }
 
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
-				     eytzinger_cmp_fn cmp, const void *search)
-{
-	size_t i = 0;
-	int res;
-
-	while (i < nr &&
-	       (res = cmp(search, base + i * size, size)))
-		i = eytzinger0_child(i, res > 0);
-
-	return i;
-}
+#define eytzinger0_find(base, nr, size, _cmp, search)			\
+({									\
+	void *_base	= (base);					\
+	void *_search	= (search);					\
+	size_t _nr	= (nr);						\
+	size_t _size	= (size);					\
+	size_t _i	= 0;						\
+	int _res;							\
+									\
+	while (_i < _nr &&						\
+	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
+		_i = eytzinger0_child(_i, _res > 0);			\
+	_i;								\
+})
 
 void eytzinger0_sort(void *, size_t, size_t,
 		     int (*cmp_func)(const void *, const void *, size_t),
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index a59fedcaed07..7681cfbc6bed 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
 		BUG_ON(btree_iter_err(old));
 
 		if (allocating &&
-		    !bch2_extent_is_fully_allocated(old))
+		    !*allocating &&
+		    bch2_bkey_nr_ptrs_allocated(old) <
+		    bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
 			*allocating = true;
 
 		delta += (min(new->k.p.offset,
@@ -812,9 +814,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
 	struct bvec_iter iter;
 	struct bio_vec bv;
-	unsigned nr_ptrs = !bch2_extent_is_compressed(k)
-		? bch2_bkey_nr_dirty_ptrs(k)
-		: 0;
+	unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
 
 	bio_for_each_segment(bv, bio, iter) {
 		/* brand new pages, don't need to be locked: */
@@ -1930,19 +1930,20 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
 	if (unlikely(ret))
 		goto err;
 
+	dio->iop.op.nr_replicas	= dio->iop.op.opts.data_replicas;
+
 	ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
 					dio->iop.op.opts.data_replicas, 0);
 	if (unlikely(ret)) {
-		if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
-						      req->ki_pos >> 9),
-					       iter->count >> 9))
+		if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
+						       req->ki_pos >> 9),
+						iter->count >> 9,
+						dio->iop.op.opts.data_replicas))
 			goto err;
 
 		dio->iop.unalloc = true;
 	}
 
-	dio->iop.op.nr_replicas	= dio->iop.op.res.nr_replicas;
-
 	return bch2_dio_write_loop(dio);
 err:
 	bch2_disk_reservation_put(c, &dio->iop.op.res);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 67ff2633ba16..9c794c9a1924 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	}
 
 	list_for_each_entry(i, list, list) {
+		struct bch_replicas_padded replicas;
+		char buf[80];
+
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
 		ret = jset_validate_entries(c, &i->j, READ);
 		if (ret)
 			goto fsck_err;
@@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
 		if (!degraded &&
 		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-		     fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
-						       i->devs, false), c,
-				 "superblock not marked as containing replicas (type %u)",
-				 BCH_DATA_JOURNAL))) {
-			ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+		     fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+				 "superblock not marked as containing replicas %s",
+				 (bch2_replicas_entry_to_text(&PBUF(buf),
+							      &replicas.e), buf)))) {
+			ret = bch2_mark_replicas(c, &replicas.e);
 			if (ret)
 				return ret;
 		}
@@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl)
 	struct journal_buf *w = journal_prev_buf(j);
 	struct bch_devs_list devs =
 		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+	struct bch_replicas_padded replicas;
 	u64 seq = le64_to_cpu(w->data->seq);
 	u64 last_seq = le64_to_cpu(w->data->last_seq);
 
@@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl)
 		goto err;
 	}
 
-	if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+
+	if (bch2_mark_replicas(c, &replicas.e))
 		goto err;
 
 	spin_lock(&j->lock);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index f24546dbf3ed..98345dcd1e67 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -388,7 +388,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_entry_pin_list *p;
-	struct bch_devs_list devs;
 	u64 iter, seq = 0;
 	int ret = 0;
 
@@ -413,12 +412,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 
 	spin_lock(&j->lock);
 	while (!ret && seq < j->pin.back) {
+		struct bch_replicas_padded replicas;
+
 		seq = max(seq, journal_last_seq(j));
-		devs = journal_seq_pin(j, seq)->devs;
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+					 journal_seq_pin(j, seq)->devs);
 		seq++;
 
 		spin_unlock(&j->lock);
-		ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+		ret = bch2_mark_replicas(c, &replicas.e);
 		spin_lock(&j->lock);
 	}
 	spin_unlock(&j->lock);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 63fe8cbb0564..b97a5a8f3910 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -5,6 +5,7 @@
 
 #include "bcachefs.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "extents.h"
 #include "io.h"
@@ -153,6 +154,16 @@ retry:
 
 	bch2_btree_iter_unlock(&iter);
 
+	/* flush relevant btree updates */
+	while (1) {
+		closure_wait_event(&c->btree_interior_update_wait,
+				   !bch2_btree_interior_updates_nr_pending(c) ||
+				   c->btree_roots_dirty);
+		if (!bch2_btree_interior_updates_nr_pending(c))
+			break;
+		bch2_journal_meta(&c->journal);
+	}
+
 	ret = 0;
 out:
 	ret = bch2_replicas_gc_end(c, ret);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 9081952316b0..5a35f76006cf 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -4,6 +4,7 @@
 #include "alloc_foreground.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "disk_groups.h"
 #include "inode.h"
@@ -762,6 +763,16 @@ int bch2_data_job(struct bch_fs *c,
 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
 		ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+
+		while (1) {
+			closure_wait_event(&c->btree_interior_update_wait,
+					   !bch2_btree_interior_updates_nr_pending(c) ||
+					   c->btree_roots_dirty);
+			if (!bch2_btree_interior_updates_nr_pending(c))
+				break;
+			bch2_journal_meta(&c->journal);
+		}
+
 		ret = bch2_gc_btree_replicas(c) ?: ret;
 
 		ret = bch2_move_data(c, NULL,
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index b63da1bef760..34a5475cfaba 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -4,11 +4,6 @@
 #include "replicas.h"
 #include "super-io.h"
 
-struct bch_replicas_padded {
-	struct bch_replicas_entry	e;
-	u8				pad[BCH_SB_MEMBERS_MAX];
-};
-
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 					    struct bch_replicas_cpu *);
 
@@ -19,6 +14,16 @@ static inline int u8_cmp(u8 l, u8 r)
 	return (l > r) - (l < r);
 }
 
+static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned i;
+
+	for (i = 0; i + 1 < e->nr_devs; i++)
+		BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
 static void replicas_entry_sort(struct bch_replicas_entry *e)
 {
 	bubble_sort(e->devs, e->nr_devs, u8_cmp);
@@ -29,19 +34,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e)
 	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
 	     _i = (void *) (_i) + (_r)->entry_size)
 
-static inline struct bch_replicas_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-	return (void *) r->entries + r->entry_size * i;
-}
-
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
 	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
 }
 
-static void replicas_entry_to_text(struct printbuf *out,
-				   struct bch_replicas_entry *e)
+void bch2_replicas_entry_to_text(struct printbuf *out,
+				 struct bch_replicas_entry *e)
 {
 	unsigned i;
 
@@ -66,7 +65,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
 			pr_buf(out, " ");
 		first = false;
 
-		replicas_entry_to_text(out, e);
+		bch2_replicas_entry_to_text(out, e);
 	}
 }
 
@@ -106,8 +105,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
 			r->devs[r->nr_devs++] = ptr->dev;
 }
 
-static void bkey_to_replicas(struct bkey_s_c k,
-			     struct bch_replicas_entry *e)
+static void bkey_to_replicas(struct bch_replicas_entry *e,
+			     struct bkey_s_c k)
 {
 	e->nr_devs = 0;
 
@@ -129,9 +128,9 @@ static void bkey_to_replicas(struct bkey_s_c k,
 	replicas_entry_sort(e);
 }
 
-static inline void devlist_to_replicas(struct bch_devs_list devs,
-				       enum bch_data_type data_type,
-				       struct bch_replicas_entry *e)
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+			      enum bch_data_type data_type,
+			      struct bch_devs_list devs)
 {
 	unsigned i;
 
@@ -160,6 +159,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 			replicas_entry_bytes(new_entry)),
 	};
 
+	BUG_ON(!new_entry->data_type);
+	verify_replicas_entry_sorted(new_entry);
+
 	new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
 	if (!new.entries)
 		return new;
@@ -177,21 +179,49 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 	return new;
 }
 
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+				       struct bch_replicas_entry *search)
+{
+	int idx, entry_size = replicas_entry_bytes(search);
+
+	if (unlikely(entry_size > r->entry_size))
+		return -1;
+
+	verify_replicas_entry_sorted(search);
+
+#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
+	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+			      entry_cmp, search);
+#undef entry_cmp
+
+	return idx < r->nr ? idx : -1;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *c,
+			    struct bch_replicas_entry *search)
+{
+	replicas_entry_sort(search);
+
+	return __replicas_entry_idx(&c->replicas, search);
+}
+
 static bool __replicas_has_entry(struct bch_replicas_cpu *r,
 				 struct bch_replicas_entry *search)
 {
-	return replicas_entry_bytes(search) <= r->entry_size &&
-		eytzinger0_find(r->entries, r->nr,
-				r->entry_size,
-				memcmp, search) < r->nr;
+	return __replicas_entry_idx(r, search) >= 0;
 }
 
-static bool replicas_has_entry(struct bch_fs *c,
-			       struct bch_replicas_entry *search,
-			       bool check_gc_replicas)
+bool bch2_replicas_marked(struct bch_fs *c,
+			  struct bch_replicas_entry *search,
+			  bool check_gc_replicas)
 {
 	bool marked;
 
+	if (!search->nr_devs)
+		return true;
+
+	verify_replicas_entry_sorted(search);
+
 	percpu_down_read(&c->mark_lock);
 	marked = __replicas_has_entry(&c->replicas, search) &&
 		(!check_gc_replicas ||
@@ -202,6 +232,76 @@ static bool replicas_has_entry(struct bch_fs *c,
 	return marked;
 }
 
+static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage __percpu *src_p,
+				    struct bch_replicas_cpu *src_r)
+{
+	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+	struct bch_fs_usage *dst, *src = (void *)
+		bch2_acc_percpu_u64s((void *) src_p, src_nr);
+	int src_idx, dst_idx;
+
+	preempt_disable();
+	dst = this_cpu_ptr(dst_p);
+	preempt_enable();
+
+	*dst = *src;
+
+	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+		if (!src->data[src_idx])
+			continue;
+
+		dst_idx = __replicas_entry_idx(dst_r,
+				cpu_replicas_entry(src_r, src_idx));
+		BUG_ON(dst_idx < 0);
+
+		dst->data[dst_idx] = src->data[src_idx];
+	}
+}
+
+/*
+ * Resize filesystem accounting:
+ */
+static int replicas_table_update(struct bch_fs *c,
+				 struct bch_replicas_cpu *new_r)
+{
+	struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL };
+	unsigned bytes = sizeof(struct bch_fs_usage) +
+		sizeof(u64) * new_r->nr;
+	unsigned i;
+	int ret = -ENOMEM;
+
+	for (i = 0; i < 3; i++) {
+		if (i < 2 && !c->usage[i])
+			continue;
+
+		new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64),
+						  GFP_NOIO);
+		if (!new_usage[i])
+			goto err;
+	}
+
+	for (i = 0; i < 2; i++) {
+		if (!c->usage[i])
+			continue;
+
+		__replicas_table_update(new_usage[i],	new_r,
+					c->usage[i],	&c->replicas);
+
+		swap(c->usage[i], new_usage[i]);
+	}
+
+	swap(c->usage_scratch, new_usage[2]);
+
+	swap(c->replicas, *new_r);
+	ret = 0;
+err:
+	for (i = 0; i < 3; i++)
+		free_percpu(new_usage[i]);
+	return ret;
+}
+
 noinline
 static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 				struct bch_replicas_entry *new_entry)
@@ -243,7 +343,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	/* don't update in memory replicas until changes are persistent */
 	percpu_down_write(&c->mark_lock);
 	if (new_r.entries)
-		swap(new_r, c->replicas);
+		ret = replicas_table_update(c, &new_r);
 	if (new_gc.entries)
 		swap(new_gc, c->replicas_gc);
 	percpu_up_write(&c->mark_lock);
@@ -258,30 +358,32 @@ err:
 	return ret;
 }
 
-static int __bch2_mark_replicas(struct bch_fs *c,
-				struct bch_replicas_entry *devs)
+int bch2_mark_replicas(struct bch_fs *c,
+		       struct bch_replicas_entry *r)
 {
-	return likely(replicas_has_entry(c, devs, true))
+	return likely(bch2_replicas_marked(c, r, true))
 		? 0
-		: bch2_mark_replicas_slowpath(c, devs);
+		: bch2_mark_replicas_slowpath(c, r);
 }
 
-int bch2_mark_replicas(struct bch_fs *c,
-		       enum bch_data_type data_type,
-		       struct bch_devs_list devs)
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+			       struct bkey_s_c k,
+			       bool check_gc_replicas)
 {
 	struct bch_replicas_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
 
-	if (!devs.nr)
-		return 0;
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-	memset(&search, 0, sizeof(search));
+		if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
+			return false;
+	}
 
-	BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+	bkey_to_replicas(&search.e, k);
 
-	devlist_to_replicas(devs, data_type, &search.e);
-
-	return __bch2_mark_replicas(c, &search.e);
+	return bch2_replicas_marked(c, &search.e, check_gc_replicas);
 }
 
 int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
@@ -291,22 +393,23 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 	unsigned i;
 	int ret;
 
-	memset(&search, 0, sizeof(search));
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-	for (i = 0; i < cached.nr; i++)
-		if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
-					      bch2_dev_list_single(cached.devs[i]))))
+		ret = bch2_mark_replicas(c, &search.e);
+		if (ret)
 			return ret;
+	}
 
-	bkey_to_replicas(k, &search.e);
+	bkey_to_replicas(&search.e, k);
 
-	return search.e.nr_devs
-		? __bch2_mark_replicas(c, &search.e)
-		: 0;
+	return bch2_mark_replicas(c, &search.e);
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 {
+	unsigned i;
+
 	lockdep_assert_held(&c->replicas_gc_lock);
 
 	mutex_lock(&c->sb_lock);
@@ -314,6 +417,39 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 	if (ret)
 		goto err;
 
+	/*
+	 * this is kind of crappy; the replicas gc mechanism needs to be ripped
+	 * out
+	 */
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct bch_replicas_cpu n;
+		u64 v = 0;
+		int cpu;
+
+		if (__replicas_has_entry(&c->replicas_gc, e))
+			continue;
+
+		for_each_possible_cpu(cpu)
+			v += *per_cpu_ptr(&c->usage[0]->data[i], cpu);
+		if (!v)
+			continue;
+
+		n = cpu_replicas_add_entry(&c->replicas_gc, e);
+		if (!n.entries) {
+			ret = -ENOSPC;
+			goto err;
+		}
+
+		percpu_down_write(&c->mark_lock);
+		swap(n, c->replicas_gc);
+		percpu_up_write(&c->mark_lock);
+
+		kfree(n.entries);
+	}
+
 	if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
 		ret = -ENOSPC;
 		goto err;
@@ -325,7 +461,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 err:
 	percpu_down_write(&c->mark_lock);
 	if (!ret)
-		swap(c->replicas, c->replicas_gc);
+		ret = replicas_table_update(c, &c->replicas_gc);
 
 	kfree(c->replicas_gc.entries);
 	c->replicas_gc.entries = NULL;
@@ -461,7 +597,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
 	bch2_cpu_replicas_sort(&new_r);
 
 	percpu_down_write(&c->mark_lock);
-	swap(c->replicas, new_r);
+	ret = replicas_table_update(c, &new_r);
 	percpu_up_write(&c->mark_lock);
 
 	kfree(new_r.entries);
@@ -628,7 +764,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
 			pr_buf(out, " ");
 		first = false;
 
-		replicas_entry_to_text(out, e);
+		bch2_replicas_entry_to_text(out, e);
 	}
 }
 
@@ -677,46 +813,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
 
 /* Query replicas: */
 
-bool bch2_replicas_marked(struct bch_fs *c,
-			  enum bch_data_type data_type,
-			  struct bch_devs_list devs,
-			  bool check_gc_replicas)
-{
-	struct bch_replicas_padded search;
-
-	if (!devs.nr)
-		return true;
-
-	memset(&search, 0, sizeof(search));
-
-	devlist_to_replicas(devs, data_type, &search.e);
-
-	return replicas_has_entry(c, &search.e, check_gc_replicas);
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-			       struct bkey_s_c k,
-			       bool check_gc_replicas)
-{
-	struct bch_replicas_padded search;
-	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-	unsigned i;
-
-	memset(&search, 0, sizeof(search));
-
-	for (i = 0; i < cached.nr; i++)
-		if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
-					  bch2_dev_list_single(cached.devs[i]),
-					  check_gc_replicas))
-			return false;
-
-	bkey_to_replicas(k, &search.e);
-
-	return search.e.nr_devs
-		? replicas_has_entry(c, &search.e, check_gc_replicas)
-		: true;
-}
-
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
 					      struct bch_devs_mask online_devs)
 {
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 03aaafdc7c17..923bddb21ec3 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -2,17 +2,42 @@
 #ifndef _BCACHEFS_REPLICAS_H
 #define _BCACHEFS_REPLICAS_H
 
+#include "eytzinger.h"
 #include "replicas_types.h"
 
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
-			  struct bch_devs_list, bool);
+void bch2_replicas_entry_to_text(struct printbuf *,
+				 struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+	return (void *) r->entries + r->entry_size * i;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+			    struct bch_replicas_entry *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+			      enum bch_data_type,
+			      struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *,
+			  struct bch_replicas_entry *, bool);
+int bch2_mark_replicas(struct bch_fs *,
+		       struct bch_replicas_entry *);
+
 bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c, bool);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
-		       struct bch_devs_list);
 int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+					      unsigned dev)
+{
+	e->data_type	= BCH_DATA_CACHED;
+	e->nr_devs	= 1;
+	e->nr_required	= 1;
+	e->devs[0]	= dev;
+}
 
 struct replicas_status {
 	struct {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 55069f40d04b..9a862b19ce22 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -375,6 +375,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	bch2_io_clock_exit(&c->io_clock[READ]);
 	bch2_fs_compress_exit(c);
 	percpu_free_rwsem(&c->mark_lock);
+	free_percpu(c->usage_scratch);
 	free_percpu(c->usage[0]);
 	free_percpu(c->pcpu);
 	mempool_exit(&c->btree_iters_pool);
@@ -506,7 +507,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 {
 	struct bch_sb_field_members *mi;
 	struct bch_fs *c;
-	unsigned i, iter_size;
+	unsigned i, iter_size, fs_usage_size;
 	const char *err;
 
 	pr_verbose_init(opts, "");
@@ -600,6 +601,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 		(btree_blocks(c) + 1) * 2 *
 		sizeof(struct btree_node_iter_set);
 
+	fs_usage_size = sizeof(struct bch_fs_usage) +
+		sizeof(u64) * c->replicas.nr;
+
 	if (!(c->wq = alloc_workqueue("bcachefs",
 				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
 	    !(c->copygc_wq = alloc_workqueue("bcache_copygc",
@@ -616,7 +620,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			max(offsetof(struct btree_read_bio, bio),
 			    offsetof(struct btree_write_bio, wbio.bio)),
 			BIOSET_NEED_BVECS) ||
-	    !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
+	    !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
+	    !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 27fd6dfe83f5..424636310bbf 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -234,33 +234,45 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 {
 	struct printbuf out = _PBUF(buf, PAGE_SIZE);
-	struct bch_fs_usage stats = bch2_fs_usage_read(c);
-	unsigned replicas, type;
+	struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
+	unsigned i;
+
+	if (!fs_usage)
+		return -ENOMEM;
 
 	pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
 
-	for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) {
-		pr_buf(&out, "%u replicas:\n", replicas + 1);
-
+	for (i = 0;
+	     i < ARRAY_SIZE(fs_usage->persistent_reserved);
+	     i++) {
+		pr_buf(&out, "%u replicas:\n", i + 1);
+#if 0
 		for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
 			pr_buf(&out, "\t%s:\t\t%llu\n",
 			       bch2_data_types[type],
 			       stats.replicas[replicas].data[type]);
 		pr_buf(&out, "\terasure coded:\t%llu\n",
 		       stats.replicas[replicas].ec_data);
+#endif
 		pr_buf(&out, "\treserved:\t%llu\n",
-		       stats.replicas[replicas].persistent_reserved);
+		       fs_usage->persistent_reserved[i]);
 	}
 
-	pr_buf(&out, "bucket usage\n");
-
-	for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
-		pr_buf(&out, "\t%s:\t\t%llu\n",
-		       bch2_data_types[type],
-		       stats.buckets[type]);
-
 	pr_buf(&out, "online reserved:\t%llu\n",
-	       stats.s.online_reserved);
+	       fs_usage->s.online_reserved);
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		pr_buf(&out, "\t");
+		bch2_replicas_entry_to_text(&out, e);
+		pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+	}
+
+	percpu_up_read(&c->mark_lock);
+
+	kfree(fs_usage);
 
 	return out.pos - buf;
 }
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 9f3eafb3e0d4..295f4577e9c1 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -904,3 +904,28 @@ void eytzinger0_find_test(void)
 	kfree(test_array);
 }
 #endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when access
+ * against any percpu counter is guarded against
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+	u64 *ret;
+	int cpu;
+
+	preempt_disable();
+	ret = this_cpu_ptr(p);
+	preempt_enable();
+
+	for_each_possible_cpu(cpu) {
+		u64 *i = per_cpu_ptr(p, cpu);
+
+		if (i != ret) {
+			acc_u64s(ret, i, nr);
+			memset(i, 0, nr * sizeof(u64));
+		}
+	}
+
+	return ret;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 8bbb0e30d07f..fa1a3adc87df 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -718,4 +718,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
 		acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
 }
 
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
 #endif /* _BCACHEFS_UTIL_H */
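
A sketch of the read-side pattern this patch introduces, mirroring the sysfs.c
hunk above: bch2_fs_usage_read() now returns a kzalloc'd snapshot sized by
c->replicas.nr (or NULL on allocation failure) and returns with c->mark_lock
still held for reading; the caller walks the snapshot under the lock, then
drops the lock and kfree()s it. example_show_usage() below is a hypothetical
consumer written only to illustrate that contract, not code from this patch:

static int example_show_usage(struct bch_fs *c)
{
	struct bch_fs_usage *u = bch2_fs_usage_read(c);
	unsigned i;

	if (!u)
		return -ENOMEM;

	/*
	 * mark_lock is still held for reading here, so c->replicas.nr
	 * cannot grow out from under the snapshot while we walk it:
	 */
	for (i = 0; i < c->replicas.nr; i++)
		pr_info("replicas entry %u: %llu sectors", i, u->data[i]);

	pr_info("used: %llu of %llu sectors",
		bch2_fs_sectors_used(c, *u), c->capacity);

	percpu_up_read(&c->mark_lock);
	kfree(u);	/* the snapshot is a plain heap allocation, not percpu */
	return 0;
}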