mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-04-13 09:49:06 +08:00
Merge tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs
Pull bcachefs fixes from Kent Overstreet:
"The main user-reported ones are:
- Fix a btree iterator locking inconsistency that's been causing us
to go emergency read-only in evacuate: 'Fix broken btree_path lock
invariants in next_node()'
- Minor btree node cache reclaim tweak that should help with OOMs:
don't set btree nodes as accessed on fill
- Fix a bch2_bkey_clear_needs_rebalance() issue that was causing rebalance
to do needless work"
* tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs:
bcachefs: fix wrong arg to fsck_err()
bcachefs: Fix missing commit in backpointer to missing target
bcachefs: Fix accidental O(n^2) in fiemap
bcachefs: Fix set_should_be_locked() call in peek_slot()
bcachefs: Fix self deadlock
bcachefs: Don't set btree nodes as accessed on fill
bcachefs: Fix livelock in journal_entry_open()
bcachefs: Fix broken btree_path lock invariants in next_node()
bcachefs: Don't strip rebalance_opts from indirect extents
This commit is contained in:
@@ -192,7 +192,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
|
||||
static int backpointer_target_not_found(struct btree_trans *trans,
|
||||
struct bkey_s_c_backpointer bp,
|
||||
struct bkey_s_c target_k,
|
||||
struct bkey_buf *last_flushed)
|
||||
struct bkey_buf *last_flushed,
|
||||
bool commit)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
@@ -228,18 +229,77 @@ static int backpointer_target_not_found(struct btree_trans *trans,
|
||||
}
|
||||
|
||||
if (fsck_err(trans, backpointer_to_missing_ptr,
|
||||
"%s", buf.buf))
|
||||
"%s", buf.buf)) {
|
||||
ret = bch2_backpointer_del(trans, bp.k->p);
|
||||
if (ret || !commit)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Normally, on transaction commit from inside a transaction,
|
||||
* we'll return -BCH_ERR_transaction_restart_nested, since a
|
||||
* transaction commit invalidates pointers given out by peek().
|
||||
*
|
||||
* However, since we're updating a write buffer btree, if we
|
||||
* return a transaction restart and loop we won't see that the
|
||||
* backpointer has been deleted without an additional write
|
||||
* buffer flush - and those are expensive.
|
||||
*
|
||||
* So we're relying on the caller immediately advancing to the
|
||||
* next backpointer and starting a new transaction immediately
|
||||
* after backpointer_get_key() returns NULL:
|
||||
*/
|
||||
ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
||||
}
|
||||
out:
|
||||
fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
|
||||
struct bkey_s_c_backpointer bp,
|
||||
struct btree_iter *iter,
|
||||
unsigned iter_flags,
|
||||
struct bkey_buf *last_flushed)
|
||||
/*
 * Resolve a btree-node backpointer to the btree node it points at.
 *
 * Only valid for backpointers with a non-zero level (BUG_ON otherwise). A node
 * iterator is initialised on bp.v->btree_id at bp.v->pos, level
 * bp.v->level - 1, and the node found there is compared against @bp with
 * extent_matches_bp().
 *
 * Return value:
 *  - the node, when it matches @bp; @iter is NOT exited on this path
 *  - whatever bch2_btree_iter_peek_node() returned, when that is NULL or an
 *    error pointer
 *  - ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node) when the node
 *    does not match but btree_node_will_make_reachable(b) is true
 *  - otherwise the outcome of backpointer_target_not_found(): ERR_PTR(ret)
 *    for a non-zero ret, NULL for zero
 *
 * @iter is exited before returning on every path except the successful match.
 * @last_flushed and @commit are passed through unchanged to
 * backpointer_target_not_found().
 */
static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
|
||||
struct bkey_s_c_backpointer bp,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_buf *last_flushed,
|
||||
bool commit)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
|
||||
BUG_ON(!bp.v->level);
|
||||
|
||||
bch2_trans_node_iter_init(trans, iter,
|
||||
bp.v->btree_id,
|
||||
bp.v->pos,
|
||||
0,
|
||||
bp.v->level - 1,
|
||||
0);
|
||||
struct btree *b = bch2_btree_iter_peek_node(trans, iter);
|
||||
if (IS_ERR_OR_NULL(b))
|
||||
goto err;
|
||||
|
||||
BUG_ON(b->c.level != bp.v->level - 1);
|
||||
|
||||
if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
|
||||
bkey_i_to_s_c(&b->key), bp))
|
||||
/* match: return the node directly, skipping the iterator exit at err: */
return b;
|
||||
|
||||
if (btree_node_will_make_reachable(b)) {
|
||||
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
|
||||
} else {
|
||||
int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
|
||||
last_flushed, commit);
|
||||
b = ret ? ERR_PTR(ret) : NULL;
|
||||
}
|
||||
err:
|
||||
bch2_trans_iter_exit(trans, iter);
|
||||
return b;
|
||||
}
|
||||
|
||||
static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
|
||||
struct bkey_s_c_backpointer bp,
|
||||
struct btree_iter *iter,
|
||||
unsigned iter_flags,
|
||||
struct bkey_buf *last_flushed,
|
||||
bool commit)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
|
||||
@@ -277,10 +337,10 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
|
||||
bch2_trans_iter_exit(trans, iter);
|
||||
|
||||
if (!bp.v->level) {
|
||||
int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
|
||||
int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
|
||||
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
|
||||
} else {
|
||||
struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
|
||||
struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
|
||||
if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
|
||||
return bkey_s_c_null;
|
||||
if (IS_ERR_OR_NULL(b))
|
||||
@@ -295,35 +355,16 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_buf *last_flushed)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
|
||||
}
|
||||
|
||||
BUG_ON(!bp.v->level);
|
||||
|
||||
bch2_trans_node_iter_init(trans, iter,
|
||||
bp.v->btree_id,
|
||||
bp.v->pos,
|
||||
0,
|
||||
bp.v->level - 1,
|
||||
0);
|
||||
struct btree *b = bch2_btree_iter_peek_node(trans, iter);
|
||||
if (IS_ERR_OR_NULL(b))
|
||||
goto err;
|
||||
|
||||
BUG_ON(b->c.level != bp.v->level - 1);
|
||||
|
||||
if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
|
||||
bkey_i_to_s_c(&b->key), bp))
|
||||
return b;
|
||||
|
||||
if (btree_node_will_make_reachable(b)) {
|
||||
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
|
||||
} else {
|
||||
int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
|
||||
b = ret ? ERR_PTR(ret) : NULL;
|
||||
}
|
||||
err:
|
||||
bch2_trans_iter_exit(trans, iter);
|
||||
return b;
|
||||
/*
 * Public entry point for looking up the key a backpointer refers to.
 *
 * Forwards every argument to __bch2_backpointer_get_key() and always passes
 * commit = true; the return value is that helper's return value.
 */
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
|
||||
struct bkey_s_c_backpointer bp,
|
||||
struct btree_iter *iter,
|
||||
unsigned iter_flags,
|
||||
struct bkey_buf *last_flushed)
|
||||
{
|
||||
return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
|
||||
}
|
||||
|
||||
static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
|
||||
@@ -521,7 +562,7 @@ check_existing_bp:
|
||||
struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
|
||||
|
||||
struct bkey_s_c other_extent =
|
||||
bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
|
||||
__bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
|
||||
ret = bkey_err(other_extent);
|
||||
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
|
||||
ret = 0;
|
||||
|
||||
@@ -852,7 +852,6 @@ out:
|
||||
b->sib_u64s[1] = 0;
|
||||
b->whiteout_u64s = 0;
|
||||
bch2_btree_keys_init(b);
|
||||
set_btree_node_accessed(b);
|
||||
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
||||
start_time);
|
||||
@@ -1286,6 +1285,10 @@ lock_node:
|
||||
six_unlock_read(&b->c.lock);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
/* avoid atomic set bit if it's not needed: */
|
||||
if (!btree_node_accessed(b))
|
||||
set_btree_node_accessed(b);
|
||||
}
|
||||
|
||||
/* XXX: waiting on IO with btree locks held: */
|
||||
@@ -1301,10 +1304,6 @@ lock_node:
|
||||
prefetch(p + L1_CACHE_BYTES * 2);
|
||||
}
|
||||
|
||||
/* avoid atomic set bit if it's not needed: */
|
||||
if (!btree_node_accessed(b))
|
||||
set_btree_node_accessed(b);
|
||||
|
||||
if (unlikely(btree_node_read_error(b))) {
|
||||
six_unlock_read(&b->c.lock);
|
||||
b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
|
||||
|
||||
@@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't correctly handle nodes with extra intent locks here:
|
||||
* downgrade so we don't violate locking invariants
|
||||
*/
|
||||
bch2_btree_path_downgrade(trans, path);
|
||||
|
||||
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
|
||||
__bch2_btree_path_unlock(trans, path);
|
||||
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
|
||||
@@ -2743,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
|
||||
ret = trans_maybe_inject_restart(trans, _RET_IP_);
|
||||
if (unlikely(ret)) {
|
||||
k = bkey_s_c_err(ret);
|
||||
goto out_no_locked;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* extents can't span inode numbers: */
|
||||
@@ -2763,13 +2769,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
|
||||
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
||||
if (unlikely(ret)) {
|
||||
k = bkey_s_c_err(ret);
|
||||
goto out_no_locked;
|
||||
goto out;
|
||||
}
|
||||
|
||||
struct btree_path *path = btree_iter_path(trans, iter);
|
||||
if (unlikely(!btree_path_node(path, path->level)))
|
||||
return bkey_s_c_null;
|
||||
|
||||
btree_path_set_should_be_locked(trans, path);
|
||||
|
||||
if ((iter->flags & BTREE_ITER_cached) ||
|
||||
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
|
||||
k = bkey_s_c_null;
|
||||
@@ -2790,12 +2798,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
|
||||
if (!bkey_err(k))
|
||||
iter->k = *k.k;
|
||||
/* We're not returning a key from iter->path: */
|
||||
goto out_no_locked;
|
||||
goto out;
|
||||
}
|
||||
|
||||
k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
|
||||
k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
|
||||
if (unlikely(!k.k))
|
||||
goto out_no_locked;
|
||||
goto out;
|
||||
|
||||
if (unlikely(k.k->type == KEY_TYPE_whiteout &&
|
||||
(iter->flags & BTREE_ITER_filter_snapshots) &&
|
||||
@@ -2833,7 +2841,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
|
||||
}
|
||||
|
||||
if (unlikely(bkey_err(k)))
|
||||
goto out_no_locked;
|
||||
goto out;
|
||||
|
||||
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
|
||||
|
||||
@@ -2855,8 +2863,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
|
||||
}
|
||||
}
|
||||
out:
|
||||
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
|
||||
out_no_locked:
|
||||
bch2_btree_iter_verify_entry_exit(iter);
|
||||
bch2_btree_iter_verify(trans, iter);
|
||||
ret = bch2_btree_iter_verify_ret(trans, iter, k);
|
||||
|
||||
@@ -376,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Insert an in-memory accounting entry for key @a.
 *
 * This is the variant bch2_accounting_mem_mod_locked() selects when its
 * write_locked argument is true.
 * NOTE(review): the _locked suffix and the use of
 * bch2_replicas_marked_locked() suggest the caller must already hold a lock
 * (presumably mark_lock for write) - confirm against the callers.
 *
 * For any mode other than BCH_ACCOUNTING_read: if
 * accounting_to_replicas(&r.e, a.k->p) returns true and
 * bch2_replicas_marked_locked(c, &r.e) returns false, nothing is inserted and
 * -BCH_ERR_btree_insert_need_mark_replicas is returned.
 *
 * Otherwise returns the result of __bch2_accounting_mem_insert(c, a).
 */
int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
|
||||
enum bch_accounting_mode mode)
|
||||
{
|
||||
struct bch_replicas_padded r;
|
||||
|
||||
if (mode != BCH_ACCOUNTING_read &&
|
||||
accounting_to_replicas(&r.e, a.k->p) &&
|
||||
!bch2_replicas_marked_locked(c, &r.e))
|
||||
return -BCH_ERR_btree_insert_need_mark_replicas;
|
||||
|
||||
return __bch2_accounting_mem_insert(c, a);
|
||||
}
|
||||
|
||||
static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
|
||||
{
|
||||
for (unsigned i = 0; i < e->nr_counters; i++)
|
||||
@@ -583,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
|
||||
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
|
||||
bch2_accounting_mem_mod_locked(trans,
|
||||
bkey_i_to_s_c_accounting(&k_i.k),
|
||||
BCH_ACCOUNTING_normal);
|
||||
BCH_ACCOUNTING_normal, true);
|
||||
|
||||
preempt_disable();
|
||||
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
|
||||
@@ -612,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
|
||||
|
||||
percpu_down_read(&c->mark_lock);
|
||||
int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
|
||||
BCH_ACCOUNTING_read);
|
||||
BCH_ACCOUNTING_read, false);
|
||||
percpu_up_read(&c->mark_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -136,6 +136,7 @@ enum bch_accounting_mode {
|
||||
};
|
||||
|
||||
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
|
||||
int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
|
||||
void bch2_accounting_mem_gc(struct bch_fs *);
|
||||
|
||||
static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
|
||||
@@ -150,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
|
||||
*/
|
||||
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
|
||||
struct bkey_s_c_accounting a,
|
||||
enum bch_accounting_mode mode)
|
||||
enum bch_accounting_mode mode,
|
||||
bool write_locked)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bch_accounting_mem *acc = &c->accounting;
|
||||
@@ -189,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
|
||||
|
||||
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
|
||||
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
|
||||
int ret = bch2_accounting_mem_insert(c, a, mode);
|
||||
int ret = 0;
|
||||
if (unlikely(write_locked))
|
||||
ret = bch2_accounting_mem_insert_locked(c, a, mode);
|
||||
else
|
||||
ret = bch2_accounting_mem_insert(c, a, mode);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@@ -206,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
|
||||
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
|
||||
{
|
||||
percpu_down_read(&trans->c->mark_lock);
|
||||
int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
|
||||
int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
|
||||
percpu_up_read(&trans->c->mark_lock);
|
||||
return ret;
|
||||
}
|
||||
@@ -259,7 +265,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
|
||||
EBUG_ON(bversion_zero(a->k.bversion));
|
||||
|
||||
return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
|
||||
? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
|
||||
? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
|
||||
: 0;
|
||||
}
|
||||
|
||||
@@ -271,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans
|
||||
struct bkey_s_accounting a = accounting_i_to_s(a_i);
|
||||
|
||||
bch2_accounting_neg(a);
|
||||
bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
|
||||
bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
|
||||
bch2_accounting_neg(a);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1429,7 +1429,9 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans,
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
|
||||
u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
|
||||
|
||||
ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
|
||||
@@ -2446,7 +2446,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
|
||||
u32 parent = le32_to_cpu(s.v->fs_path_parent);
|
||||
|
||||
if (darray_u32_has(&subvol_path, parent)) {
|
||||
if (fsck_err(c, subvol_loop, "subvolume loop"))
|
||||
if (fsck_err(trans, subvol_loop, "subvolume loop"))
|
||||
ret = reattach_subvol(trans, s);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
static bool __should_discard_bucket(struct journal *, struct journal_device *);
|
||||
|
||||
/* Free space calculations: */
|
||||
|
||||
static unsigned journal_space_from(struct journal_device *ja,
|
||||
@@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j)
|
||||
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
|
||||
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
|
||||
|
||||
if (ja->discard_idx != ja->dirty_idx_ondisk)
|
||||
can_discard = true;
|
||||
can_discard |= __should_discard_bucket(j, ja);
|
||||
|
||||
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
|
||||
nr_online++;
|
||||
@@ -264,13 +265,19 @@ out:
|
||||
|
||||
/* Discards - last part of journal reclaim: */
|
||||
|
||||
/*
 * Decide whether journal device @ja should have a bucket discarded.
 *
 * Returns true only when both hold:
 *  - bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) is
 *    below min_free, where min_free = max(4, ja->nr / 8)
 *  - ja->discard_idx != ja->dirty_idx_ondisk
 *
 * Takes no lock itself; should_discard_bucket() calls it with j->lock held.
 */
static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
|
||||
{
|
||||
unsigned min_free = max(4, ja->nr / 8);
|
||||
|
||||
return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
|
||||
min_free &&
|
||||
ja->discard_idx != ja->dirty_idx_ondisk;
|
||||
}
|
||||
|
||||
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
unsigned min_free = max(4, ja->nr / 8);
|
||||
|
||||
bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free &&
|
||||
ja->discard_idx != ja->dirty_idx_ondisk;
|
||||
bool ret = __should_discard_bucket(j, ja);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
if (!bch2_bkey_rebalance_opts(k))
|
||||
if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
|
||||
return 0;
|
||||
|
||||
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
|
||||
|
||||
Reference in New Issue
Block a user