Merge tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "The main user-reported ones are:

   - Fix a btree iterator locking inconsistency that's been causing us
     to go emergency read-only in evacuate: "Fix broken btree_path lock
     invariants in next_node()"

   - Minor btree node cache reclaim tweak that should help with OOMs:
     don't set btree nodes as accessed on fill

   - Fix a bch2_bkey_clear_needs_rebalance() issue that was causing
     rebalance to do needless work"

* tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs:
  bcachefs: fix wrong arg to fsck_err()
  bcachefs: Fix missing commit in backpointer to missing target
  bcachefs: Fix accidental O(n^2) in fiemap
  bcachefs: Fix set_should_be_locked() call in peek_slot()
  bcachefs: Fix self deadlock
  bcachefs: Don't set btree nodes as accessed on fill
  bcachefs: Fix livelock in journal_entry_open()
  bcachefs: Fix broken btree_path lock invariants in next_node()
  bcachefs: Don't strip rebalance_opts from indirect extents
This commit is contained in:
Linus Torvalds
2025-05-15 14:20:48 -07:00
9 changed files with 141 additions and 67 deletions

View File

@@ -192,7 +192,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
static int backpointer_target_not_found(struct btree_trans *trans,
struct bkey_s_c_backpointer bp,
struct bkey_s_c target_k,
struct bkey_buf *last_flushed)
struct bkey_buf *last_flushed,
bool commit)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
@@ -228,18 +229,77 @@ static int backpointer_target_not_found(struct btree_trans *trans,
}
if (fsck_err(trans, backpointer_to_missing_ptr,
"%s", buf.buf))
"%s", buf.buf)) {
ret = bch2_backpointer_del(trans, bp.k->p);
if (ret || !commit)
goto out;
/*
* Normally, on transaction commit from inside a transaction,
* we'll return -BCH_ERR_transaction_restart_nested, since a
* transaction commit invalidates pointers given out by peek().
*
* However, since we're updating a write buffer btree, if we
* return a transaction restart and loop we won't see that the
* backpointer has been deleted without an additional write
* buffer flush - and those are expensive.
*
* So we're relying on the caller immediately advancing to the
* next backpointer and starting a new transaction immediately
* after backpointer_get_key() returns NULL:
*/
ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
out:
fsck_err:
printbuf_exit(&buf);
return ret;
}
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
struct bkey_s_c_backpointer bp,
struct btree_iter *iter,
unsigned iter_flags,
struct bkey_buf *last_flushed)
/*
 * Look up the btree node that backpointer @bp refers to.
 *
 * @bp must point at a btree node (bp.v->level != 0, enforced by BUG_ON); the
 * node iterator is initialized at bp.v->pos in bp.v->btree_id, one level
 * below bp.v->level.
 *
 * Returns:
 *  - the node, with @iter still initialized, when its key matches @bp
 *  - whatever bch2_btree_iter_peek_node() returned when that is NULL or an
 *    error pointer
 *  - ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node) when the node
 *    does not match and btree_node_will_make_reachable() is true for it
 *  - otherwise the outcome of backpointer_target_not_found(): an ERR_PTR if
 *    it returned nonzero, NULL if it returned zero
 *
 * On every path except the first, @iter has been exited before returning.
 * @commit is passed straight through to backpointer_target_not_found().
 */
static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
						 struct bkey_s_c_backpointer bp,
						 struct btree_iter *iter,
						 struct bkey_buf *last_flushed,
						 bool commit)
{
	struct bch_fs *c = trans->c;

	BUG_ON(!bp.v->level);

	bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos,
				  0, bp.v->level - 1, 0);

	struct btree *b = bch2_btree_iter_peek_node(trans, iter);

	if (!IS_ERR_OR_NULL(b)) {
		BUG_ON(b->c.level != bp.v->level - 1);

		/* Match: hand the node back with the iterator still live */
		if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
				      bkey_i_to_s_c(&b->key), bp))
			return b;

		if (!btree_node_will_make_reachable(b)) {
			int ret = backpointer_target_not_found(trans, bp,
							bkey_i_to_s_c(&b->key),
							last_flushed, commit);

			b = ret ? ERR_PTR(ret) : NULL;
		} else {
			b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
		}
	}

	/* No usable node: release the iterator before reporting the result */
	bch2_trans_iter_exit(trans, iter);
	return b;
}
static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
struct bkey_s_c_backpointer bp,
struct btree_iter *iter,
unsigned iter_flags,
struct bkey_buf *last_flushed,
bool commit)
{
struct bch_fs *c = trans->c;
@@ -277,10 +337,10 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
bch2_trans_iter_exit(trans, iter);
if (!bp.v->level) {
int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
} else {
struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
return bkey_s_c_null;
if (IS_ERR_OR_NULL(b))
@@ -295,35 +355,16 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
}
BUG_ON(!bp.v->level);
bch2_trans_node_iter_init(trans, iter,
bp.v->btree_id,
bp.v->pos,
0,
bp.v->level - 1,
0);
struct btree *b = bch2_btree_iter_peek_node(trans, iter);
if (IS_ERR_OR_NULL(b))
goto err;
BUG_ON(b->c.level != bp.v->level - 1);
if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
bkey_i_to_s_c(&b->key), bp))
return b;
if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
b = ret ? ERR_PTR(ret) : NULL;
}
err:
bch2_trans_iter_exit(trans, iter);
return b;
/*
 * External entry point for looking up the key a backpointer refers to:
 * identical to __bch2_backpointer_get_key() with @commit fixed to true.
 */
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
					 struct bkey_s_c_backpointer bp,
					 struct btree_iter *iter,
					 unsigned iter_flags,
					 struct bkey_buf *last_flushed)
{
	const bool commit = true;

	return __bch2_backpointer_get_key(trans, bp, iter, iter_flags,
					  last_flushed, commit);
}
static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
@@ -521,7 +562,7 @@ check_existing_bp:
struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
struct bkey_s_c other_extent =
bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
__bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
ret = bkey_err(other_extent);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
ret = 0;

View File

@@ -852,7 +852,6 @@ out:
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
bch2_btree_keys_init(b);
set_btree_node_accessed(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
@@ -1286,6 +1285,10 @@ lock_node:
six_unlock_read(&b->c.lock);
goto retry;
}
/* avoid atomic set bit if it's not needed: */
if (!btree_node_accessed(b))
set_btree_node_accessed(b);
}
/* XXX: waiting on IO with btree locks held: */
@@ -1301,10 +1304,6 @@ lock_node:
prefetch(p + L1_CACHE_BYTES * 2);
}
/* avoid atomic set bit if it's not needed: */
if (!btree_node_accessed(b))
set_btree_node_accessed(b);
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);

View File

@@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_
return NULL;
}
/*
* We don't correctly handle nodes with extra intent locks here:
* downgrade so we don't violate locking invariants
*/
bch2_btree_path_downgrade(trans, path);
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
__bch2_btree_path_unlock(trans, path);
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
@@ -2743,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
goto out;
}
/* extents can't span inode numbers: */
@@ -2763,13 +2769,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
goto out;
}
struct btree_path *path = btree_iter_path(trans, iter);
if (unlikely(!btree_path_node(path, path->level)))
return bkey_s_c_null;
btree_path_set_should_be_locked(trans, path);
if ((iter->flags & BTREE_ITER_cached) ||
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
k = bkey_s_c_null;
@@ -2790,12 +2798,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
if (!bkey_err(k))
iter->k = *k.k;
/* We're not returning a key from iter->path: */
goto out_no_locked;
goto out;
}
k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
if (unlikely(!k.k))
goto out_no_locked;
goto out;
if (unlikely(k.k->type == KEY_TYPE_whiteout &&
(iter->flags & BTREE_ITER_filter_snapshots) &&
@@ -2833,7 +2841,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
}
if (unlikely(bkey_err(k)))
goto out_no_locked;
goto out;
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
@@ -2855,8 +2863,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
}
}
out:
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_iter_verify_ret(trans, iter, k);

View File

@@ -376,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
return ret;
}
/*
 * Insert accounting key @a into the in-memory accounting table, using the
 * _locked flavour of the replicas check.
 *
 * Unless @mode is BCH_ACCOUNTING_read, a key that maps to a replicas entry
 * (per accounting_to_replicas()) must already be marked according to
 * bch2_replicas_marked_locked(); if it is not,
 * -BCH_ERR_btree_insert_need_mark_replicas is returned and nothing is
 * inserted. Otherwise returns the result of __bch2_accounting_mem_insert().
 *
 * NOTE(review): presumably the caller already holds the lock that
 * bch2_replicas_marked_locked() expects - confirm against callers.
 */
int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
				      enum bch_accounting_mode mode)
{
	struct bch_replicas_padded r;

	/* Evaluation order matters: r.e is only valid after accounting_to_replicas() */
	bool need_mark_replicas =
		mode != BCH_ACCOUNTING_read &&
		accounting_to_replicas(&r.e, a.k->p) &&
		!bch2_replicas_marked_locked(c, &r.e);

	return need_mark_replicas
		? -BCH_ERR_btree_insert_need_mark_replicas
		: __bch2_accounting_mem_insert(c, a);
}
static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
{
for (unsigned i = 0; i < e->nr_counters; i++)
@@ -583,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
bch2_accounting_mem_mod_locked(trans,
bkey_i_to_s_c_accounting(&k_i.k),
BCH_ACCOUNTING_normal);
BCH_ACCOUNTING_normal, true);
preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -612,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
percpu_down_read(&c->mark_lock);
int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
BCH_ACCOUNTING_read);
BCH_ACCOUNTING_read, false);
percpu_up_read(&c->mark_lock);
return ret;
}

View File

@@ -136,6 +136,7 @@ enum bch_accounting_mode {
};
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
@@ -150,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
*/
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
struct bkey_s_c_accounting a,
enum bch_accounting_mode mode)
enum bch_accounting_mode mode,
bool write_locked)
{
struct bch_fs *c = trans->c;
struct bch_accounting_mem *acc = &c->accounting;
@@ -189,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
int ret = bch2_accounting_mem_insert(c, a, mode);
int ret = 0;
if (unlikely(write_locked))
ret = bch2_accounting_mem_insert_locked(c, a, mode);
else
ret = bch2_accounting_mem_insert(c, a, mode);
if (ret)
return ret;
}
@@ -206,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
/*
 * Apply accounting key @a to the in-memory accounting, taking c->mark_lock
 * for read around the update.
 *
 * @gc selects BCH_ACCOUNTING_gc mode; otherwise BCH_ACCOUNTING_normal is used.
 * Returns whatever bch2_accounting_mem_mod_locked() returns.
 */
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
percpu_down_read(&trans->c->mark_lock);
int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
percpu_up_read(&trans->c->mark_lock);
return ret;
}
@@ -259,7 +265,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
EBUG_ON(bversion_zero(a->k.bversion));
return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
: 0;
}
@@ -271,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans
struct bkey_s_accounting a = accounting_i_to_s(a_i);
bch2_accounting_neg(a);
bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
bch2_accounting_neg(a);
}
}

View File

@@ -1429,7 +1429,9 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans,
if (ret)
goto err;
ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
if (ret)
goto err;

View File

@@ -2446,7 +2446,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
u32 parent = le32_to_cpu(s.v->fs_path_parent);
if (darray_u32_has(&subvol_path, parent)) {
if (fsck_err(c, subvol_loop, "subvolume loop"))
if (fsck_err(trans, subvol_loop, "subvolume loop"))
ret = reattach_subvol(trans, s);
break;
}

View File

@@ -17,6 +17,8 @@
#include <linux/kthread.h>
#include <linux/sched/mm.h>
static bool __should_discard_bucket(struct journal *, struct journal_device *);
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
@@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j)
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
if (ja->discard_idx != ja->dirty_idx_ondisk)
can_discard = true;
can_discard |= __should_discard_bucket(j, ja);
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
@@ -264,13 +265,19 @@ out:
/* Discards - last part of journal reclaim: */
/*
 * Decide whether journal device @ja has a bucket that should be discarded.
 *
 * True only when the number of buckets available in the
 * journal_space_discarded state is below max(4, ja->nr / 8) AND discard_idx
 * has not yet caught up with dirty_idx_ondisk.
 *
 * NOTE(review): does no locking itself; should_discard_bucket() calls it
 * with j->lock held - confirm other callers do likewise.
 */
static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	unsigned low_watermark = max(4, ja->nr / 8);

	/* Enough discarded buckets already available: nothing to do */
	if (bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) >=
	    low_watermark)
		return false;

	return ja->discard_idx != ja->dirty_idx_ondisk;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
spin_lock(&j->lock);
unsigned min_free = max(4, ja->nr / 8);
bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free &&
ja->discard_idx != ja->dirty_idx_ondisk;
bool ret = __should_discard_bucket(j, ja);
spin_unlock(&j->lock);
return ret;

View File

@@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
if (!bch2_bkey_rebalance_opts(k))
if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
return 0;
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);