mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-04 20:19:47 +08:00
When doing a send we don't expect the task to ever start a transaction after the initial check that verifies if commit roots match the regular roots. This is because after that we set current->journal_info with a stub (special value) that signals we are in send context, so that we take a read lock on an extent buffer when reading it from disk and verifying it is valid (its generation matches the generation stored in the parent). This stub was introduced in 2014 by commita26e8c9f75("Btrfs: don't clear uptodate if the eb is under IO") in order to fix a concurrency issue between send and balance. However there is one particular exception where we end up needing to start a transaction and when this happens it results in a crash with a stack trace like the following: [60015.902283] kernel: WARNING: CPU: 3 PID: 58159 at arch/x86/include/asm/kfence.h:44 kfence_protect_page+0x21/0x80 [60015.902292] kernel: Modules linked in: uinput rfcomm snd_seq_dummy (...) [60015.902384] kernel: CPU: 3 PID: 58159 Comm: btrfs Not tainted 5.12.9-300.fc34.x86_64 #1 [60015.902387] kernel: Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./F2A88XN-WIFI, BIOS F6 12/24/2015 [60015.902389] kernel: RIP: 0010:kfence_protect_page+0x21/0x80 [60015.902393] kernel: Code: ff 0f 1f 84 00 00 00 00 00 55 48 89 fd (...) [60015.902396] kernel: RSP: 0018:ffff9fb583453220 EFLAGS: 00010246 [60015.902399] kernel: RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9fb583453224 [60015.902401] kernel: RDX: ffff9fb583453224 RSI: 0000000000000000 RDI: 0000000000000000 [60015.902402] kernel: RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [60015.902404] kernel: R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002 [60015.902406] kernel: R13: ffff9fb583453348 R14: 0000000000000000 R15: 0000000000000001 [60015.902408] kernel: FS: 00007f158e62d8c0(0000) GS:ffff93bd37580000(0000) knlGS:0000000000000000 [60015.902410] kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [60015.902412] kernel: CR2: 0000000000000039 CR3: 00000001256d2000 CR4: 00000000000506e0 [60015.902414] kernel: Call Trace: [60015.902419] kernel: kfence_unprotect+0x13/0x30 [60015.902423] kernel: page_fault_oops+0x89/0x270 [60015.902427] kernel: ? search_module_extables+0xf/0x40 [60015.902431] kernel: ? search_bpf_extables+0x57/0x70 [60015.902435] kernel: kernelmode_fixup_or_oops+0xd6/0xf0 [60015.902437] kernel: __bad_area_nosemaphore+0x142/0x180 [60015.902440] kernel: exc_page_fault+0x67/0x150 [60015.902445] kernel: asm_exc_page_fault+0x1e/0x30 [60015.902450] kernel: RIP: 0010:start_transaction+0x71/0x580 [60015.902454] kernel: Code: d3 0f 84 92 00 00 00 80 e7 06 0f 85 63 (...) [60015.902456] kernel: RSP: 0018:ffff9fb5834533f8 EFLAGS: 00010246 [60015.902458] kernel: RAX: 0000000000000001 RBX: 0000000000000001 RCX: 0000000000000000 [60015.902460] kernel: RDX: 0000000000000801 RSI: 0000000000000000 RDI: 0000000000000039 [60015.902462] kernel: RBP: ffff93bc0a7eb800 R08: 0000000000000001 R09: 0000000000000000 [60015.902463] kernel: R10: 0000000000098a00 R11: 0000000000000001 R12: 0000000000000001 [60015.902464] kernel: R13: 0000000000000000 R14: ffff93bc0c92b000 R15: ffff93bc0c92b000 [60015.902468] kernel: btrfs_commit_inode_delayed_inode+0x5d/0x120 [60015.902473] kernel: btrfs_evict_inode+0x2c5/0x3f0 [60015.902476] kernel: evict+0xd1/0x180 [60015.902480] kernel: inode_lru_isolate+0xe7/0x180 [60015.902483] kernel: __list_lru_walk_one+0x77/0x150 [60015.902487] kernel: ? iput+0x1a0/0x1a0 [60015.902489] kernel: ? iput+0x1a0/0x1a0 [60015.902491] kernel: list_lru_walk_one+0x47/0x70 [60015.902495] kernel: prune_icache_sb+0x39/0x50 [60015.902497] kernel: super_cache_scan+0x161/0x1f0 [60015.902501] kernel: do_shrink_slab+0x142/0x240 [60015.902505] kernel: shrink_slab+0x164/0x280 [60015.902509] kernel: shrink_node+0x2c8/0x6e0 [60015.902512] kernel: do_try_to_free_pages+0xcb/0x4b0 [60015.902514] kernel: try_to_free_pages+0xda/0x190 [60015.902516] kernel: __alloc_pages_slowpath.constprop.0+0x373/0xcc0 [60015.902521] kernel: ? __memcg_kmem_charge_page+0xc2/0x1e0 [60015.902525] kernel: __alloc_pages_nodemask+0x30a/0x340 [60015.902528] kernel: pipe_write+0x30b/0x5c0 [60015.902531] kernel: ? set_next_entity+0xad/0x1e0 [60015.902534] kernel: ? switch_mm_irqs_off+0x58/0x440 [60015.902538] kernel: __kernel_write+0x13a/0x2b0 [60015.902541] kernel: kernel_write+0x73/0x150 [60015.902543] kernel: send_cmd+0x7b/0xd0 [60015.902545] kernel: send_extent_data+0x5a3/0x6b0 [60015.902549] kernel: process_extent+0x19b/0xed0 [60015.902551] kernel: btrfs_ioctl_send+0x1434/0x17e0 [60015.902554] kernel: ? _btrfs_ioctl_send+0xe1/0x100 [60015.902557] kernel: _btrfs_ioctl_send+0xbf/0x100 [60015.902559] kernel: ? enqueue_entity+0x18c/0x7b0 [60015.902562] kernel: btrfs_ioctl+0x185f/0x2f80 [60015.902564] kernel: ? psi_task_change+0x84/0xc0 [60015.902569] kernel: ? _flat_send_IPI_mask+0x21/0x40 [60015.902572] kernel: ? check_preempt_curr+0x2f/0x70 [60015.902576] kernel: ? selinux_file_ioctl+0x137/0x1e0 [60015.902579] kernel: ? expand_files+0x1cb/0x1d0 [60015.902582] kernel: ? __x64_sys_ioctl+0x82/0xb0 [60015.902585] kernel: __x64_sys_ioctl+0x82/0xb0 [60015.902588] kernel: do_syscall_64+0x33/0x40 [60015.902591] kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae [60015.902595] kernel: RIP: 0033:0x7f158e38f0ab [60015.902599] kernel: Code: ff ff ff 85 c0 79 9b (...) [60015.902602] kernel: RSP: 002b:00007ffcb2519bf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [60015.902605] kernel: RAX: ffffffffffffffda RBX: 00007ffcb251ae00 RCX: 00007f158e38f0ab [60015.902607] kernel: RDX: 00007ffcb2519cf0 RSI: 0000000040489426 RDI: 0000000000000004 [60015.902608] kernel: RBP: 0000000000000004 R08: 00007f158e297640 R09: 00007f158e297640 [60015.902610] kernel: R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000000 [60015.902612] kernel: R13: 0000000000000002 R14: 00007ffcb251aee0 R15: 0000558c1a83e2a0 [60015.902615] kernel: ---[ end trace 7bbc33e23bb887ae ]--- This happens because when writing to the pipe, by calling kernel_write(), we end up doing page allocations using GFP_HIGHUSER | __GFP_ACCOUNT as the gfp flags, which allow reclaim to happen if there is memory pressure. This allocation happens at fs/pipe.c:pipe_write(). If the reclaim is triggered, inode eviction can be triggered and that in turn can result in starting a transaction if the inode has a link count of 0. The transaction start happens early on during eviction, when we call btrfs_commit_inode_delayed_inode() at btrfs_evict_inode(). This happens if there is currently an open file descriptor for an inode with a link count of 0 and the reclaim task gets a reference on the inode before that descriptor is closed, in which case the reclaim task ends up doing the final iput that triggers the inode eviction. When we have assertions enabled (CONFIG_BTRFS_ASSERT=y), this triggers the following assertion at transaction.c:start_transaction(): /* Send isn't supposed to start transactions. */ ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); And when assertions are not enabled, it triggers a crash since after that assertion we cast current->journal_info into a transaction handle pointer and then dereference it: if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; refcount_inc(&h->use_count); (...) Which obviously results in a crash due to an invalid memory access. The same type of issue can happen during other memory allocations we do directly in the send code with kmalloc (and friends) as they use GFP_KERNEL and therefore may trigger reclaim too, which started to happen since 2016 after commite780b0d1c1("btrfs: send: use GFP_KERNEL everywhere"). The issue could be solved by setting up a NOFS context for the entire send operation so that reclaim could not be triggered when allocating memory or pages through kernel_write(). However that is not very friendly and we can in fact get rid of the send stub because: 1) The stub was introduced way back in 2014 by commita26e8c9f75("Btrfs: don't clear uptodate if the eb is under IO") to solve an issue exclusive to when send and balance are running in parallel, however there were other problems between balance and send and we do not allow anymore to have balance and send run concurrently since commit9e967495e0("Btrfs: prevent send failures and crashes due to concurrent relocation"). More generically the issues are between send and relocation, and that last commit eliminated only the possibility of having send and balance run concurrently, but shrinking a device also can trigger relocation, and on zoned filesystems we have relocation of partially used block groups triggered automatically as well. The previous patch that has a subject of: "btrfs: ensure relocation never runs while we have send operations running" Addresses all the remaining cases that can trigger relocation. 2) We can actually allow starting and even committing transactions while in a send context if needed because send is not holding any locks that would block the start or the commit of a transaction. So get rid of all the logic added by commita26e8c9f75("Btrfs: don't clear uptodate if the eb is under IO"). We can now always call clear_extent_buffer_uptodate() at verify_parent_transid() since send is the only case that uses commit roots without having a transaction open or without holding the commit_root_sem. Reported-by: Chris Murphy <lists@colorremedies.com> Link: https://lore.kernel.org/linux-btrfs/CAJCQCtRQ57=qXo3kygwpwEBOU_CA_eKvdmjP52sU=eFvuVOEGw@mail.gmail.com/ Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
245 lines
7.8 KiB
C
245 lines
7.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
*/
|
|
|
|
#ifndef BTRFS_TRANSACTION_H
|
|
#define BTRFS_TRANSACTION_H
|
|
|
|
#include <linux/refcount.h>
|
|
#include "btrfs_inode.h"
|
|
#include "delayed-ref.h"
|
|
#include "ctree.h"
|
|
|
|
enum btrfs_trans_state {
|
|
TRANS_STATE_RUNNING,
|
|
TRANS_STATE_COMMIT_START,
|
|
TRANS_STATE_COMMIT_DOING,
|
|
TRANS_STATE_UNBLOCKED,
|
|
TRANS_STATE_SUPER_COMMITTED,
|
|
TRANS_STATE_COMPLETED,
|
|
TRANS_STATE_MAX,
|
|
};
|
|
|
|
#define BTRFS_TRANS_HAVE_FREE_BGS 0
|
|
#define BTRFS_TRANS_DIRTY_BG_RUN 1
|
|
#define BTRFS_TRANS_CACHE_ENOSPC 2
|
|
|
|
struct btrfs_transaction {
|
|
u64 transid;
|
|
/*
|
|
* total external writers(USERSPACE/START/ATTACH) in this
|
|
* transaction, it must be zero before the transaction is
|
|
* being committed
|
|
*/
|
|
atomic_t num_extwriters;
|
|
/*
|
|
* total writers in this transaction, it must be zero before the
|
|
* transaction can end
|
|
*/
|
|
atomic_t num_writers;
|
|
refcount_t use_count;
|
|
|
|
unsigned long flags;
|
|
|
|
/* Be protected by fs_info->trans_lock when we want to change it. */
|
|
enum btrfs_trans_state state;
|
|
int aborted;
|
|
struct list_head list;
|
|
struct extent_io_tree dirty_pages;
|
|
time64_t start_time;
|
|
wait_queue_head_t writer_wait;
|
|
wait_queue_head_t commit_wait;
|
|
struct list_head pending_snapshots;
|
|
struct list_head dev_update_list;
|
|
struct list_head switch_commits;
|
|
struct list_head dirty_bgs;
|
|
|
|
/*
|
|
* There is no explicit lock which protects io_bgs, rather its
|
|
* consistency is implied by the fact that all the sites which modify
|
|
* it do so under some form of transaction critical section, namely:
|
|
*
|
|
* - btrfs_start_dirty_block_groups - This function can only ever be
|
|
* run by one of the transaction committers. Refer to
|
|
* BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction
|
|
*
|
|
* - btrfs_write_dirty_blockgroups - this is called by
|
|
* commit_cowonly_roots from transaction critical section
|
|
* (TRANS_STATE_COMMIT_DOING)
|
|
*
|
|
* - btrfs_cleanup_dirty_bgs - called on transaction abort
|
|
*/
|
|
struct list_head io_bgs;
|
|
struct list_head dropped_roots;
|
|
struct extent_io_tree pinned_extents;
|
|
|
|
/*
|
|
* we need to make sure block group deletion doesn't race with
|
|
* free space cache writeout. This mutex keeps them from stomping
|
|
* on each other
|
|
*/
|
|
struct mutex cache_write_mutex;
|
|
spinlock_t dirty_bgs_lock;
|
|
/* Protected by spin lock fs_info->unused_bgs_lock. */
|
|
struct list_head deleted_bgs;
|
|
spinlock_t dropped_roots_lock;
|
|
struct btrfs_delayed_ref_root delayed_refs;
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
/*
|
|
* Number of ordered extents the transaction must wait for before
|
|
* committing. These are ordered extents started by a fast fsync.
|
|
*/
|
|
atomic_t pending_ordered;
|
|
wait_queue_head_t pending_wait;
|
|
|
|
spinlock_t releasing_ebs_lock;
|
|
struct list_head releasing_ebs;
|
|
|
|
/*
|
|
* The number of bytes currently reserved, by all transaction handles
|
|
* attached to this transaction, for metadata extents of the chunk tree.
|
|
*/
|
|
atomic64_t chunk_bytes_reserved;
|
|
wait_queue_head_t chunk_reserve_wait;
|
|
};
|
|
|
|
#define __TRANS_FREEZABLE (1U << 0)
|
|
|
|
#define __TRANS_START (1U << 9)
|
|
#define __TRANS_ATTACH (1U << 10)
|
|
#define __TRANS_JOIN (1U << 11)
|
|
#define __TRANS_JOIN_NOLOCK (1U << 12)
|
|
#define __TRANS_DUMMY (1U << 13)
|
|
#define __TRANS_JOIN_NOSTART (1U << 14)
|
|
|
|
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
|
|
#define TRANS_ATTACH (__TRANS_ATTACH)
|
|
#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
|
|
#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
|
|
#define TRANS_JOIN_NOSTART (__TRANS_JOIN_NOSTART)
|
|
|
|
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
|
|
|
|
struct btrfs_trans_handle {
|
|
u64 transid;
|
|
u64 bytes_reserved;
|
|
u64 chunk_bytes_reserved;
|
|
unsigned long delayed_ref_updates;
|
|
struct btrfs_transaction *transaction;
|
|
struct btrfs_block_rsv *block_rsv;
|
|
struct btrfs_block_rsv *orig_rsv;
|
|
refcount_t use_count;
|
|
unsigned int type;
|
|
/*
|
|
* Error code of transaction abort, set outside of locks and must use
|
|
* the READ_ONCE/WRITE_ONCE access
|
|
*/
|
|
short aborted;
|
|
bool adding_csums;
|
|
bool allocating_chunk;
|
|
bool can_flush_pending_bgs;
|
|
bool reloc_reserved;
|
|
bool in_fsync;
|
|
struct btrfs_root *root;
|
|
struct btrfs_fs_info *fs_info;
|
|
struct list_head new_bgs;
|
|
};
|
|
|
|
/*
|
|
* The abort status can be changed between calls and is not protected by locks.
|
|
* This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
|
|
* set to a non-zero value it does not change, so the macro should be in checks
|
|
* but is not necessary for further reads of the value.
|
|
*/
|
|
#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted)))
|
|
|
|
struct btrfs_pending_snapshot {
|
|
struct dentry *dentry;
|
|
struct inode *dir;
|
|
struct btrfs_root *root;
|
|
struct btrfs_root_item *root_item;
|
|
struct btrfs_root *snap;
|
|
struct btrfs_qgroup_inherit *inherit;
|
|
struct btrfs_path *path;
|
|
/* block reservation for the operation */
|
|
struct btrfs_block_rsv block_rsv;
|
|
/* extra metadata reservation for relocation */
|
|
int error;
|
|
/* Preallocated anonymous block device number */
|
|
dev_t anon_dev;
|
|
bool readonly;
|
|
struct list_head list;
|
|
};
|
|
|
|
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *inode)
|
|
{
|
|
spin_lock(&inode->lock);
|
|
inode->last_trans = trans->transaction->transid;
|
|
inode->last_sub_trans = inode->root->log_transid;
|
|
inode->last_log_commit = inode->last_sub_trans - 1;
|
|
spin_unlock(&inode->lock);
|
|
}
|
|
|
|
/*
|
|
* Make qgroup codes to skip given qgroupid, means the old/new_roots for
|
|
* qgroup won't contain the qgroupid in it.
|
|
*/
|
|
static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
|
|
u64 qgroupid)
|
|
{
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
WARN_ON(delayed_refs->qgroup_to_skip);
|
|
delayed_refs->qgroup_to_skip = qgroupid;
|
|
}
|
|
|
|
static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
|
|
{
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
WARN_ON(!delayed_refs->qgroup_to_skip);
|
|
delayed_refs->qgroup_to_skip = 0;
|
|
}
|
|
|
|
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
|
|
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
|
|
unsigned int num_items);
|
|
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
|
|
struct btrfs_root *root,
|
|
unsigned int num_items);
|
|
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
|
|
struct btrfs_root *root);
|
|
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
|
|
|
|
void btrfs_add_dead_root(struct btrfs_root *root);
|
|
int btrfs_defrag_root(struct btrfs_root *root);
|
|
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
|
|
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
|
|
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
|
|
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
|
|
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
|
|
void btrfs_throttle(struct btrfs_fs_info *fs_info);
|
|
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root);
|
|
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
|
|
struct extent_io_tree *dirty_pages, int mark);
|
|
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
|
|
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
|
|
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
|
|
void btrfs_put_transaction(struct btrfs_transaction *transaction);
|
|
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
|
|
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root);
|
|
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
|
|
|
|
#endif
|