mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-04 20:19:47 +08:00

At btrfs_finish_extent_commit() we have this loop that keeps finding an extent range to unpin in the transaction's pinned_extents io tree, caches the extent state and then passes that cached extent state to btrfs_clear_extent_dirty(), which will free that extent state since we clear the only bit it can have set. So on each loop iteration we do a full io tree search and the cached state is used only to avoid having a tree search done by btrfs_clear_extent_dirty(). During the lifetime of a transaction we can pin many thousands of extents, resulting in a large and deep rb tree that backs the io tree. For example, for the following fs_mark run on a 12 cores boxes: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 FILES=100000 THREADS=$(nproc --all) echo "performance" | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $DEV mount $DEV $MNT OPTS="-S 0 -L 8 -n $FILES -s 0 -t $THREADS -k" for ((i = 1; i <= $THREADS; i++)); do OPTS="$OPTS -d $MNT/d$i" done fs_mark $OPTS umount $MNT an histogram for the number of ranges (elements) in the pinned extents io tree of a transaction was the following: Count: 76 Range: 5440.000 - 51088.000; Mean: 27354.368; Median: 28312.000; Stddev: 9800.767 Percentiles: 90th: 40486.000; 95th: 43322.000; 99th: 51088.000 5440.000 - 6805.809: 1 ### 6805.809 - 10652.034: 1 ### 10652.034 - 13326.178: 3 ######## 13326.178 - 16671.590: 8 ###################### 16671.590 - 20856.773: 7 #################### 20856.773 - 26092.528: 13 #################################### 26092.528 - 32642.571: 19 ##################################################### 32642.571 - 40836.818: 17 ############################################### 40836.818 - 51088.000: 7 #################### We can improve on this by grabbing the next state before calling btrfs_clear_extent_dirty(), avoiding a full tree search on the next iteration which always has an O(log n) complexity while grabbing the next element (rb_next() rbtree operation) is in the worst case O(log n) too, but very often much less than that, making it more efficient. Here follow histograms for the execution times, in nanoseconds, of btrfs_finish_extent_commit() before and after applying this patch and all the other patches in the same patchset. Before patchset: Count: 32 Range: 3925691.000 - 269990635.000; Mean: 133814526.906; Median: 122758052.000; Stddev: 65776550.375 Percentiles: 90th: 228672087.000; 95th: 265187000.000; 99th: 269990635.000 3925691.000 - 5993208.660: 1 #### 5993208.660 - 75878537.656: 4 ################## 75878537.656 - 115840974.514: 12 ##################################################### 115840974.514 - 176850157.761: 6 ########################### 176850157.761 - 269990635.000: 9 ######################################## After patchset: Count: 32 Range: 1849393.000 - 231491064.000; Mean: 126978584.625; Median: 123732897.000; Stddev: 58007821.806 Percentiles: 90th: 203055491.000; 95th: 219952699.000; 99th: 231491064.000 1849393.000 - 2997642.092: 1 #### 2997642.092 - 88111637.071: 9 ##################################### 88111637.071 - 142818264.414: 9 ##################################### 142818264.414 - 231491064.000: 13 ##################################################### Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
250 lines
8.2 KiB
C
250 lines
8.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
#ifndef BTRFS_EXTENT_IO_TREE_H
|
|
#define BTRFS_EXTENT_IO_TREE_H
|
|
|
|
#include <linux/rbtree.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/refcount.h>
|
|
#include <linux/list.h>
|
|
#include <linux/wait.h>
|
|
#include "misc.h"
|
|
|
|
struct extent_changeset;
|
|
struct btrfs_fs_info;
|
|
struct btrfs_inode;
|
|
|
|
/* Bits for the extent state */
|
|
enum {
|
|
ENUM_BIT(EXTENT_DIRTY),
|
|
ENUM_BIT(EXTENT_LOCKED),
|
|
ENUM_BIT(EXTENT_DIO_LOCKED),
|
|
ENUM_BIT(EXTENT_NEW),
|
|
ENUM_BIT(EXTENT_DELALLOC),
|
|
ENUM_BIT(EXTENT_DEFRAG),
|
|
ENUM_BIT(EXTENT_BOUNDARY),
|
|
ENUM_BIT(EXTENT_NODATASUM),
|
|
ENUM_BIT(EXTENT_CLEAR_META_RESV),
|
|
ENUM_BIT(EXTENT_NEED_WAIT),
|
|
ENUM_BIT(EXTENT_NORESERVE),
|
|
ENUM_BIT(EXTENT_QGROUP_RESERVED),
|
|
ENUM_BIT(EXTENT_CLEAR_DATA_RESV),
|
|
/*
|
|
* Must be cleared only during ordered extent completion or on error
|
|
* paths if we did not manage to submit bios and create the ordered
|
|
* extents for the range. Should not be cleared during page release
|
|
* and page invalidation (if there is an ordered extent in flight),
|
|
* that is left for the ordered extent completion.
|
|
*/
|
|
ENUM_BIT(EXTENT_DELALLOC_NEW),
|
|
/*
|
|
* Mark that a range is being locked for finishing an ordered extent.
|
|
* Used together with EXTENT_LOCKED.
|
|
*/
|
|
ENUM_BIT(EXTENT_FINISHING_ORDERED),
|
|
/*
|
|
* When an ordered extent successfully completes for a region marked as
|
|
* a new delalloc range, use this flag when clearing a new delalloc
|
|
* range to indicate that the VFS' inode number of bytes should be
|
|
* incremented and the inode's new delalloc bytes decremented, in an
|
|
* atomic way to prevent races with stat(2).
|
|
*/
|
|
ENUM_BIT(EXTENT_ADD_INODE_BYTES),
|
|
/*
|
|
* Set during truncate when we're clearing an entire range and we just
|
|
* want the extent states to go away.
|
|
*/
|
|
ENUM_BIT(EXTENT_CLEAR_ALL_BITS),
|
|
|
|
/*
|
|
* This must be last.
|
|
*
|
|
* Bit not representing a state but a request for NOWAIT semantics,
|
|
* e.g. when allocating memory, and must be masked out from the other
|
|
* bits.
|
|
*/
|
|
ENUM_BIT(EXTENT_NOWAIT)
|
|
};
|
|
|
|
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
|
|
EXTENT_CLEAR_DATA_RESV)
|
|
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
|
|
EXTENT_ADD_INODE_BYTES | \
|
|
EXTENT_CLEAR_ALL_BITS)
|
|
|
|
#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED)
|
|
|
|
/*
|
|
* Redefined bits above which are used only in the device allocation tree,
|
|
* shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
|
|
* / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
|
|
* manipulation functions
|
|
*/
|
|
#define CHUNK_ALLOCATED EXTENT_DIRTY
|
|
#define CHUNK_TRIMMED EXTENT_DEFRAG
|
|
#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
|
|
CHUNK_TRIMMED)
|
|
|
|
enum {
|
|
IO_TREE_FS_PINNED_EXTENTS,
|
|
IO_TREE_FS_EXCLUDED_EXTENTS,
|
|
IO_TREE_BTREE_INODE_IO,
|
|
IO_TREE_INODE_IO,
|
|
IO_TREE_RELOC_BLOCKS,
|
|
IO_TREE_TRANS_DIRTY_PAGES,
|
|
IO_TREE_ROOT_DIRTY_LOG_PAGES,
|
|
IO_TREE_INODE_FILE_EXTENT,
|
|
IO_TREE_LOG_CSUM_RANGE,
|
|
IO_TREE_SELFTEST,
|
|
IO_TREE_DEVICE_ALLOC_STATE,
|
|
};
|
|
|
|
struct extent_io_tree {
|
|
struct rb_root state;
|
|
/*
|
|
* The fs_info is needed for trace points, a tree attached to an inode
|
|
* needs the inode.
|
|
*
|
|
* owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be
|
|
* accessed as inode->root->fs_info
|
|
*/
|
|
union {
|
|
struct btrfs_fs_info *fs_info;
|
|
struct btrfs_inode *inode;
|
|
};
|
|
|
|
/* Who owns this io tree, should be one of IO_TREE_* */
|
|
u8 owner;
|
|
|
|
spinlock_t lock;
|
|
};
|
|
|
|
struct extent_state {
|
|
u64 start;
|
|
u64 end; /* inclusive */
|
|
struct rb_node rb_node;
|
|
|
|
/* ADD NEW ELEMENTS AFTER THIS */
|
|
wait_queue_head_t wq;
|
|
refcount_t refs;
|
|
u32 state;
|
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
struct list_head leak_list;
|
|
#endif
|
|
};
|
|
|
|
const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree);
|
|
const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
|
|
|
|
void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
|
|
struct extent_io_tree *tree, unsigned int owner);
|
|
void btrfs_extent_io_tree_release(struct extent_io_tree *tree);
|
|
int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
|
|
struct extent_state **cached);
|
|
bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
|
|
u32 bits, struct extent_state **cached);
|
|
|
|
static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
|
|
struct extent_state **cached)
|
|
{
|
|
return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
|
|
}
|
|
|
|
static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start,
|
|
u64 end, struct extent_state **cached)
|
|
{
|
|
return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
|
|
}
|
|
|
|
int __init btrfs_extent_state_init_cachep(void);
|
|
void __cold btrfs_extent_state_free_cachep(void);
|
|
|
|
u64 btrfs_count_range_bits(struct extent_io_tree *tree,
|
|
u64 *start, u64 search_end,
|
|
u64 max_bytes, u32 bits, int contig,
|
|
struct extent_state **cached_state);
|
|
|
|
void btrfs_free_extent_state(struct extent_state *state);
|
|
bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
|
|
struct extent_state *cached_state);
|
|
bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
|
|
void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
|
|
struct extent_state **cached_state);
|
|
int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
|
|
u32 bits, struct extent_changeset *changeset);
|
|
int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
|
|
u32 bits, struct extent_state **cached,
|
|
struct extent_changeset *changeset);
|
|
|
|
static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start,
|
|
u64 end, u32 bits,
|
|
struct extent_state **cached)
|
|
{
|
|
return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL);
|
|
}
|
|
|
|
static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
|
|
struct extent_state **cached)
|
|
{
|
|
return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED,
|
|
cached, NULL);
|
|
}
|
|
|
|
static inline int btrfs_clear_extent_bits(struct extent_io_tree *tree, u64 start,
|
|
u64 end, u32 bits)
|
|
{
|
|
return btrfs_clear_extent_bit(tree, start, end, bits, NULL);
|
|
}
|
|
|
|
int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
|
|
u32 bits, struct extent_changeset *changeset);
|
|
int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
|
u32 bits, struct extent_state **cached_state);
|
|
|
|
static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start,
|
|
u64 end, struct extent_state **cached)
|
|
{
|
|
return btrfs_clear_extent_bit(tree, start, end,
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
|
EXTENT_DO_ACCOUNTING, cached);
|
|
}
|
|
|
|
int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
|
u32 bits, u32 clear_bits,
|
|
struct extent_state **cached_state);
|
|
|
|
bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
|
|
u64 *start_ret, u64 *end_ret, u32 bits,
|
|
struct extent_state **cached_state);
|
|
void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
|
|
u64 *start_ret, u64 *end_ret, u32 bits);
|
|
bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
|
|
u64 *start_ret, u64 *end_ret, u32 bits);
|
|
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
|
|
u64 *end, u64 max_bytes,
|
|
struct extent_state **cached_state);
|
|
static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start,
|
|
u64 end, struct extent_state **cached)
|
|
{
|
|
return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
|
|
}
|
|
|
|
static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
|
|
u64 end, struct extent_state **cached)
|
|
{
|
|
return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
|
|
}
|
|
|
|
static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start,
|
|
u64 end, struct extent_state **cached)
|
|
{
|
|
return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED,
|
|
cached, NULL);
|
|
}
|
|
|
|
struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
|
|
struct extent_state *state);
|
|
|
|
#endif /* BTRFS_EXTENT_IO_TREE_H */
|