mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 2e60a51e62
			
		
	
	
		2e60a51e62
		
	
	
	
	
		
			
			Currently, we can do unlocked dio reads, but the following race
is possible:
dio_read_task			truncate_task
				->btrfs_setattr()
->btrfs_direct_IO
    ->__blockdev_direct_IO
      ->btrfs_get_block
				  ->btrfs_truncate()
				 #alloc truncated blocks
				 #to other inode
      ->submit_io()
     #INFORMATION LEAK
In order to avoid this problem, we must serialize unlocked dio reads with
truncate. There are two approaches:
- use extent lock to protect the extent that we truncate
- use inode_dio_wait() to make sure the truncating task will wait for
  the read DIO.
If we use the 1st one, we will hit the endless truncation problem caused by
nonlocked read DIO once we implement nonlocked write DIO. This is because
we would still need to invoke inode_dio_wait() to avoid the race between
write DIO and truncation. At that point, we would have to introduce
  btrfs_inode_{block, resume}_nolock_dio()
again. That is, we would have to implement this patch again, so I chose the
2nd way to fix the problem.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
		
	
			
		
			
				
	
	
		
			240 lines
		
	
	
		
			6.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			240 lines
		
	
	
		
			6.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2007 Oracle.  All rights reserved.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public
 | |
|  * License v2 as published by the Free Software Foundation.
 | |
|  *
 | |
|  * This program is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU General Public
 | |
|  * License along with this program; if not, write to the
 | |
|  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 | |
|  * Boston, MA 02111-1307, USA.
 | |
|  */
 | |
| 
 | |
| #ifndef __BTRFS_I__
 | |
| #define __BTRFS_I__
 | |
| 
 | |
| #include "extent_map.h"
 | |
| #include "extent_io.h"
 | |
| #include "ordered-data.h"
 | |
| #include "delayed-inode.h"
 | |
| 
 | |
/*
 * Flag bit numbers.  BTRFS_INODE_READDIO_NEED_LOCK is applied to
 * btrfs_inode::runtime_flags with set_bit()/clear_bit() further down in
 * this header; the remaining flags are presumably used on the same word by
 * code outside this header (confirm against callers).
 *
 * ordered_data_close is set by truncate when a file that used
 * to have good data has been truncated to zero.  When it is set
 * the btrfs file release call will add this inode to the
 * ordered operations list so that we make sure to flush out any
 * new data the application may have written before commit.
 */
#define BTRFS_INODE_ORDERED_DATA_CLOSE		0
#define BTRFS_INODE_ORPHAN_META_RESERVED	1
#define BTRFS_INODE_DUMMY			2
#define BTRFS_INODE_IN_DEFRAG			3
#define BTRFS_INODE_DELALLOC_META_RESERVED	4
#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
#define BTRFS_INODE_HAS_ASYNC_EXTENT		6
#define BTRFS_INODE_NEEDS_FULL_SYNC		7
#define BTRFS_INODE_COPY_EVERYTHING		8
#define BTRFS_INODE_IN_DELALLOC_LIST		9
/*
 * Set by btrfs_inode_block_unlocked_dio() and cleared by
 * btrfs_inode_resume_unlocked_dio(); while set, new DIO readers are
 * forced to grab i_mutex.
 */
#define BTRFS_INODE_READDIO_NEED_LOCK		10
 | |
| 
 | |
| /* in memory btrfs inode */
 | |
| struct btrfs_inode {
 | |
| 	/* which subvolume this inode belongs to */
 | |
| 	struct btrfs_root *root;
 | |
| 
 | |
| 	/* key used to find this inode on disk.  This is used by the code
 | |
| 	 * to read in roots of subvolumes
 | |
| 	 */
 | |
| 	struct btrfs_key location;
 | |
| 
 | |
| 	/* Lock for counters */
 | |
| 	spinlock_t lock;
 | |
| 
 | |
| 	/* the extent_tree has caches of all the extent mappings to disk */
 | |
| 	struct extent_map_tree extent_tree;
 | |
| 
 | |
| 	/* the io_tree does range state (DIRTY, LOCKED etc) */
 | |
| 	struct extent_io_tree io_tree;
 | |
| 
 | |
| 	/* special utility tree used to record which mirrors have already been
 | |
| 	 * tried when checksums fail for a given block
 | |
| 	 */
 | |
| 	struct extent_io_tree io_failure_tree;
 | |
| 
 | |
| 	/* held while logging the inode in tree-log.c */
 | |
| 	struct mutex log_mutex;
 | |
| 
 | |
| 	/* held while doing delalloc reservations */
 | |
| 	struct mutex delalloc_mutex;
 | |
| 
 | |
| 	/* used to order data wrt metadata */
 | |
| 	struct btrfs_ordered_inode_tree ordered_tree;
 | |
| 
 | |
| 	/* list of all the delalloc inodes in the FS.  There are times we need
 | |
| 	 * to write all the delalloc pages to disk, and this list is used
 | |
| 	 * to walk them all.
 | |
| 	 */
 | |
| 	struct list_head delalloc_inodes;
 | |
| 
 | |
| 	/*
 | |
| 	 * list for tracking inodes that must be sent to disk before a
 | |
| 	 * rename or truncate commit
 | |
| 	 */
 | |
| 	struct list_head ordered_operations;
 | |
| 
 | |
| 	/* node for the red-black tree that links inodes in subvolume root */
 | |
| 	struct rb_node rb_node;
 | |
| 
 | |
| 	unsigned long runtime_flags;
 | |
| 
 | |
| 	/* Keep track of who's O_SYNC/fsycing currently */
 | |
| 	atomic_t sync_writers;
 | |
| 
 | |
| 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 | |
| 	 * enough field for this.
 | |
| 	 */
 | |
| 	u64 generation;
 | |
| 
 | |
| 	/*
 | |
| 	 * transid of the trans_handle that last modified this inode
 | |
| 	 */
 | |
| 	u64 last_trans;
 | |
| 
 | |
| 	/*
 | |
| 	 * log transid when this inode was last modified
 | |
| 	 */
 | |
| 	u64 last_sub_trans;
 | |
| 
 | |
| 	/*
 | |
| 	 * transid that last logged this inode
 | |
| 	 */
 | |
| 	u64 logged_trans;
 | |
| 
 | |
| 	/* total number of bytes pending delalloc, used by stat to calc the
 | |
| 	 * real block usage of the file
 | |
| 	 */
 | |
| 	u64 delalloc_bytes;
 | |
| 
 | |
| 	/*
 | |
| 	 * the size of the file stored in the metadata on disk.  data=ordered
 | |
| 	 * means the in-memory i_size might be larger than the size on disk
 | |
| 	 * because not all the blocks are written yet.
 | |
| 	 */
 | |
| 	u64 disk_i_size;
 | |
| 
 | |
| 	/*
 | |
| 	 * if this is a directory then index_cnt is the counter for the index
 | |
| 	 * number for new files that are created
 | |
| 	 */
 | |
| 	u64 index_cnt;
 | |
| 
 | |
| 	/* the fsync log has some corner cases that mean we have to check
 | |
| 	 * directories to see if any unlinks have been done before
 | |
| 	 * the directory was logged.  See tree-log.c for all the
 | |
| 	 * details
 | |
| 	 */
 | |
| 	u64 last_unlink_trans;
 | |
| 
 | |
| 	/*
 | |
| 	 * Number of bytes outstanding that are going to need csums.  This is
 | |
| 	 * used in ENOSPC accounting.
 | |
| 	 */
 | |
| 	u64 csum_bytes;
 | |
| 
 | |
| 	/* flags field from the on disk inode */
 | |
| 	u32 flags;
 | |
| 
 | |
| 	/* a local copy of root's last_log_commit */
 | |
| 	unsigned long last_log_commit;
 | |
| 
 | |
| 	/*
 | |
| 	 * Counters to keep track of the number of extent item's we may use due
 | |
| 	 * to delalloc and such.  outstanding_extents is the number of extent
 | |
| 	 * items we think we'll end up using, and reserved_extents is the number
 | |
| 	 * of extent items we've reserved metadata for.
 | |
| 	 */
 | |
| 	unsigned outstanding_extents;
 | |
| 	unsigned reserved_extents;
 | |
| 
 | |
| 	/*
 | |
| 	 * always compress this one file
 | |
| 	 */
 | |
| 	unsigned force_compress;
 | |
| 
 | |
| 	struct btrfs_delayed_node *delayed_node;
 | |
| 
 | |
| 	struct inode vfs_inode;
 | |
| };
 | |
| 
 | |
/*
 * Table of unsigned char values defined outside this header.
 * NOTE(review): presumably indexed by btrfs file type -- confirm at the
 * definition.
 */
extern unsigned char btrfs_filetype_table[];
 | |
| 
 | |
| static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 | |
| {
 | |
| 	return container_of(inode, struct btrfs_inode, vfs_inode);
 | |
| }
 | |
| 
 | |
| static inline u64 btrfs_ino(struct inode *inode)
 | |
| {
 | |
| 	u64 ino = BTRFS_I(inode)->location.objectid;
 | |
| 
 | |
| 	/*
 | |
| 	 * !ino: btree_inode
 | |
| 	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
 | |
| 	 */
 | |
| 	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
 | |
| 		ino = inode->i_ino;
 | |
| 	return ino;
 | |
| }
 | |
| 
 | |
| static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 | |
| {
 | |
| 	i_size_write(inode, size);
 | |
| 	BTRFS_I(inode)->disk_i_size = size;
 | |
| }
 | |
| 
 | |
| static inline bool btrfs_is_free_space_inode(struct inode *inode)
 | |
| {
 | |
| 	struct btrfs_root *root = BTRFS_I(inode)->root;
 | |
| 
 | |
| 	if (root == root->fs_info->tree_root &&
 | |
| 	    btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
 | |
| 		return true;
 | |
| 	if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
 | |
| 		return true;
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 | |
| {
 | |
| 	if (BTRFS_I(inode)->logged_trans == generation &&
 | |
| 	    BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
 | |
| 		return 1;
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Disable DIO read nolock optimization, so new dio readers will be forced
 | |
|  * to grab i_mutex. It is used to avoid the endless truncate due to
 | |
|  * nonlocked dio read.
 | |
|  */
 | |
| static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
 | |
| {
 | |
| 	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
 | |
| 	smp_mb();
 | |
| }
 | |
| 
 | |
| static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
 | |
| {
 | |
| 	smp_mb__before_clear_bit();
 | |
| 	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 | |
| 		  &BTRFS_I(inode)->runtime_flags);
 | |
| }
 | |
| 
 | |
| #endif
 |