mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 4dd72b4a47
			
		
	
	
		4dd72b4a47
		
	
	
	
	
		
			
			When FADV_DONTNEED cannot drop all pages in the range, it observes that some pages might still be on per-cpu LRU caches after recent instantiation and so initiates remote calls to all CPUs to flush their local caches. However, in most cases, the fadvise happens from the same context that instantiated the pages, and any pre-LRU pages in the specified range are most likely sitting on the local CPU's LRU cache, and so in many cases this results in unnecessary remote calls, which, in a loaded system, can hold up the fadvise() call significantly. [ I didn't record it in the extreme case we observed at Facebook, unfortunately. We had a slow-to-respond system and noticed lru_add_drain_all() leading the profile during fadvise calls. This patch came out of thinking about the code and how we commonly call FADV_DONTNEED. FWIW, I wrote a silly directory tree walker/searcher that recurses through /usr to read and FADV_DONTNEED each file it finds. On a 2 socket 40 ht machine, over 1% is spent in lru_add_drain_all(). With the patch, that cost is gone; the local drain cost shows at 0.09%. ] Try to avoid the remote call by flushing the local LRU cache before even attempting to invalidate anything. It's a cheap operation, and the local LRU cache is the most likely to hold any pre-LRU pages in the specified fadvise range. Link: http://lkml.kernel.org/r/20161214210017.GA1465@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Mel Gorman <mgorman@suse.de> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
			
				
	
	
		
			187 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			187 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * mm/fadvise.c
 | |
|  *
 | |
|  * Copyright (C) 2002, Linus Torvalds
 | |
|  *
 | |
|  * 11Jan2003	Andrew Morton
 | |
|  *		Initial version.
 | |
|  */
 | |
| 
 | |
| #include <linux/kernel.h>
 | |
| #include <linux/file.h>
 | |
| #include <linux/fs.h>
 | |
| #include <linux/mm.h>
 | |
| #include <linux/pagemap.h>
 | |
| #include <linux/backing-dev.h>
 | |
| #include <linux/pagevec.h>
 | |
| #include <linux/fadvise.h>
 | |
| #include <linux/writeback.h>
 | |
| #include <linux/syscalls.h>
 | |
| #include <linux/swap.h>
 | |
| 
 | |
| #include <asm/unistd.h>
 | |
| 
 | |
| /*
 | |
|  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 | |
|  * deactivate the pages and clear PG_Referenced.
 | |
|  */
 | |
| SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 | |
| {
 | |
| 	struct fd f = fdget(fd);
 | |
| 	struct inode *inode;
 | |
| 	struct address_space *mapping;
 | |
| 	struct backing_dev_info *bdi;
 | |
| 	loff_t endbyte;			/* inclusive */
 | |
| 	pgoff_t start_index;
 | |
| 	pgoff_t end_index;
 | |
| 	unsigned long nrpages;
 | |
| 	int ret = 0;
 | |
| 
 | |
| 	if (!f.file)
 | |
| 		return -EBADF;
 | |
| 
 | |
| 	inode = file_inode(f.file);
 | |
| 	if (S_ISFIFO(inode->i_mode)) {
 | |
| 		ret = -ESPIPE;
 | |
| 		goto out;
 | |
| 	}
 | |
| 
 | |
| 	mapping = f.file->f_mapping;
 | |
| 	if (!mapping || len < 0) {
 | |
| 		ret = -EINVAL;
 | |
| 		goto out;
 | |
| 	}
 | |
| 
 | |
| 	if (IS_DAX(inode)) {
 | |
| 		switch (advice) {
 | |
| 		case POSIX_FADV_NORMAL:
 | |
| 		case POSIX_FADV_RANDOM:
 | |
| 		case POSIX_FADV_SEQUENTIAL:
 | |
| 		case POSIX_FADV_WILLNEED:
 | |
| 		case POSIX_FADV_NOREUSE:
 | |
| 		case POSIX_FADV_DONTNEED:
 | |
| 			/* no bad return value, but ignore advice */
 | |
| 			break;
 | |
| 		default:
 | |
| 			ret = -EINVAL;
 | |
| 		}
 | |
| 		goto out;
 | |
| 	}
 | |
| 
 | |
| 	/* Careful about overflows. Len == 0 means "as much as possible" */
 | |
| 	endbyte = offset + len;
 | |
| 	if (!len || endbyte < len)
 | |
| 		endbyte = -1;
 | |
| 	else
 | |
| 		endbyte--;		/* inclusive */
 | |
| 
 | |
| 	bdi = inode_to_bdi(mapping->host);
 | |
| 
 | |
| 	switch (advice) {
 | |
| 	case POSIX_FADV_NORMAL:
 | |
| 		f.file->f_ra.ra_pages = bdi->ra_pages;
 | |
| 		spin_lock(&f.file->f_lock);
 | |
| 		f.file->f_mode &= ~FMODE_RANDOM;
 | |
| 		spin_unlock(&f.file->f_lock);
 | |
| 		break;
 | |
| 	case POSIX_FADV_RANDOM:
 | |
| 		spin_lock(&f.file->f_lock);
 | |
| 		f.file->f_mode |= FMODE_RANDOM;
 | |
| 		spin_unlock(&f.file->f_lock);
 | |
| 		break;
 | |
| 	case POSIX_FADV_SEQUENTIAL:
 | |
| 		f.file->f_ra.ra_pages = bdi->ra_pages * 2;
 | |
| 		spin_lock(&f.file->f_lock);
 | |
| 		f.file->f_mode &= ~FMODE_RANDOM;
 | |
| 		spin_unlock(&f.file->f_lock);
 | |
| 		break;
 | |
| 	case POSIX_FADV_WILLNEED:
 | |
| 		/* First and last PARTIAL page! */
 | |
| 		start_index = offset >> PAGE_SHIFT;
 | |
| 		end_index = endbyte >> PAGE_SHIFT;
 | |
| 
 | |
| 		/* Careful about overflow on the "+1" */
 | |
| 		nrpages = end_index - start_index + 1;
 | |
| 		if (!nrpages)
 | |
| 			nrpages = ~0UL;
 | |
| 
 | |
| 		/*
 | |
| 		 * Ignore return value because fadvise() shall return
 | |
| 		 * success even if filesystem can't retrieve a hint,
 | |
| 		 */
 | |
| 		force_page_cache_readahead(mapping, f.file, start_index,
 | |
| 					   nrpages);
 | |
| 		break;
 | |
| 	case POSIX_FADV_NOREUSE:
 | |
| 		break;
 | |
| 	case POSIX_FADV_DONTNEED:
 | |
| 		if (!inode_write_congested(mapping->host))
 | |
| 			__filemap_fdatawrite_range(mapping, offset, endbyte,
 | |
| 						   WB_SYNC_NONE);
 | |
| 
 | |
| 		/*
 | |
| 		 * First and last FULL page! Partial pages are deliberately
 | |
| 		 * preserved on the expectation that it is better to preserve
 | |
| 		 * needed memory than to discard unneeded memory.
 | |
| 		 */
 | |
| 		start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
 | |
| 		end_index = (endbyte >> PAGE_SHIFT);
 | |
| 		if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) {
 | |
| 			/* First page is tricky as 0 - 1 = -1, but pgoff_t
 | |
| 			 * is unsigned, so the end_index >= start_index
 | |
| 			 * check below would be true and we'll discard the whole
 | |
| 			 * file cache which is not what was asked.
 | |
| 			 */
 | |
| 			if (end_index == 0)
 | |
| 				break;
 | |
| 
 | |
| 			end_index--;
 | |
| 		}
 | |
| 
 | |
| 		if (end_index >= start_index) {
 | |
| 			unsigned long count;
 | |
| 
 | |
| 			/*
 | |
| 			 * It's common to FADV_DONTNEED right after
 | |
| 			 * the read or write that instantiates the
 | |
| 			 * pages, in which case there will be some
 | |
| 			 * sitting on the local LRU cache. Try to
 | |
| 			 * avoid the expensive remote drain and the
 | |
| 			 * second cache tree walk below by flushing
 | |
| 			 * them out right away.
 | |
| 			 */
 | |
| 			lru_add_drain();
 | |
| 
 | |
| 			count = invalidate_mapping_pages(mapping,
 | |
| 						start_index, end_index);
 | |
| 
 | |
| 			/*
 | |
| 			 * If fewer pages were invalidated than expected then
 | |
| 			 * it is possible that some of the pages were on
 | |
| 			 * a per-cpu pagevec for a remote CPU. Drain all
 | |
| 			 * pagevecs and try again.
 | |
| 			 */
 | |
| 			if (count < (end_index - start_index + 1)) {
 | |
| 				lru_add_drain_all();
 | |
| 				invalidate_mapping_pages(mapping, start_index,
 | |
| 						end_index);
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	default:
 | |
| 		ret = -EINVAL;
 | |
| 	}
 | |
| out:
 | |
| 	fdput(f);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| #ifdef __ARCH_WANT_SYS_FADVISE64
 | |
| 
 | |
| SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
 | |
| {
 | |
| 	return sys_fadvise64_64(fd, offset, len, advice);
 | |
| }
 | |
| 
 | |
| #endif
 |