mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-22 07:27:12 +08:00
Merge tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton: - "mm/vmscan: fix demotion targets checks in reclaim/demotion" fixes a couple of issues in the demotion code - pages were failing demotion and were finding themselves demoted into disallowed nodes (Bing Jiao) - "Remove XA_ZERO from error recovery of dup_mmap()" fixes a rare maple tree race and performs a number of cleanups (Liam Howlett) - "mm: add bitmap VMA flag helpers and convert all mmap_prepare to use them" implements a lot of cleanups following on from the conversion of the VMA flags into a bitmap (Lorenzo Stoakes) - "support batch checking of references and unmapping for large folios" implements batching to greatly improve the performance of reclaiming clean file-backed large folios (Baolin Wang) - "selftests/mm: add memory failure selftests" does as claimed (Miaohe Lin) * tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (36 commits) mm/page_alloc: clear page->private in free_pages_prepare() selftests/mm: add memory failure dirty pagecache test selftests/mm: add memory failure clean pagecache test selftests/mm: add memory failure anonymous page test mm: rmap: support batched unmapping for file large folios arm64: mm: implement the architecture-specific clear_flush_young_ptes() arm64: mm: support batch clearing of the young flag for large folios arm64: mm: factor out the address and ptep alignment into a new helper mm: rmap: support batched checks of the references for large folios tools/testing/vma: add VMA userland tests for VMA flag functions tools/testing/vma: separate out vma_internal.h into logical headers tools/testing/vma: separate VMA userland tests into separate files mm: make vm_area_desc utilise vma_flags_t only mm: update all remaining mmap_prepare users to use vma_flags_t mm: update shmem_[kernel]_file_*() functions to use vma_flags_t mm: update secretmem to use VMA flags on mmap_prepare mm: update hugetlbfs to use VMA flags on mmap_prepare mm: add basic VMA flag 
operation helper functions tools: bitmap: add missing bitmap_[subset(), andnot()] mm: add mk_vma_flags() bitmap flag macro helper ...
This commit is contained in:
@@ -4012,7 +4012,7 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
|
||||
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
|
||||
{
|
||||
if (is_shared_maywrite(desc->vm_flags))
|
||||
if (is_shared_maywrite(&desc->vma_flags))
|
||||
return -EINVAL;
|
||||
return generic_file_mmap_prepare(desc);
|
||||
}
|
||||
|
||||
22
mm/hugetlb.c
22
mm/hugetlb.c
@@ -1193,16 +1193,16 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
|
||||
|
||||
static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
|
||||
{
|
||||
VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
|
||||
VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
|
||||
VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
|
||||
VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT));
|
||||
|
||||
desc->private_data = map;
|
||||
}
|
||||
|
||||
static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
|
||||
{
|
||||
VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
|
||||
VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
|
||||
VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
|
||||
VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT));
|
||||
|
||||
desc->private_data = (void *)((unsigned long)desc->private_data | flags);
|
||||
}
|
||||
@@ -1216,7 +1216,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
|
||||
|
||||
static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
|
||||
{
|
||||
VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
|
||||
VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
|
||||
|
||||
return ((unsigned long)desc->private_data) & flag;
|
||||
}
|
||||
@@ -6571,7 +6571,7 @@ next:
|
||||
long hugetlb_reserve_pages(struct inode *inode,
|
||||
long from, long to,
|
||||
struct vm_area_desc *desc,
|
||||
vm_flags_t vm_flags)
|
||||
vma_flags_t vma_flags)
|
||||
{
|
||||
long chg = -1, add = -1, spool_resv, gbl_resv;
|
||||
struct hstate *h = hstate_inode(inode);
|
||||
@@ -6592,7 +6592,7 @@ long hugetlb_reserve_pages(struct inode *inode,
|
||||
* attempt will be made for VM_NORESERVE to allocate a page
|
||||
* without using reserves
|
||||
*/
|
||||
if (vm_flags & VM_NORESERVE)
|
||||
if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
@@ -6601,7 +6601,7 @@ long hugetlb_reserve_pages(struct inode *inode,
|
||||
* to reserve the full area even if read-only as mprotect() may be
|
||||
* called to make the mapping read-write. Assume !desc is a shm mapping
|
||||
*/
|
||||
if (!desc || desc->vm_flags & VM_MAYSHARE) {
|
||||
if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) {
|
||||
/*
|
||||
* resv_map can not be NULL as hugetlb_reserve_pages is only
|
||||
* called for inodes for which resv_maps were created (see
|
||||
@@ -6635,7 +6635,7 @@ long hugetlb_reserve_pages(struct inode *inode,
|
||||
if (err < 0)
|
||||
goto out_err;
|
||||
|
||||
if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
|
||||
if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) {
|
||||
/* For private mappings, the hugetlb_cgroup uncharge info hangs
|
||||
* off the resv_map.
|
||||
*/
|
||||
@@ -6672,7 +6672,7 @@ long hugetlb_reserve_pages(struct inode *inode,
|
||||
* consumed reservations are stored in the map. Hence, nothing
|
||||
* else has to be done for private mappings here
|
||||
*/
|
||||
if (!desc || desc->vm_flags & VM_MAYSHARE) {
|
||||
if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) {
|
||||
add = region_add(resv_map, from, to, regions_needed, h, h_cg);
|
||||
|
||||
if (unlikely(add < 0)) {
|
||||
@@ -6736,7 +6736,7 @@ out_uncharge_cgroup:
|
||||
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
|
||||
chg * pages_per_huge_page(h), h_cg);
|
||||
out_err:
|
||||
if (!desc || desc->vm_flags & VM_MAYSHARE)
|
||||
if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT))
|
||||
/* Only call region_abort if the region_chg succeeded but the
|
||||
* region_add failed or didn't run.
|
||||
*/
|
||||
|
||||
@@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma)
|
||||
}
|
||||
}
|
||||
|
||||
/* unmap_vmas is in mm/memory.c */
|
||||
void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
|
||||
static inline void get_anon_vma(struct anon_vma *anon_vma)
|
||||
@@ -509,9 +512,8 @@ bool __folio_end_writeback(struct folio *folio);
|
||||
void deactivate_file_folio(struct folio *folio);
|
||||
void folio_activate(struct folio *folio);
|
||||
|
||||
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
|
||||
struct vm_area_struct *start_vma, unsigned long floor,
|
||||
unsigned long ceiling, bool mm_wr_locked);
|
||||
void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
|
||||
|
||||
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
|
||||
|
||||
struct zap_details;
|
||||
@@ -1044,7 +1046,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end, int *locked);
|
||||
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, bool write, int *locked);
|
||||
bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
|
||||
bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
|
||||
unsigned long bytes);
|
||||
|
||||
/*
|
||||
|
||||
@@ -1732,7 +1732,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
|
||||
* obtained on guard region installation after the flag is set, so this
|
||||
* check being performed under this lock excludes races.
|
||||
*/
|
||||
if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
|
||||
if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
||||
@@ -1140,7 +1140,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
|
||||
* acquire an mmap/VMA write lock to read it. All remaining readers may
|
||||
* or may not see the flag set, but we don't care.
|
||||
*/
|
||||
vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);
|
||||
vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT);
|
||||
|
||||
/*
|
||||
* If anonymous and we are establishing page tables the VMA ought to
|
||||
|
||||
@@ -5649,9 +5649,21 @@ subsys_initcall(mem_cgroup_swap_init);
|
||||
|
||||
#endif /* CONFIG_SWAP */
|
||||
|
||||
bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
|
||||
void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
|
||||
{
|
||||
return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
|
||||
nodemask_t allowed;
|
||||
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Since this interface is intended for use by migration paths, and
|
||||
* reclaim and migration are subject to race conditions such as changes
|
||||
* in effective_mems and hot-unplugging of nodes, inaccurate allowed
|
||||
* mask is acceptable.
|
||||
*/
|
||||
cpuset_nodes_allowed(memcg->css.cgroup, &allowed);
|
||||
nodes_and(*mask, *mask, allowed);
|
||||
}
|
||||
|
||||
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
|
||||
|
||||
@@ -86,7 +86,7 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
|
||||
gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
|
||||
idx >>= huge_page_order(h);
|
||||
|
||||
nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
|
||||
nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS);
|
||||
if (nr_resv < 0)
|
||||
return ERR_PTR(nr_resv);
|
||||
|
||||
@@ -463,12 +463,12 @@ struct file *memfd_alloc_file(const char *name, unsigned int flags)
|
||||
int err = 0;
|
||||
|
||||
if (flags & MFD_HUGETLB) {
|
||||
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
|
||||
file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT),
|
||||
HUGETLB_ANONHUGE_INODE,
|
||||
(flags >> MFD_HUGE_SHIFT) &
|
||||
MFD_HUGE_MASK);
|
||||
} else {
|
||||
file = shmem_file_setup(name, 0, VM_NORESERVE);
|
||||
file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT));
|
||||
}
|
||||
if (IS_ERR(file))
|
||||
return file;
|
||||
|
||||
@@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
|
||||
/**
|
||||
* next_demotion_node() - Get the next node in the demotion path
|
||||
* @node: The starting node to lookup the next node
|
||||
* @allowed_mask: The pointer to allowed node mask
|
||||
*
|
||||
* Return: node id for next memory node in the demotion path hierarchy
|
||||
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
|
||||
* @node online or guarantee that it *continues* to be the next demotion
|
||||
* target.
|
||||
*/
|
||||
int next_demotion_node(int node)
|
||||
int next_demotion_node(int node, const nodemask_t *allowed_mask)
|
||||
{
|
||||
struct demotion_nodes *nd;
|
||||
int target;
|
||||
nodemask_t mask;
|
||||
|
||||
if (!node_demotion)
|
||||
return NUMA_NO_NODE;
|
||||
@@ -344,6 +345,10 @@ int next_demotion_node(int node)
|
||||
* node_demotion[] reads need to be consistent.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
/* Filter out nodes that are not in allowed_mask. */
|
||||
nodes_and(mask, nd->preferred, *allowed_mask);
|
||||
rcu_read_unlock();
|
||||
|
||||
/*
|
||||
* If there are multiple target nodes, just select one
|
||||
* target node randomly.
|
||||
@@ -356,10 +361,16 @@ int next_demotion_node(int node)
|
||||
* caching issue, which seems more complicated. So selecting
|
||||
* target node randomly seems better until now.
|
||||
*/
|
||||
target = node_random(&nd->preferred);
|
||||
rcu_read_unlock();
|
||||
if (!nodes_empty(mask))
|
||||
return node_random(&mask);
|
||||
|
||||
return target;
|
||||
/*
|
||||
* Preferred nodes are not in allowed_mask. Flip bits in
|
||||
* allowed_mask as used node mask. Then, use it to get the
|
||||
* closest demotion target.
|
||||
*/
|
||||
nodes_complement(mask, *allowed_mask);
|
||||
return find_next_best_node(node, &mask);
|
||||
}
|
||||
|
||||
static void disable_all_demotion_targets(void)
|
||||
|
||||
101
mm/memory.c
101
mm/memory.c
@@ -370,11 +370,32 @@ void free_pgd_range(struct mmu_gather *tlb,
|
||||
} while (pgd++, addr = next, addr != end);
|
||||
}
|
||||
|
||||
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
|
||||
struct vm_area_struct *vma, unsigned long floor,
|
||||
unsigned long ceiling, bool mm_wr_locked)
|
||||
/**
|
||||
* free_pgtables() - Free a range of page tables
|
||||
* @tlb: The mmu gather
|
||||
* @unmap: The unmap_desc
|
||||
*
|
||||
* Note: pg_start and pg_end are provided to indicate the absolute range of the
|
||||
* page tables that should be removed. This can differ from the vma mappings on
|
||||
* some archs that may have mappings that need to be removed outside the vmas.
|
||||
* Note that the prev->vm_end and next->vm_start are often used.
|
||||
*
|
||||
* The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
|
||||
* unrelated data to the mm_struct being torn down.
|
||||
*/
|
||||
void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
|
||||
{
|
||||
struct unlink_vma_file_batch vb;
|
||||
struct ma_state *mas = unmap->mas;
|
||||
struct vm_area_struct *vma = unmap->first;
|
||||
|
||||
/*
|
||||
* Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
|
||||
* may be 0. Underflow is expected in this case. Otherwise the
|
||||
* pagetable end is exclusive. vma_end is exclusive. The last vma
|
||||
* address should never be larger than the pagetable end.
|
||||
*/
|
||||
WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1);
|
||||
|
||||
tlb_free_vmas(tlb);
|
||||
|
||||
@@ -382,19 +403,13 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
|
||||
unsigned long addr = vma->vm_start;
|
||||
struct vm_area_struct *next;
|
||||
|
||||
/*
|
||||
* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
|
||||
* be 0. This will underflow and is okay.
|
||||
*/
|
||||
next = mas_find(mas, ceiling - 1);
|
||||
if (unlikely(xa_is_zero(next)))
|
||||
next = NULL;
|
||||
next = mas_find(mas, unmap->tree_end - 1);
|
||||
|
||||
/*
|
||||
* Hide vma from rmap and truncate_pagecache before freeing
|
||||
* pgtables
|
||||
*/
|
||||
if (mm_wr_locked)
|
||||
if (unmap->mm_wr_locked)
|
||||
vma_start_write(vma);
|
||||
unlink_anon_vmas(vma);
|
||||
|
||||
@@ -406,18 +421,16 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
|
||||
*/
|
||||
while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
|
||||
vma = next;
|
||||
next = mas_find(mas, ceiling - 1);
|
||||
if (unlikely(xa_is_zero(next)))
|
||||
next = NULL;
|
||||
if (mm_wr_locked)
|
||||
next = mas_find(mas, unmap->tree_end - 1);
|
||||
if (unmap->mm_wr_locked)
|
||||
vma_start_write(vma);
|
||||
unlink_anon_vmas(vma);
|
||||
unlink_file_vma_batch_add(&vb, vma);
|
||||
}
|
||||
unlink_file_vma_batch_final(&vb);
|
||||
|
||||
free_pgd_range(tlb, addr, vma->vm_end,
|
||||
floor, next ? next->vm_start : ceiling);
|
||||
free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start,
|
||||
next ? next->vm_start : unmap->pg_end);
|
||||
vma = next;
|
||||
} while (vma);
|
||||
}
|
||||
@@ -2124,11 +2137,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
|
||||
/**
|
||||
* unmap_vmas - unmap a range of memory covered by a list of vma's
|
||||
* @tlb: address of the caller's struct mmu_gather
|
||||
* @mas: the maple state
|
||||
* @vma: the starting vma
|
||||
* @start_addr: virtual address at which to start unmapping
|
||||
* @end_addr: virtual address at which to end unmapping
|
||||
* @tree_end: The maximum index to check
|
||||
* @unmap: The unmap_desc
|
||||
*
|
||||
* Unmap all pages in the vma list.
|
||||
*
|
||||
@@ -2141,10 +2150,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
|
||||
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
|
||||
* drops the lock and schedules.
|
||||
*/
|
||||
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
|
||||
struct vm_area_struct *vma, unsigned long start_addr,
|
||||
unsigned long end_addr, unsigned long tree_end)
|
||||
void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
struct mmu_notifier_range range;
|
||||
struct zap_details details = {
|
||||
.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
|
||||
@@ -2152,17 +2160,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
|
||||
.even_cows = true,
|
||||
};
|
||||
|
||||
vma = unmap->first;
|
||||
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
|
||||
start_addr, end_addr);
|
||||
unmap->vma_start, unmap->vma_end);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
do {
|
||||
unsigned long start = start_addr;
|
||||
unsigned long end = end_addr;
|
||||
unsigned long start = unmap->vma_start;
|
||||
unsigned long end = unmap->vma_end;
|
||||
hugetlb_zap_begin(vma, &start, &end);
|
||||
unmap_single_vma(tlb, vma, start, end, &details);
|
||||
hugetlb_zap_end(vma, &details);
|
||||
vma = mas_find(mas, tree_end - 1);
|
||||
} while (vma && likely(!xa_is_zero(vma)));
|
||||
vma = mas_find(unmap->mas, unmap->tree_end - 1);
|
||||
} while (vma);
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
}
|
||||
|
||||
@@ -2948,7 +2957,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
|
||||
static int get_remap_pgoff(bool is_cow, unsigned long addr,
|
||||
unsigned long end, unsigned long vm_start, unsigned long vm_end,
|
||||
unsigned long pfn, pgoff_t *vm_pgoff_p)
|
||||
{
|
||||
@@ -2958,7 +2967,7 @@ static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
|
||||
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
|
||||
* See vm_normal_page() for details.
|
||||
*/
|
||||
if (is_cow_mapping(vm_flags)) {
|
||||
if (is_cow) {
|
||||
if (addr != vm_start || end != vm_end)
|
||||
return -EINVAL;
|
||||
*vm_pgoff_p = pfn;
|
||||
@@ -2979,7 +2988,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
|
||||
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
|
||||
return -EINVAL;
|
||||
|
||||
VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
|
||||
VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS));
|
||||
|
||||
BUG_ON(addr >= end);
|
||||
pfn -= addr >> PAGE_SHIFT;
|
||||
@@ -3103,9 +3112,9 @@ void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
|
||||
* check it again on complete and will fail there if specified addr is
|
||||
* invalid.
|
||||
*/
|
||||
get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
|
||||
get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end,
|
||||
desc->start, desc->end, pfn, &desc->pgoff);
|
||||
desc->vm_flags |= VM_REMAP_FLAGS;
|
||||
vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS);
|
||||
}
|
||||
|
||||
static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
|
||||
@@ -3114,13 +3123,12 @@ static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long
|
||||
unsigned long end = addr + PAGE_ALIGN(size);
|
||||
int err;
|
||||
|
||||
err = get_remap_pgoff(vma->vm_flags, addr, end,
|
||||
vma->vm_start, vma->vm_end,
|
||||
pfn, &vma->vm_pgoff);
|
||||
err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end,
|
||||
vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
vm_flags_set(vma, VM_REMAP_FLAGS);
|
||||
vma_set_flags_mask(vma, VMA_REMAP_FLAGS);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -7316,7 +7324,7 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
|
||||
const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio));
|
||||
const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE;
|
||||
const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1);
|
||||
const int radius = FOLIO_ZERO_LOCALITY_RADIUS;
|
||||
const long radius = FOLIO_ZERO_LOCALITY_RADIUS;
|
||||
struct range r[3];
|
||||
int i;
|
||||
|
||||
@@ -7324,20 +7332,19 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
|
||||
* Faulting page and its immediate neighbourhood. Will be cleared at the
|
||||
* end to keep its cachelines hot.
|
||||
*/
|
||||
r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end),
|
||||
clamp_t(s64, fault_idx + radius, pg.start, pg.end));
|
||||
r[2] = DEFINE_RANGE(fault_idx - radius < (long)pg.start ? pg.start : fault_idx - radius,
|
||||
fault_idx + radius > (long)pg.end ? pg.end : fault_idx + radius);
|
||||
|
||||
|
||||
/* Region to the left of the fault */
|
||||
r[1] = DEFINE_RANGE(pg.start,
|
||||
clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start));
|
||||
r[1] = DEFINE_RANGE(pg.start, r[2].start - 1);
|
||||
|
||||
/* Region to the right of the fault: always valid for the common fault_idx=0 case. */
|
||||
r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1),
|
||||
pg.end);
|
||||
r[0] = DEFINE_RANGE(r[2].end + 1, pg.end);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(r); i++) {
|
||||
const unsigned long addr = base_addr + r[i].start * PAGE_SIZE;
|
||||
const unsigned int nr_pages = range_len(&r[i]);
|
||||
const long nr_pages = (long)range_len(&r[i]);
|
||||
struct page *page = folio_page(folio, r[i].start);
|
||||
|
||||
if (nr_pages > 0)
|
||||
|
||||
110
mm/mmap.c
110
mm/mmap.c
@@ -108,7 +108,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
|
||||
if (IS_ERR_VALUE(mapped_addr))
|
||||
return mapped_addr;
|
||||
|
||||
return mlock_future_ok(current->mm, current->mm->def_flags, len)
|
||||
return mlock_future_ok(current->mm,
|
||||
current->mm->def_flags & VM_LOCKED, len)
|
||||
? 0 : -EAGAIN;
|
||||
}
|
||||
|
||||
@@ -225,12 +226,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
|
||||
return hint;
|
||||
}
|
||||
|
||||
bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
|
||||
unsigned long bytes)
|
||||
bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
|
||||
unsigned long bytes)
|
||||
{
|
||||
unsigned long locked_pages, limit_pages;
|
||||
|
||||
if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
|
||||
if (!is_vma_locked || capable(CAP_IPC_LOCK))
|
||||
return true;
|
||||
|
||||
locked_pages = bytes >> PAGE_SHIFT;
|
||||
@@ -416,7 +417,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
|
||||
if (!can_do_mlock())
|
||||
return -EPERM;
|
||||
|
||||
if (!mlock_future_ok(mm, vm_flags, len))
|
||||
if (!mlock_future_ok(mm, vm_flags & VM_LOCKED, len))
|
||||
return -EAGAIN;
|
||||
|
||||
if (file) {
|
||||
@@ -594,7 +595,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
|
||||
* taken when vm_ops->mmap() is called
|
||||
*/
|
||||
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
|
||||
VM_NORESERVE,
|
||||
mk_vma_flags(VMA_NORESERVE_BIT),
|
||||
HUGETLB_ANONHUGE_INODE,
|
||||
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
|
||||
if (IS_ERR(file))
|
||||
@@ -1247,6 +1248,29 @@ limits_failed:
|
||||
}
|
||||
EXPORT_SYMBOL(vm_brk_flags);
|
||||
|
||||
static
|
||||
unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi,
|
||||
struct vm_area_struct *vma, unsigned long end)
|
||||
{
|
||||
unsigned long nr_accounted = 0;
|
||||
int count = 0;
|
||||
|
||||
mmap_assert_write_locked(mm);
|
||||
vma_iter_set(vmi, vma->vm_end);
|
||||
do {
|
||||
if (vma->vm_flags & VM_ACCOUNT)
|
||||
nr_accounted += vma_pages(vma);
|
||||
vma_mark_detached(vma);
|
||||
remove_vma(vma);
|
||||
count++;
|
||||
cond_resched();
|
||||
vma = vma_next(vmi);
|
||||
} while (vma && vma->vm_end <= end);
|
||||
|
||||
VM_WARN_ON_ONCE(count != mm->map_count);
|
||||
return nr_accounted;
|
||||
}
|
||||
|
||||
/* Release all mmaps. */
|
||||
void exit_mmap(struct mm_struct *mm)
|
||||
{
|
||||
@@ -1254,7 +1278,7 @@ void exit_mmap(struct mm_struct *mm)
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long nr_accounted = 0;
|
||||
VMA_ITERATOR(vmi, mm, 0);
|
||||
int count = 0;
|
||||
struct unmap_desc unmap;
|
||||
|
||||
/* mm's last user has gone, and it's about to be pulled down */
|
||||
mmu_notifier_release(mm);
|
||||
@@ -1263,18 +1287,19 @@ void exit_mmap(struct mm_struct *mm)
|
||||
arch_exit_mmap(mm);
|
||||
|
||||
vma = vma_next(&vmi);
|
||||
if (!vma || unlikely(xa_is_zero(vma))) {
|
||||
if (!vma) {
|
||||
/* Can happen if dup_mmap() received an OOM */
|
||||
mmap_read_unlock(mm);
|
||||
mmap_write_lock(mm);
|
||||
goto destroy;
|
||||
}
|
||||
|
||||
unmap_all_init(&unmap, &vmi, vma);
|
||||
flush_cache_mm(mm);
|
||||
tlb_gather_mmu_fullmm(&tlb, mm);
|
||||
/* update_hiwater_rss(mm) here? but nobody should be looking */
|
||||
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
|
||||
unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX);
|
||||
unmap_vmas(&tlb, &unmap);
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
/*
|
||||
@@ -1283,10 +1308,10 @@ void exit_mmap(struct mm_struct *mm)
|
||||
*/
|
||||
mm_flags_set(MMF_OOM_SKIP, mm);
|
||||
mmap_write_lock(mm);
|
||||
unmap.mm_wr_locked = true;
|
||||
mt_clear_in_rcu(&mm->mm_mt);
|
||||
vma_iter_set(&vmi, vma->vm_end);
|
||||
free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
|
||||
USER_PGTABLES_CEILING, true);
|
||||
unmap_pgtable_init(&unmap, &vmi);
|
||||
free_pgtables(&tlb, &unmap);
|
||||
tlb_finish_mmu(&tlb);
|
||||
|
||||
/*
|
||||
@@ -1294,22 +1319,11 @@ void exit_mmap(struct mm_struct *mm)
|
||||
* enabled, without holding any MM locks besides the unreachable
|
||||
* mmap_write_lock.
|
||||
*/
|
||||
vma_iter_set(&vmi, vma->vm_end);
|
||||
do {
|
||||
if (vma->vm_flags & VM_ACCOUNT)
|
||||
nr_accounted += vma_pages(vma);
|
||||
vma_mark_detached(vma);
|
||||
remove_vma(vma);
|
||||
count++;
|
||||
cond_resched();
|
||||
vma = vma_next(&vmi);
|
||||
} while (vma && likely(!xa_is_zero(vma)));
|
||||
nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX);
|
||||
|
||||
BUG_ON(count != mm->map_count);
|
||||
|
||||
trace_exit_mmap(mm);
|
||||
destroy:
|
||||
__mt_destroy(&mm->mm_mt);
|
||||
trace_exit_mmap(mm);
|
||||
mmap_write_unlock(mm);
|
||||
vm_unacct_memory(nr_accounted);
|
||||
}
|
||||
@@ -1840,20 +1854,46 @@ loop_out:
|
||||
ksm_fork(mm, oldmm);
|
||||
khugepaged_fork(mm, oldmm);
|
||||
} else {
|
||||
unsigned long end;
|
||||
|
||||
/*
|
||||
* The entire maple tree has already been duplicated. If the
|
||||
* mmap duplication fails, mark the failure point with
|
||||
* XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
|
||||
* stop releasing VMAs that have not been duplicated after this
|
||||
* point.
|
||||
* The entire maple tree has already been duplicated, but
|
||||
* replacing the vmas failed at mpnt (which could be NULL if
|
||||
* all were allocated but the last vma was not fully set up).
|
||||
* Use the start address of the failure point to clean up the
|
||||
* partially initialized tree.
|
||||
*/
|
||||
if (mpnt) {
|
||||
mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
|
||||
mas_store(&vmi.mas, XA_ZERO_ENTRY);
|
||||
/* Avoid OOM iterating a broken tree */
|
||||
mm_flags_set(MMF_OOM_SKIP, mm);
|
||||
if (!mm->map_count) {
|
||||
/* zero vmas were written to the new tree. */
|
||||
end = 0;
|
||||
} else if (mpnt) {
|
||||
/* partial tree failure */
|
||||
end = mpnt->vm_start;
|
||||
} else {
|
||||
/* All vmas were written to the new tree */
|
||||
end = ULONG_MAX;
|
||||
}
|
||||
|
||||
/* Hide mm from oom killer because the memory is being freed */
|
||||
mm_flags_set(MMF_OOM_SKIP, mm);
|
||||
if (end) {
|
||||
vma_iter_set(&vmi, 0);
|
||||
tmp = vma_next(&vmi);
|
||||
UNMAP_STATE(unmap, &vmi, /* first = */ tmp,
|
||||
/* vma_start = */ 0, /* vma_end = */ end,
|
||||
/* prev = */ NULL, /* next = */ NULL);
|
||||
|
||||
/*
|
||||
* Don't iterate over vmas beyond the failure point for
|
||||
* both unmap_vmas() and free_pgtables().
|
||||
*/
|
||||
unmap.tree_end = end;
|
||||
flush_cache_mm(mm);
|
||||
unmap_region(&unmap);
|
||||
charge = tear_down_vmas(mm, &vmi, tmp, end);
|
||||
vm_unacct_memory(charge);
|
||||
}
|
||||
__mt_destroy(&mm->mm_mt);
|
||||
/*
|
||||
* The mm_struct is going to exit, but the locks will be dropped
|
||||
* first. Setting the mm_struct as unstable is advisable as it is
|
||||
|
||||
@@ -1740,7 +1740,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
|
||||
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
|
||||
return -EFAULT;
|
||||
|
||||
if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
|
||||
if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta))
|
||||
return -EAGAIN;
|
||||
|
||||
if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
|
||||
|
||||
@@ -1429,6 +1429,7 @@ __always_inline bool __free_pages_prepare(struct page *page,
|
||||
|
||||
page_cpupid_reset_last(page);
|
||||
page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
page->private = 0;
|
||||
reset_page_owner(page, order);
|
||||
page_table_check_free(page, order);
|
||||
pgalloc_tag_sub(page, 1 << order);
|
||||
|
||||
38
mm/rmap.c
38
mm/rmap.c
@@ -913,9 +913,11 @@ static bool folio_referenced_one(struct folio *folio,
|
||||
struct folio_referenced_arg *pra = arg;
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
|
||||
int ptes = 0, referenced = 0;
|
||||
unsigned int nr;
|
||||
|
||||
while (page_vma_mapped_walk(&pvmw)) {
|
||||
address = pvmw.address;
|
||||
nr = 1;
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
ptes++;
|
||||
@@ -960,9 +962,21 @@ static bool folio_referenced_one(struct folio *folio,
|
||||
if (lru_gen_look_around(&pvmw))
|
||||
referenced++;
|
||||
} else if (pvmw.pte) {
|
||||
if (ptep_clear_flush_young_notify(vma, address,
|
||||
pvmw.pte))
|
||||
if (folio_test_large(folio)) {
|
||||
unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
|
||||
unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
|
||||
pte_t pteval = ptep_get(pvmw.pte);
|
||||
|
||||
nr = folio_pte_batch(folio, pvmw.pte,
|
||||
pteval, max_nr);
|
||||
}
|
||||
|
||||
ptes += nr;
|
||||
if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
|
||||
referenced++;
|
||||
/* Skip the batched PTEs */
|
||||
pvmw.pte += nr - 1;
|
||||
pvmw.address += (nr - 1) * PAGE_SIZE;
|
||||
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
||||
if (pmdp_clear_flush_young_notify(vma, address,
|
||||
pvmw.pmd))
|
||||
@@ -972,7 +986,15 @@ static bool folio_referenced_one(struct folio *folio,
|
||||
WARN_ON_ONCE(1);
|
||||
}
|
||||
|
||||
pra->mapcount--;
|
||||
pra->mapcount -= nr;
|
||||
/*
|
||||
* If we are sure that we batched the entire folio,
|
||||
* we can just optimize and stop right here.
|
||||
*/
|
||||
if (ptes == pvmw.nr_pages) {
|
||||
page_vma_mapped_walk_done(&pvmw);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (referenced)
|
||||
@@ -1923,12 +1945,16 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
|
||||
end_addr = pmd_addr_end(addr, vma->vm_end);
|
||||
max_nr = (end_addr - addr) >> PAGE_SHIFT;
|
||||
|
||||
/* We only support lazyfree batching for now ... */
|
||||
if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
|
||||
/* We only support lazyfree or file folios batching for now ... */
|
||||
if (folio_test_anon(folio) && folio_test_swapbacked(folio))
|
||||
return 1;
|
||||
|
||||
if (pte_unused(pte))
|
||||
return 1;
|
||||
|
||||
if (userfaultfd_wp(vma))
|
||||
return 1;
|
||||
|
||||
return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
|
||||
}
|
||||
|
||||
@@ -2291,7 +2317,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
|
||||
*
|
||||
* See Documentation/mm/mmu_notifier.rst
|
||||
*/
|
||||
dec_mm_counter(mm, mm_counter_file(folio));
|
||||
add_mm_counter(mm, mm_counter_file(folio), -nr_pages);
|
||||
}
|
||||
discard:
|
||||
if (unlikely(folio_test_hugetlb(folio))) {
|
||||
|
||||
@@ -122,13 +122,12 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc)
|
||||
{
|
||||
const unsigned long len = vma_desc_size(desc);
|
||||
|
||||
if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
|
||||
if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT))
|
||||
return -EINVAL;
|
||||
|
||||
if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
|
||||
vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT);
|
||||
if (!mlock_future_ok(desc->mm, /*is_vma_locked=*/ true, len))
|
||||
return -EAGAIN;
|
||||
|
||||
desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
|
||||
desc->vm_ops = &secretmem_vm_ops;
|
||||
|
||||
return 0;
|
||||
|
||||
61
mm/shmem.c
61
mm/shmem.c
@@ -3062,9 +3062,9 @@ static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
|
||||
}
|
||||
|
||||
static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
|
||||
struct super_block *sb,
|
||||
struct inode *dir, umode_t mode,
|
||||
dev_t dev, unsigned long flags)
|
||||
struct super_block *sb,
|
||||
struct inode *dir, umode_t mode,
|
||||
dev_t dev, vma_flags_t flags)
|
||||
{
|
||||
struct inode *inode;
|
||||
struct shmem_inode_info *info;
|
||||
@@ -3092,7 +3092,8 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
|
||||
spin_lock_init(&info->lock);
|
||||
atomic_set(&info->stop_eviction, 0);
|
||||
info->seals = F_SEAL_SEAL;
|
||||
info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
|
||||
info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT)
|
||||
? SHMEM_F_NORESERVE : 0;
|
||||
info->i_crtime = inode_get_mtime(inode);
|
||||
info->fsflags = (dir == NULL) ? 0 :
|
||||
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
|
||||
@@ -3145,7 +3146,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
|
||||
#ifdef CONFIG_TMPFS_QUOTA
|
||||
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
|
||||
struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t dev, unsigned long flags)
|
||||
umode_t mode, dev_t dev, vma_flags_t flags)
|
||||
{
|
||||
int err;
|
||||
struct inode *inode;
|
||||
@@ -3171,9 +3172,9 @@ errout:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
#else
|
||||
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
|
||||
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
|
||||
struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t dev, unsigned long flags)
|
||||
umode_t mode, dev_t dev, vma_flags_t flags)
|
||||
{
|
||||
return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
|
||||
}
|
||||
@@ -3880,7 +3881,8 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
|
||||
if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
|
||||
return -EINVAL;
|
||||
|
||||
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
|
||||
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev,
|
||||
mk_vma_flags(VMA_NORESERVE_BIT));
|
||||
if (IS_ERR(inode))
|
||||
return PTR_ERR(inode);
|
||||
|
||||
@@ -3915,7 +3917,8 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
|
||||
struct inode *inode;
|
||||
int error;
|
||||
|
||||
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
|
||||
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0,
|
||||
mk_vma_flags(VMA_NORESERVE_BIT));
|
||||
if (IS_ERR(inode)) {
|
||||
error = PTR_ERR(inode);
|
||||
goto err_out;
|
||||
@@ -4112,7 +4115,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
|
||||
return -ENAMETOOLONG;
|
||||
|
||||
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
|
||||
VM_NORESERVE);
|
||||
mk_vma_flags(VMA_NORESERVE_BIT));
|
||||
if (IS_ERR(inode))
|
||||
return PTR_ERR(inode);
|
||||
|
||||
@@ -5113,7 +5116,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
|
||||
#endif /* CONFIG_TMPFS_QUOTA */
|
||||
|
||||
inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
|
||||
S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
|
||||
S_IFDIR | sbinfo->mode, 0,
|
||||
mk_vma_flags(VMA_NORESERVE_BIT));
|
||||
if (IS_ERR(inode)) {
|
||||
error = PTR_ERR(inode);
|
||||
goto failed;
|
||||
@@ -5814,7 +5818,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
|
||||
|
||||
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
|
||||
struct super_block *sb, struct inode *dir,
|
||||
umode_t mode, dev_t dev, unsigned long flags)
|
||||
umode_t mode, dev_t dev, vma_flags_t flags)
|
||||
{
|
||||
struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
|
||||
return inode ? inode : ERR_PTR(-ENOSPC);
|
||||
@@ -5825,10 +5829,11 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
|
||||
/* common code */
|
||||
|
||||
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
|
||||
loff_t size, unsigned long vm_flags,
|
||||
loff_t size, vma_flags_t flags,
|
||||
unsigned int i_flags)
|
||||
{
|
||||
unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
|
||||
const unsigned long shmem_flags =
|
||||
vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0;
|
||||
struct inode *inode;
|
||||
struct file *res;
|
||||
|
||||
@@ -5841,13 +5846,13 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
|
||||
if (is_idmapped_mnt(mnt))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
if (shmem_acct_size(flags, size))
|
||||
if (shmem_acct_size(shmem_flags, size))
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
|
||||
S_IFREG | S_IRWXUGO, 0, vm_flags);
|
||||
S_IFREG | S_IRWXUGO, 0, flags);
|
||||
if (IS_ERR(inode)) {
|
||||
shmem_unacct_size(flags, size);
|
||||
shmem_unacct_size(shmem_flags, size);
|
||||
return ERR_CAST(inode);
|
||||
}
|
||||
inode->i_flags |= i_flags;
|
||||
@@ -5870,9 +5875,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
|
||||
* checks are provided at the key or shm level rather than the inode.
|
||||
* @name: name for dentry (to be seen in /proc/<pid>/maps)
|
||||
* @size: size to be set for the file
|
||||
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
|
||||
* @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
|
||||
*/
|
||||
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
|
||||
struct file *shmem_kernel_file_setup(const char *name, loff_t size,
|
||||
vma_flags_t flags)
|
||||
{
|
||||
return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
|
||||
}
|
||||
@@ -5882,9 +5888,9 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
|
||||
* shmem_file_setup - get an unlinked file living in tmpfs
|
||||
* @name: name for dentry (to be seen in /proc/<pid>/maps)
|
||||
* @size: size to be set for the file
|
||||
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
|
||||
* @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
|
||||
*/
|
||||
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
|
||||
struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags)
|
||||
{
|
||||
return __shmem_file_setup(shm_mnt, name, size, flags, 0);
|
||||
}
|
||||
@@ -5895,16 +5901,17 @@ EXPORT_SYMBOL_GPL(shmem_file_setup);
|
||||
* @mnt: the tmpfs mount where the file will be created
|
||||
* @name: name for dentry (to be seen in /proc/<pid>/maps)
|
||||
* @size: size to be set for the file
|
||||
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
|
||||
* @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
|
||||
*/
|
||||
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
|
||||
loff_t size, unsigned long flags)
|
||||
loff_t size, vma_flags_t flags)
|
||||
{
|
||||
return __shmem_file_setup(mnt, name, size, flags, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
|
||||
|
||||
static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
|
||||
static struct file *__shmem_zero_setup(unsigned long start, unsigned long end,
|
||||
vma_flags_t flags)
|
||||
{
|
||||
loff_t size = end - start;
|
||||
|
||||
@@ -5914,7 +5921,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v
|
||||
* accessible to the user through its mapping, use S_PRIVATE flag to
|
||||
* bypass file security, in the same way as shmem_kernel_file_setup().
|
||||
*/
|
||||
return shmem_kernel_file_setup("dev/zero", size, vm_flags);
|
||||
return shmem_kernel_file_setup("dev/zero", size, flags);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -5924,7 +5931,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v
|
||||
*/
|
||||
int shmem_zero_setup(struct vm_area_struct *vma)
|
||||
{
|
||||
struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
|
||||
struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags);
|
||||
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
@@ -5945,7 +5952,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
|
||||
*/
|
||||
int shmem_zero_setup_desc(struct vm_area_desc *desc)
|
||||
{
|
||||
struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
|
||||
struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags);
|
||||
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
@@ -1154,7 +1154,7 @@ int __compat_vma_mmap(const struct file_operations *f_op,
|
||||
|
||||
.pgoff = vma->vm_pgoff,
|
||||
.vm_file = vma->vm_file,
|
||||
.vm_flags = vma->vm_flags,
|
||||
.vma_flags = vma->flags,
|
||||
.page_prot = vma->vm_page_prot,
|
||||
|
||||
.action.type = MMAP_NOTHING, /* Default */
|
||||
|
||||
67
mm/vma.c
67
mm/vma.c
@@ -15,7 +15,10 @@ struct mmap_state {
|
||||
unsigned long end;
|
||||
pgoff_t pgoff;
|
||||
unsigned long pglen;
|
||||
vm_flags_t vm_flags;
|
||||
union {
|
||||
vm_flags_t vm_flags;
|
||||
vma_flags_t vma_flags;
|
||||
};
|
||||
struct file *file;
|
||||
pgprot_t page_prot;
|
||||
|
||||
@@ -472,19 +475,16 @@ void remove_vma(struct vm_area_struct *vma)
|
||||
*
|
||||
* Called with the mm semaphore held.
|
||||
*/
|
||||
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
|
||||
struct vm_area_struct *prev, struct vm_area_struct *next)
|
||||
void unmap_region(struct unmap_desc *unmap)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct mm_struct *mm = unmap->first->vm_mm;
|
||||
struct mmu_gather tlb;
|
||||
|
||||
tlb_gather_mmu(&tlb, mm);
|
||||
update_hiwater_rss(mm);
|
||||
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end);
|
||||
mas_set(mas, vma->vm_end);
|
||||
free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
|
||||
next ? next->vm_start : USER_PGTABLES_CEILING,
|
||||
/* mm_wr_locked = */ true);
|
||||
unmap_vmas(&tlb, unmap);
|
||||
mas_set(unmap->mas, unmap->tree_reset);
|
||||
free_pgtables(&tlb, unmap);
|
||||
tlb_finish_mmu(&tlb);
|
||||
}
|
||||
|
||||
@@ -1256,26 +1256,32 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
||||
static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
|
||||
struct ma_state *mas_detach, bool mm_wr_locked)
|
||||
{
|
||||
struct mmu_gather tlb;
|
||||
struct unmap_desc unmap = {
|
||||
.mas = mas_detach,
|
||||
.first = vms->vma,
|
||||
/* start and end may be different if there is no prev or next vma. */
|
||||
.pg_start = vms->unmap_start,
|
||||
.pg_end = vms->unmap_end,
|
||||
.vma_start = vms->start,
|
||||
.vma_end = vms->end,
|
||||
/*
|
||||
* The tree limits and reset differ from the normal case since it's a
|
||||
* side-tree
|
||||
*/
|
||||
.tree_reset = 1,
|
||||
.tree_end = vms->vma_count,
|
||||
/*
|
||||
* We can free page tables without write-locking mmap_lock because VMAs
|
||||
* were isolated before we downgraded mmap_lock.
|
||||
*/
|
||||
.mm_wr_locked = mm_wr_locked,
|
||||
};
|
||||
|
||||
if (!vms->clear_ptes) /* Nothing to do */
|
||||
return;
|
||||
|
||||
/*
|
||||
* We can free page tables without write-locking mmap_lock because VMAs
|
||||
* were isolated before we downgraded mmap_lock.
|
||||
*/
|
||||
mas_set(mas_detach, 1);
|
||||
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
|
||||
update_hiwater_rss(vms->vma->vm_mm);
|
||||
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
|
||||
vms->vma_count);
|
||||
|
||||
mas_set(mas_detach, 1);
|
||||
/* start and end may be different if there is no prev or next vma. */
|
||||
free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
|
||||
vms->unmap_end, mm_wr_locked);
|
||||
tlb_finish_mmu(&tlb);
|
||||
unmap_region(&unmap);
|
||||
vms->clear_ptes = false;
|
||||
}
|
||||
|
||||
@@ -2366,7 +2372,7 @@ static void set_desc_from_map(struct vm_area_desc *desc,
|
||||
|
||||
desc->pgoff = map->pgoff;
|
||||
desc->vm_file = map->file;
|
||||
desc->vm_flags = map->vm_flags;
|
||||
desc->vma_flags = map->vma_flags;
|
||||
desc->page_prot = map->page_prot;
|
||||
}
|
||||
|
||||
@@ -2461,13 +2467,14 @@ static int __mmap_new_file_vma(struct mmap_state *map,
|
||||
|
||||
error = mmap_file(vma->vm_file, vma);
|
||||
if (error) {
|
||||
UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end,
|
||||
map->prev, map->next);
|
||||
fput(vma->vm_file);
|
||||
vma->vm_file = NULL;
|
||||
|
||||
vma_iter_set(vmi, vma->vm_end);
|
||||
/* Undo any partial mapping done by a device driver. */
|
||||
unmap_region(&vmi->mas, vma, map->prev, map->next);
|
||||
|
||||
unmap_region(&unmap);
|
||||
return error;
|
||||
}
|
||||
|
||||
@@ -2646,7 +2653,7 @@ static int call_mmap_prepare(struct mmap_state *map,
|
||||
map->file_doesnt_need_get = true;
|
||||
map->file = desc->vm_file;
|
||||
}
|
||||
map->vm_flags = desc->vm_flags;
|
||||
map->vma_flags = desc->vma_flags;
|
||||
map->page_prot = desc->page_prot;
|
||||
/* User-defined fields. */
|
||||
map->vm_ops = desc->vm_ops;
|
||||
@@ -2819,7 +2826,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
return -EINVAL;
|
||||
|
||||
/* Map writable and ensure this isn't a sealed memfd. */
|
||||
if (file && is_shared_maywrite(vm_flags)) {
|
||||
if (file && is_shared_maywrite_vm_flags(vm_flags)) {
|
||||
int error = mapping_map_writable(file->f_mapping);
|
||||
|
||||
if (error)
|
||||
@@ -3049,7 +3056,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
|
||||
return -ENOMEM;
|
||||
|
||||
/* mlock limit tests */
|
||||
if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
|
||||
if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT))
|
||||
return -ENOMEM;
|
||||
|
||||
/* Check to ensure the stack will not grow into a hugetlb-only region */
|
||||
|
||||
73
mm/vma.h
73
mm/vma.h
@@ -155,6 +155,72 @@ struct vma_merge_struct {
|
||||
|
||||
};
|
||||
|
||||
/*
 * struct unmap_desc - parameters describing one unmap operation.
 *
 * Passed to unmap_region(), which hands it to unmap_vmas() and then to
 * free_pgtables(), resetting @mas to @tree_reset between the two calls.
 */
struct unmap_desc {
|
||||
struct ma_state *mas; /* The maple state, pointing at the first vma */
|
||||
struct vm_area_struct *first; /* The first vma */
|
||||
unsigned long pg_start; /* The first pagetable address to free (floor) */
|
||||
unsigned long pg_end; /* The last pagetable address to free (ceiling) */
|
||||
unsigned long vma_start; /* The min vma address */
|
||||
unsigned long vma_end; /* The max vma address */
|
||||
unsigned long tree_end; /* Maximum for the vma tree search */
|
||||
unsigned long tree_reset; /* Where to reset the vma tree walk */
|
||||
bool mm_wr_locked; /* If the mmap write lock is held */
|
||||
};
|
||||
|
||||
/*
 * unmap_all_init() - Initialize unmap_desc to remove all vmas.
 * @unmap: descriptor to fill in; every field is overwritten.
 * @vmi: vma iterator whose maple state (&vmi->mas) is stored in the descriptor.
 * @vma: the first vma; its vm_end also becomes the tree walk reset point.
 *
 * pg_start and pg_end are pointed at a safe location, FIRST_USER_ADDRESS and
 * USER_PGTABLES_CEILING respectively.  The vma range is set to 0..ULONG_MAX,
 * the tree search limit to ULONG_MAX, and mm_wr_locked to false.
 */
|
||||
static inline void unmap_all_init(struct unmap_desc *unmap,
|
||||
struct vma_iterator *vmi, struct vm_area_struct *vma)
|
||||
{
|
||||
unmap->mas = &vmi->mas;
|
||||
unmap->first = vma;
|
||||
unmap->pg_start = FIRST_USER_ADDRESS;
|
||||
unmap->pg_end = USER_PGTABLES_CEILING;
|
||||
unmap->vma_start = 0;
|
||||
unmap->vma_end = ULONG_MAX;
|
||||
unmap->tree_end = ULONG_MAX;
|
||||
unmap->tree_reset = vma->vm_end;
|
||||
unmap->mm_wr_locked = false;
|
||||
}
|
||||
|
||||
/*
 * unmap_pgtable_init() - Initialize unmap_desc to remove all page tables within
 * the user range.
 * @unmap: descriptor to adjust; only vma_start, vma_end and tree_end are
 *         rewritten here, and tree_reset is read.
 * @vmi: vma iterator, repositioned to unmap->tree_reset.
 *
 * The remaining fields keep whatever the caller stored earlier.
 * NOTE(review): presumably this follows unmap_all_init() on the same
 * descriptor - confirm against the callers.
 *
 * ARM can have mappings outside of vmas.
 * See: e2cdef8c847b4 ("[PATCH] freepgt: free_pgtables from FIRST_USER_ADDRESS")
 *
 * ARM LPAE uses page table mappings beyond the USER_PGTABLES_CEILING
 * See: CONFIG_ARM_LPAE in arch/arm/include/asm/pgtable.h
 */
|
||||
static inline void unmap_pgtable_init(struct unmap_desc *unmap,
|
||||
struct vma_iterator *vmi)
|
||||
{
|
||||
vma_iter_set(vmi, unmap->tree_reset);
|
||||
unmap->vma_start = FIRST_USER_ADDRESS;
|
||||
unmap->vma_end = USER_PGTABLES_CEILING;
|
||||
unmap->tree_end = USER_PGTABLES_CEILING;
|
||||
}
|
||||
|
||||
/*
 * UNMAP_STATE() - declare and initialise a struct unmap_desc called @name.
 *
 * The maple state is taken from @_vmi and @_vma becomes the first vma.  The
 * page-table floor is @_prev's vm_end, or FIRST_USER_ADDRESS when @_prev is
 * NULL.  The page-table ceiling and the tree search limit are both @_next's
 * vm_start, or USER_PGTABLES_CEILING when @_next is NULL.  The vma range is
 * @_vma_start..@_vma_end, the tree walk is reset to @_vma's vm_end, and
 * mm_wr_locked is always true.
 *
 * NOTE(review): @_vma, @_prev and @_next are each expanded more than once and
 * are not parenthesised, so pass plain, side-effect-free pointer expressions.
 */
#define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \
|
||||
struct unmap_desc name = { \
|
||||
.mas = &(_vmi)->mas, \
|
||||
.first = _vma, \
|
||||
.pg_start = _prev ? ((struct vm_area_struct *)_prev)->vm_end : \
|
||||
FIRST_USER_ADDRESS, \
|
||||
.pg_end = _next ? ((struct vm_area_struct *)_next)->vm_start : \
|
||||
USER_PGTABLES_CEILING, \
|
||||
.vma_start = _vma_start, \
|
||||
.vma_end = _vma_end, \
|
||||
.tree_end = _next ? \
|
||||
((struct vm_area_struct *)_next)->vm_start : \
|
||||
USER_PGTABLES_CEILING, \
|
||||
.tree_reset = _vma->vm_end, \
|
||||
.mm_wr_locked = true, \
|
||||
}
|
||||
|
||||
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
|
||||
{
|
||||
return vmg->state == VMA_MERGE_ERROR_NOMEM;
|
||||
@@ -243,8 +309,7 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma,
|
||||
vma->vm_pgoff = desc->pgoff;
|
||||
if (desc->vm_file != vma->vm_file)
|
||||
vma_set_file(vma, desc->vm_file);
|
||||
if (desc->vm_flags != vma->vm_flags)
|
||||
vm_flags_set(vma, desc->vm_flags);
|
||||
vma->flags = desc->vma_flags;
|
||||
vma->vm_page_prot = desc->page_prot;
|
||||
|
||||
/* User-defined fields. */
|
||||
@@ -262,9 +327,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
|
||||
bool unlock);
|
||||
|
||||
void remove_vma(struct vm_area_struct *vma);
|
||||
|
||||
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
|
||||
struct vm_area_struct *prev, struct vm_area_struct *next);
|
||||
void unmap_region(struct unmap_desc *unmap);
|
||||
|
||||
/**
|
||||
* vma_modify_flags() - Perform any necessary split/merge in preparation for
|
||||
|
||||
@@ -46,6 +46,7 @@
|
||||
#include <linux/swap.h>
|
||||
#include <linux/uprobes.h>
|
||||
#include <linux/userfaultfd_k.h>
|
||||
#include <linux/pgtable.h>
|
||||
|
||||
#include <asm/current.h>
|
||||
#include <asm/tlb.h>
|
||||
|
||||
33
mm/vmscan.c
33
mm/vmscan.c
@@ -343,19 +343,21 @@ static void flush_reclaim_state(struct scan_control *sc)
|
||||
static bool can_demote(int nid, struct scan_control *sc,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
int demotion_nid;
|
||||
struct pglist_data *pgdat = NODE_DATA(nid);
|
||||
nodemask_t allowed_mask;
|
||||
|
||||
if (!numa_demotion_enabled)
|
||||
if (!pgdat || !numa_demotion_enabled)
|
||||
return false;
|
||||
if (sc && sc->no_demotion)
|
||||
return false;
|
||||
|
||||
demotion_nid = next_demotion_node(nid);
|
||||
if (demotion_nid == NUMA_NO_NODE)
|
||||
node_get_allowed_targets(pgdat, &allowed_mask);
|
||||
if (nodes_empty(allowed_mask))
|
||||
return false;
|
||||
|
||||
/* If demotion node isn't in the cgroup's mems_allowed, fall back */
|
||||
return mem_cgroup_node_allowed(memcg, demotion_nid);
|
||||
/* Filter out nodes that are not in cgroup's mems_allowed. */
|
||||
mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
|
||||
return !nodes_empty(allowed_mask);
|
||||
}
|
||||
|
||||
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
|
||||
@@ -1017,9 +1019,10 @@ static struct folio *alloc_demote_folio(struct folio *src,
|
||||
* Folios which are not demoted are left on @demote_folios.
|
||||
*/
|
||||
static unsigned int demote_folio_list(struct list_head *demote_folios,
|
||||
struct pglist_data *pgdat)
|
||||
struct pglist_data *pgdat,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
int target_nid = next_demotion_node(pgdat->node_id);
|
||||
int target_nid;
|
||||
unsigned int nr_succeeded;
|
||||
nodemask_t allowed_mask;
|
||||
|
||||
@@ -1031,7 +1034,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
|
||||
*/
|
||||
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
|
||||
__GFP_NOMEMALLOC | GFP_NOWAIT,
|
||||
.nid = target_nid,
|
||||
.nmask = &allowed_mask,
|
||||
.reason = MR_DEMOTION,
|
||||
};
|
||||
@@ -1039,10 +1041,17 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
|
||||
if (list_empty(demote_folios))
|
||||
return 0;
|
||||
|
||||
if (target_nid == NUMA_NO_NODE)
|
||||
node_get_allowed_targets(pgdat, &allowed_mask);
|
||||
mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
|
||||
if (nodes_empty(allowed_mask))
|
||||
return 0;
|
||||
|
||||
node_get_allowed_targets(pgdat, &allowed_mask);
|
||||
target_nid = next_demotion_node(pgdat->node_id, &allowed_mask);
|
||||
if (target_nid == NUMA_NO_NODE)
|
||||
/* No lower-tier nodes or nodes were hot-unplugged. */
|
||||
return 0;
|
||||
|
||||
mtc.nid = target_nid;
|
||||
|
||||
/* Demotion ignores all cpuset and mempolicy settings */
|
||||
migrate_pages(demote_folios, alloc_demote_folio, NULL,
|
||||
@@ -1564,7 +1573,7 @@ keep:
|
||||
/* 'folio_list' is always empty here */
|
||||
|
||||
/* Migrate folios selected for demotion */
|
||||
nr_demoted = demote_folio_list(&demote_folios, pgdat);
|
||||
nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
|
||||
nr_reclaimed += nr_demoted;
|
||||
stat->nr_demoted += nr_demoted;
|
||||
/* Folios that could not be demoted are still in @demote_folios */
|
||||
|
||||
Reference in New Issue
Block a user