Merge tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:

 - "mm/vmscan: fix demotion targets checks in reclaim/demotion" fixes a
   couple of issues in the demotion code - pages were failing demotion
   and were finding themselves demoted into disallowed nodes (Bing Jiao)

 - "Remove XA_ZERO from error recovery of dup_mmap()" fixes a rare
   maple tree race and performs a number of cleanups (Liam Howlett)

 - "mm: add bitmap VMA flag helpers and convert all mmap_prepare to use
   them" implements a lot of cleanups following on from the conversion
   of the VMA flags into a bitmap (Lorenzo Stoakes)

 - "support batch checking of references and unmapping for large folios"
   implements batching to greatly improve the performance of reclaiming
   clean file-backed large folios (Baolin Wang)

 - "selftests/mm: add memory failure selftests" does as claimed (Miaohe
   Lin)

* tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (36 commits)
  mm/page_alloc: clear page->private in free_pages_prepare()
  selftests/mm: add memory failure dirty pagecache test
  selftests/mm: add memory failure clean pagecache test
  selftests/mm: add memory failure anonymous page test
  mm: rmap: support batched unmapping for file large folios
  arm64: mm: implement the architecture-specific clear_flush_young_ptes()
  arm64: mm: support batch clearing of the young flag for large folios
  arm64: mm: factor out the address and ptep alignment into a new helper
  mm: rmap: support batched checks of the references for large folios
  tools/testing/vma: add VMA userland tests for VMA flag functions
  tools/testing/vma: separate out vma_internal.h into logical headers
  tools/testing/vma: separate VMA userland tests into separate files
  mm: make vm_area_desc utilise vma_flags_t only
  mm: update all remaining mmap_prepare users to use vma_flags_t
  mm: update shmem_[kernel]_file_*() functions to use vma_flags_t
  mm: update secretmem to use VMA flags on mmap_prepare
  mm: update hugetlbfs to use VMA flags on mmap_prepare
  mm: add basic VMA flag operation helper functions
  tools: bitmap: add missing bitmap_[subset(), andnot()]
  mm: add mk_vma_flags() bitmap flag macro helper
  ...
This commit is contained in:
Linus Torvalds
2026-02-18 20:50:32 -08:00
82 changed files with 3941 additions and 2521 deletions

View File

@@ -4012,7 +4012,7 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
{
if (is_shared_maywrite(desc->vm_flags))
if (is_shared_maywrite(&desc->vma_flags))
return -EINVAL;
return generic_file_mmap_prepare(desc);
}

View File

@@ -1193,16 +1193,16 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
{
VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT));
desc->private_data = map;
}
static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
{
VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT));
desc->private_data = (void *)((unsigned long)desc->private_data | flags);
}
@@ -1216,7 +1216,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
{
VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
return ((unsigned long)desc->private_data) & flag;
}
@@ -6571,7 +6571,7 @@ next:
long hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_desc *desc,
vm_flags_t vm_flags)
vma_flags_t vma_flags)
{
long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
@@ -6592,7 +6592,7 @@ long hugetlb_reserve_pages(struct inode *inode,
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT))
return 0;
/*
@@ -6601,7 +6601,7 @@ long hugetlb_reserve_pages(struct inode *inode,
* to reserve the full area even if read-only as mprotect() may be
* called to make the mapping read-write. Assume !desc is a shm mapping
*/
if (!desc || desc->vm_flags & VM_MAYSHARE) {
if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) {
/*
* resv_map can not be NULL as hugetlb_reserve_pages is only
* called for inodes for which resv_maps were created (see
@@ -6635,7 +6635,7 @@ long hugetlb_reserve_pages(struct inode *inode,
if (err < 0)
goto out_err;
if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
* of the resv_map.
*/
@@ -6672,7 +6672,7 @@ long hugetlb_reserve_pages(struct inode *inode,
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
if (!desc || desc->vm_flags & VM_MAYSHARE) {
if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) {
add = region_add(resv_map, from, to, regions_needed, h, h_cg);
if (unlikely(add < 0)) {
@@ -6736,7 +6736,7 @@ out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
if (!desc || desc->vm_flags & VM_MAYSHARE)
if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT))
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
*/

View File

@@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma)
}
}
/* unmap_vmas is in mm/memory.c */
void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
@@ -509,9 +512,8 @@ bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked);
void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
@@ -1044,7 +1046,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked);
bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
unsigned long bytes);
/*

View File

@@ -1732,7 +1732,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
* obtained on guard region installation after the flag is set, so this
* check being performed under this lock excludes races.
*/
if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT))
return false;
return true;

View File

@@ -1140,7 +1140,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
* acquire an mmap/VMA write lock to read it. All remaining readers may
* or may not see the flag set, but we don't care.
*/
vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);
vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT);
/*
* If anonymous and we are establishing page tables the VMA ought to

View File

@@ -5649,9 +5649,21 @@ subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
{
return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
nodemask_t allowed;
if (!memcg)
return;
/*
 * Since this interface is intended for use by migration paths, and
 * reclaim and migration are subject to race conditions such as changes
 * in effective_mems and hot-unplugging of nodes, an inaccurate allowed
 * mask is acceptable.
 */
cpuset_nodes_allowed(memcg->css.cgroup, &allowed);
nodes_and(*mask, *mask, allowed);
}
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)

View File

@@ -86,7 +86,7 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
idx >>= huge_page_order(h);
nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS);
if (nr_resv < 0)
return ERR_PTR(nr_resv);
@@ -463,12 +463,12 @@ struct file *memfd_alloc_file(const char *name, unsigned int flags)
int err = 0;
if (flags & MFD_HUGETLB) {
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT),
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
} else {
file = shmem_file_setup(name, 0, VM_NORESERVE);
file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT));
}
if (IS_ERR(file))
return file;

View File

@@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
/**
* next_demotion_node() - Get the next node in the demotion path
* @node: The starting node to lookup the next node
* @allowed_mask: The pointer to allowed node mask
*
* Return: node id for next memory node in the demotion path hierarchy
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
* @node online or guarantee that it *continues* to be the next demotion
* target.
*/
int next_demotion_node(int node)
int next_demotion_node(int node, const nodemask_t *allowed_mask)
{
struct demotion_nodes *nd;
int target;
nodemask_t mask;
if (!node_demotion)
return NUMA_NO_NODE;
@@ -344,6 +345,10 @@ int next_demotion_node(int node)
* node_demotion[] reads need to be consistent.
*/
rcu_read_lock();
/* Filter out nodes that are not in allowed_mask. */
nodes_and(mask, nd->preferred, *allowed_mask);
rcu_read_unlock();
/*
* If there are multiple target nodes, just select one
* target node randomly.
@@ -356,10 +361,16 @@ int next_demotion_node(int node)
* caching issue, which seems more complicated. So selecting
* target node randomly seems better until now.
*/
target = node_random(&nd->preferred);
rcu_read_unlock();
if (!nodes_empty(mask))
return node_random(&mask);
return target;
/*
* Preferred nodes are not in allowed_mask. Flip bits in
* allowed_mask as used node mask. Then, use it to get the
* closest demotion target.
*/
nodes_complement(mask, *allowed_mask);
return find_next_best_node(node, &mask);
}
static void disable_all_demotion_targets(void)

View File

@@ -370,11 +370,32 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked)
/**
* free_pgtables() - Free a range of page tables
* @tlb: The mmu gather
* @unmap: The unmap_desc
*
* Note: pg_start and pg_end are provided to indicate the absolute range of the
* page tables that should be removed. This can differ from the vma mappings on
* some archs that may have mappings that need to be removed outside the vmas.
* Note that the prev->vm_end and next->vm_start are often used.
*
* The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
* unrelated data to the mm_struct being torn down.
*/
void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
{
struct unlink_vma_file_batch vb;
struct ma_state *mas = unmap->mas;
struct vm_area_struct *vma = unmap->first;
/*
* Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
* may be 0. Underflow is expected in this case. Otherwise the
* pagetable end is exclusive. vma_end is exclusive. The last vma
* address should never be larger than the pagetable end.
*/
WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1);
tlb_free_vmas(tlb);
@@ -382,19 +403,13 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
/*
* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
* be 0. This will underflow and is okay.
*/
next = mas_find(mas, ceiling - 1);
if (unlikely(xa_is_zero(next)))
next = NULL;
next = mas_find(mas, unmap->tree_end - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
if (mm_wr_locked)
if (unmap->mm_wr_locked)
vma_start_write(vma);
unlink_anon_vmas(vma);
@@ -406,18 +421,16 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
vma = next;
next = mas_find(mas, ceiling - 1);
if (unlikely(xa_is_zero(next)))
next = NULL;
if (mm_wr_locked)
next = mas_find(mas, unmap->tree_end - 1);
if (unmap->mm_wr_locked)
vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma_batch_add(&vb, vma);
}
unlink_file_vma_batch_final(&vb);
free_pgd_range(tlb, addr, vma->vm_end,
floor, next ? next->vm_start : ceiling);
free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start,
next ? next->vm_start : unmap->pg_end);
vma = next;
} while (vma);
}
@@ -2124,11 +2137,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
* @mas: the maple state
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @tree_end: The maximum index to check
* @unmap: The unmap_desc
*
* Unmap all pages in the vma list.
*
@@ -2141,10 +2150,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long tree_end)
void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
{
struct vm_area_struct *vma;
struct mmu_notifier_range range;
struct zap_details details = {
.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
@@ -2152,17 +2160,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
.even_cows = true,
};
vma = unmap->first;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
start_addr, end_addr);
unmap->vma_start, unmap->vma_end);
mmu_notifier_invalidate_range_start(&range);
do {
unsigned long start = start_addr;
unsigned long end = end_addr;
unsigned long start = unmap->vma_start;
unsigned long end = unmap->vma_end;
hugetlb_zap_begin(vma, &start, &end);
unmap_single_vma(tlb, vma, start, end, &details);
hugetlb_zap_end(vma, &details);
vma = mas_find(mas, tree_end - 1);
} while (vma && likely(!xa_is_zero(vma)));
vma = mas_find(unmap->mas, unmap->tree_end - 1);
} while (vma);
mmu_notifier_invalidate_range_end(&range);
}
@@ -2948,7 +2957,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
static int get_remap_pgoff(bool is_cow, unsigned long addr,
unsigned long end, unsigned long vm_start, unsigned long vm_end,
unsigned long pfn, pgoff_t *vm_pgoff_p)
{
@@ -2958,7 +2967,7 @@ static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
* See vm_normal_page() for details.
*/
if (is_cow_mapping(vm_flags)) {
if (is_cow) {
if (addr != vm_start || end != vm_end)
return -EINVAL;
*vm_pgoff_p = pfn;
@@ -2979,7 +2988,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
return -EINVAL;
VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS));
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -3103,9 +3112,9 @@ void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
* check it again on complete and will fail there if specified addr is
* invalid.
*/
get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end,
desc->start, desc->end, pfn, &desc->pgoff);
desc->vm_flags |= VM_REMAP_FLAGS;
vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS);
}
static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
@@ -3114,13 +3123,12 @@ static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long
unsigned long end = addr + PAGE_ALIGN(size);
int err;
err = get_remap_pgoff(vma->vm_flags, addr, end,
vma->vm_start, vma->vm_end,
pfn, &vma->vm_pgoff);
err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end,
vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff);
if (err)
return err;
vm_flags_set(vma, VM_REMAP_FLAGS);
vma_set_flags_mask(vma, VMA_REMAP_FLAGS);
return 0;
}
@@ -7316,7 +7324,7 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio));
const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE;
const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1);
const int radius = FOLIO_ZERO_LOCALITY_RADIUS;
const long radius = FOLIO_ZERO_LOCALITY_RADIUS;
struct range r[3];
int i;
@@ -7324,20 +7332,19 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
* Faulting page and its immediate neighbourhood. Will be cleared at the
* end to keep its cachelines hot.
*/
r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end),
clamp_t(s64, fault_idx + radius, pg.start, pg.end));
r[2] = DEFINE_RANGE(fault_idx - radius < (long)pg.start ? pg.start : fault_idx - radius,
fault_idx + radius > (long)pg.end ? pg.end : fault_idx + radius);
/* Region to the left of the fault */
r[1] = DEFINE_RANGE(pg.start,
clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start));
r[1] = DEFINE_RANGE(pg.start, r[2].start - 1);
/* Region to the right of the fault: always valid for the common fault_idx=0 case. */
r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1),
pg.end);
r[0] = DEFINE_RANGE(r[2].end + 1, pg.end);
for (i = 0; i < ARRAY_SIZE(r); i++) {
const unsigned long addr = base_addr + r[i].start * PAGE_SIZE;
const unsigned int nr_pages = range_len(&r[i]);
const long nr_pages = (long)range_len(&r[i]);
struct page *page = folio_page(folio, r[i].start);
if (nr_pages > 0)

110
mm/mmap.c
View File

@@ -108,7 +108,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
return mlock_future_ok(current->mm, current->mm->def_flags, len)
return mlock_future_ok(current->mm,
current->mm->def_flags & VM_LOCKED, len)
? 0 : -EAGAIN;
}
@@ -225,12 +226,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
unsigned long bytes)
bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
unsigned long bytes)
{
unsigned long locked_pages, limit_pages;
if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
if (!is_vma_locked || capable(CAP_IPC_LOCK))
return true;
locked_pages = bytes >> PAGE_SHIFT;
@@ -416,7 +417,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
if (!mlock_future_ok(mm, vm_flags, len))
if (!mlock_future_ok(mm, vm_flags & VM_LOCKED, len))
return -EAGAIN;
if (file) {
@@ -594,7 +595,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
* taken when vm_ops->mmap() is called
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
mk_vma_flags(VMA_NORESERVE_BIT),
HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
@@ -1247,6 +1248,29 @@ limits_failed:
}
EXPORT_SYMBOL(vm_brk_flags);
static
unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi,
struct vm_area_struct *vma, unsigned long end)
{
unsigned long nr_accounted = 0;
int count = 0;
mmap_assert_write_locked(mm);
vma_iter_set(vmi, vma->vm_end);
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma_mark_detached(vma);
remove_vma(vma);
count++;
cond_resched();
vma = vma_next(vmi);
} while (vma && vma->vm_end <= end);
VM_WARN_ON_ONCE(count != mm->map_count);
return nr_accounted;
}
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
@@ -1254,7 +1278,7 @@ void exit_mmap(struct mm_struct *mm)
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
VMA_ITERATOR(vmi, mm, 0);
int count = 0;
struct unmap_desc unmap;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
@@ -1263,18 +1287,19 @@ void exit_mmap(struct mm_struct *mm)
arch_exit_mmap(mm);
vma = vma_next(&vmi);
if (!vma || unlikely(xa_is_zero(vma))) {
if (!vma) {
/* Can happen if dup_mmap() received an OOM */
mmap_read_unlock(mm);
mmap_write_lock(mm);
goto destroy;
}
unmap_all_init(&unmap, &vmi, vma);
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX);
unmap_vmas(&tlb, &unmap);
mmap_read_unlock(mm);
/*
@@ -1283,10 +1308,10 @@ void exit_mmap(struct mm_struct *mm)
*/
mm_flags_set(MMF_OOM_SKIP, mm);
mmap_write_lock(mm);
unmap.mm_wr_locked = true;
mt_clear_in_rcu(&mm->mm_mt);
vma_iter_set(&vmi, vma->vm_end);
free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING, true);
unmap_pgtable_init(&unmap, &vmi);
free_pgtables(&tlb, &unmap);
tlb_finish_mmu(&tlb);
/*
@@ -1294,22 +1319,11 @@ void exit_mmap(struct mm_struct *mm)
* enabled, without holding any MM locks besides the unreachable
* mmap_write_lock.
*/
vma_iter_set(&vmi, vma->vm_end);
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma_mark_detached(vma);
remove_vma(vma);
count++;
cond_resched();
vma = vma_next(&vmi);
} while (vma && likely(!xa_is_zero(vma)));
nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX);
BUG_ON(count != mm->map_count);
trace_exit_mmap(mm);
destroy:
__mt_destroy(&mm->mm_mt);
trace_exit_mmap(mm);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -1840,20 +1854,46 @@ loop_out:
ksm_fork(mm, oldmm);
khugepaged_fork(mm, oldmm);
} else {
unsigned long end;
/*
* The entire maple tree has already been duplicated. If the
* mmap duplication fails, mark the failure point with
* XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
* stop releasing VMAs that have not been duplicated after this
* point.
* The entire maple tree has already been duplicated, but
* replacing the vmas failed at mpnt (which could be NULL if
* all were allocated but the last vma was not fully set up).
* Use the start address of the failure point to clean up the
* partially initialized tree.
*/
if (mpnt) {
mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
mas_store(&vmi.mas, XA_ZERO_ENTRY);
/* Avoid OOM iterating a broken tree */
mm_flags_set(MMF_OOM_SKIP, mm);
if (!mm->map_count) {
/* zero vmas were written to the new tree. */
end = 0;
} else if (mpnt) {
/* partial tree failure */
end = mpnt->vm_start;
} else {
/* All vmas were written to the new tree */
end = ULONG_MAX;
}
/* Hide mm from oom killer because the memory is being freed */
mm_flags_set(MMF_OOM_SKIP, mm);
if (end) {
vma_iter_set(&vmi, 0);
tmp = vma_next(&vmi);
UNMAP_STATE(unmap, &vmi, /* first = */ tmp,
/* vma_start = */ 0, /* vma_end = */ end,
/* prev = */ NULL, /* next = */ NULL);
/*
* Don't iterate over vmas beyond the failure point for
* both unmap_vma() and free_pgtables().
*/
unmap.tree_end = end;
flush_cache_mm(mm);
unmap_region(&unmap);
charge = tear_down_vmas(mm, &vmi, tmp, end);
vm_unacct_memory(charge);
}
__mt_destroy(&mm->mm_mt);
/*
* The mm_struct is going to exit, but the locks will be dropped
* first. Set the mm_struct as unstable is advisable as it is

View File

@@ -1740,7 +1740,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return -EFAULT;
if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta))
return -EAGAIN;
if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))

View File

@@ -1429,6 +1429,7 @@ __always_inline bool __free_pages_prepare(struct page *page,
page_cpupid_reset_last(page);
page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
page->private = 0;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);

View File

@@ -913,9 +913,11 @@ static bool folio_referenced_one(struct folio *folio,
struct folio_referenced_arg *pra = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
int ptes = 0, referenced = 0;
unsigned int nr;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
nr = 1;
if (vma->vm_flags & VM_LOCKED) {
ptes++;
@@ -960,9 +962,21 @@ static bool folio_referenced_one(struct folio *folio,
if (lru_gen_look_around(&pvmw))
referenced++;
} else if (pvmw.pte) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte))
if (folio_test_large(folio)) {
unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
pte_t pteval = ptep_get(pvmw.pte);
nr = folio_pte_batch(folio, pvmw.pte,
pteval, max_nr);
}
ptes += nr;
if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
referenced++;
/* Skip the batched PTEs */
pvmw.pte += nr - 1;
pvmw.address += (nr - 1) * PAGE_SIZE;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
@@ -972,7 +986,15 @@ static bool folio_referenced_one(struct folio *folio,
WARN_ON_ONCE(1);
}
pra->mapcount--;
pra->mapcount -= nr;
/*
* If we are sure that we batched the entire folio,
* we can just optimize and stop right here.
*/
if (ptes == pvmw.nr_pages) {
page_vma_mapped_walk_done(&pvmw);
break;
}
}
if (referenced)
@@ -1923,12 +1945,16 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
end_addr = pmd_addr_end(addr, vma->vm_end);
max_nr = (end_addr - addr) >> PAGE_SHIFT;
/* We only support lazyfree batching for now ... */
if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
/* We only support lazyfree or file folios batching for now ... */
if (folio_test_anon(folio) && folio_test_swapbacked(folio))
return 1;
if (pte_unused(pte))
return 1;
if (userfaultfd_wp(vma))
return 1;
return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
}
@@ -2291,7 +2317,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*
* See Documentation/mm/mmu_notifier.rst
*/
dec_mm_counter(mm, mm_counter_file(folio));
add_mm_counter(mm, mm_counter_file(folio), -nr_pages);
}
discard:
if (unlikely(folio_test_hugetlb(folio))) {

View File

@@ -122,13 +122,12 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
const unsigned long len = vma_desc_size(desc);
if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT))
return -EINVAL;
if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT);
if (!mlock_future_ok(desc->mm, /*is_vma_locked=*/ true, len))
return -EAGAIN;
desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
desc->vm_ops = &secretmem_vm_ops;
return 0;

View File

@@ -3062,9 +3062,9 @@ static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
}
static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
struct super_block *sb,
struct inode *dir, umode_t mode,
dev_t dev, unsigned long flags)
struct super_block *sb,
struct inode *dir, umode_t mode,
dev_t dev, vma_flags_t flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -3092,7 +3092,8 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
spin_lock_init(&info->lock);
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT)
? SHMEM_F_NORESERVE : 0;
info->i_crtime = inode_get_mtime(inode);
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
@@ -3145,7 +3146,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
#ifdef CONFIG_TMPFS_QUOTA
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
struct super_block *sb, struct inode *dir,
umode_t mode, dev_t dev, unsigned long flags)
umode_t mode, dev_t dev, vma_flags_t flags)
{
int err;
struct inode *inode;
@@ -3171,9 +3172,9 @@ errout:
return ERR_PTR(err);
}
#else
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
struct super_block *sb, struct inode *dir,
umode_t mode, dev_t dev, unsigned long flags)
umode_t mode, dev_t dev, vma_flags_t flags)
{
return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
}
@@ -3880,7 +3881,8 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
return -EINVAL;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev,
mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -3915,7 +3917,8 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct inode *inode;
int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0,
mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto err_out;
@@ -4112,7 +4115,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
return -ENAMETOOLONG;
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5113,7 +5116,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif /* CONFIG_TMPFS_QUOTA */
inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
S_IFDIR | sbinfo->mode, 0,
mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto failed;
@@ -5814,7 +5818,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
struct super_block *sb, struct inode *dir,
umode_t mode, dev_t dev, unsigned long flags)
umode_t mode, dev_t dev, vma_flags_t flags)
{
struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
return inode ? inode : ERR_PTR(-ENOSPC);
@@ -5825,10 +5829,11 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
/* common code */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
loff_t size, unsigned long vm_flags,
loff_t size, vma_flags_t flags,
unsigned int i_flags)
{
unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
const unsigned long shmem_flags =
vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0;
struct inode *inode;
struct file *res;
@@ -5841,13 +5846,13 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
if (shmem_acct_size(flags, size))
if (shmem_acct_size(shmem_flags, size))
return ERR_PTR(-ENOMEM);
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, vm_flags);
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
shmem_unacct_size(shmem_flags, size);
return ERR_CAST(inode);
}
inode->i_flags |= i_flags;
@@ -5870,9 +5875,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
* checks are provided at the key or shm level rather than the inode.
* @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
* @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
*/
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
struct file *shmem_kernel_file_setup(const char *name, loff_t size,
vma_flags_t flags)
{
return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
@@ -5882,9 +5888,9 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
* shmem_file_setup - get an unlinked file living in tmpfs
* @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
* @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
*/
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags)
{
return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
@@ -5895,16 +5901,17 @@ EXPORT_SYMBOL_GPL(shmem_file_setup);
* @mnt: the tmpfs mount where the file will be created
* @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
* @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
*/
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
loff_t size, unsigned long flags)
loff_t size, vma_flags_t flags)
{
return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
static struct file *__shmem_zero_setup(unsigned long start, unsigned long end,
vma_flags_t flags)
{
loff_t size = end - start;
@@ -5914,7 +5921,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
*/
return shmem_kernel_file_setup("dev/zero", size, vm_flags);
return shmem_kernel_file_setup("dev/zero", size, flags);
}
/**
@@ -5924,7 +5931,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -5945,7 +5952,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
*/
int shmem_zero_setup_desc(struct vm_area_desc *desc)
{
struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags);
if (IS_ERR(file))
return PTR_ERR(file);

View File

@@ -1154,7 +1154,7 @@ int __compat_vma_mmap(const struct file_operations *f_op,
.pgoff = vma->vm_pgoff,
.vm_file = vma->vm_file,
.vm_flags = vma->vm_flags,
.vma_flags = vma->flags,
.page_prot = vma->vm_page_prot,
.action.type = MMAP_NOTHING, /* Default */

View File

@@ -15,7 +15,10 @@ struct mmap_state {
unsigned long end;
pgoff_t pgoff;
unsigned long pglen;
vm_flags_t vm_flags;
union {
vm_flags_t vm_flags;
vma_flags_t vma_flags;
};
struct file *file;
pgprot_t page_prot;
@@ -472,19 +475,16 @@ void remove_vma(struct vm_area_struct *vma)
*
* Called with the mm semaphore held.
*/
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next)
void unmap_region(struct unmap_desc *unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct mm_struct *mm = unmap->first->vm_mm;
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end);
mas_set(mas, vma->vm_end);
free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING,
/* mm_wr_locked = */ true);
unmap_vmas(&tlb, unmap);
mas_set(unmap->mas, unmap->tree_reset);
free_pgtables(&tlb, unmap);
tlb_finish_mmu(&tlb);
}
@@ -1256,26 +1256,32 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
struct ma_state *mas_detach, bool mm_wr_locked)
{
struct mmu_gather tlb;
struct unmap_desc unmap = {
.mas = mas_detach,
.first = vms->vma,
/* start and end may be different if there is no prev or next vma. */
.pg_start = vms->unmap_start,
.pg_end = vms->unmap_end,
.vma_start = vms->start,
.vma_end = vms->end,
/*
* The tree limits and reset differ from the normal case since it's a
* side-tree
*/
.tree_reset = 1,
.tree_end = vms->vma_count,
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
*/
.mm_wr_locked = mm_wr_locked,
};
if (!vms->clear_ptes) /* Nothing to do */
return;
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
vms->vma_count);
mas_set(mas_detach, 1);
/* start and end may be different if there is no prev or next vma. */
free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
vms->unmap_end, mm_wr_locked);
tlb_finish_mmu(&tlb);
unmap_region(&unmap);
vms->clear_ptes = false;
}
@@ -2366,7 +2372,7 @@ static void set_desc_from_map(struct vm_area_desc *desc,
desc->pgoff = map->pgoff;
desc->vm_file = map->file;
desc->vm_flags = map->vm_flags;
desc->vma_flags = map->vma_flags;
desc->page_prot = map->page_prot;
}
@@ -2461,13 +2467,14 @@ static int __mmap_new_file_vma(struct mmap_state *map,
error = mmap_file(vma->vm_file, vma);
if (error) {
UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end,
map->prev, map->next);
fput(vma->vm_file);
vma->vm_file = NULL;
vma_iter_set(vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
unmap_region(&vmi->mas, vma, map->prev, map->next);
unmap_region(&unmap);
return error;
}
@@ -2646,7 +2653,7 @@ static int call_mmap_prepare(struct mmap_state *map,
map->file_doesnt_need_get = true;
map->file = desc->vm_file;
}
map->vm_flags = desc->vm_flags;
map->vma_flags = desc->vma_flags;
map->page_prot = desc->page_prot;
/* User-defined fields. */
map->vm_ops = desc->vm_ops;
@@ -2819,7 +2826,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -EINVAL;
/* Map writable and ensure this isn't a sealed memfd. */
if (file && is_shared_maywrite(vm_flags)) {
if (file && is_shared_maywrite_vm_flags(vm_flags)) {
int error = mapping_map_writable(file->f_mapping);
if (error)
@@ -3049,7 +3056,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* mlock limit tests */
if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT))
return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */

View File

@@ -155,6 +155,72 @@ struct vma_merge_struct {
};
/*
 * struct unmap_desc - parameters for tearing down a range of vmas and
 * freeing the page tables that become unused once they are gone.
 *
 * Consumed by unmap_region() and vms_clear_ptes(): the vma_* limits bound
 * the address range handed to unmap_vmas(), the pg_* limits act as
 * floor/ceiling for free_pgtables(), and @mas is rewound to @tree_reset
 * between the two passes.
 */
struct unmap_desc {
struct ma_state *mas; /* maple state pointing at the first vma */
struct vm_area_struct *first; /* The first vma */
unsigned long pg_start; /* The first pagetable address to free (floor) */
unsigned long pg_end; /* The last pagetable address to free (ceiling) */
unsigned long vma_start; /* The min vma address */
unsigned long vma_end; /* The max vma address */
unsigned long tree_end; /* Maximum for the vma tree search */
unsigned long tree_reset; /* Where to reset the vma tree walk */
bool mm_wr_locked; /* If the mmap write lock is held */
};
/*
 * unmap_all_init() - Initialize @unmap to remove every vma in the tree.
 * @unmap: descriptor to fill in.
 * @vmi: iterator over the mm's vma tree; its maple state is used directly.
 * @vma: the first vma to be unmapped.
 *
 * The vma range and tree search limits are widened to cover the entire
 * address space (0 .. ULONG_MAX), while pg_start/pg_end are clamped to
 * FIRST_USER_ADDRESS / USER_PGTABLES_CEILING so that page table freeing
 * stays within a safe user-range window.
 *
 * NOTE(review): mm_wr_locked is initialized to false — callers of the
 * remove-all path apparently free page tables without the mmap write
 * lock; confirm against the call sites.
 */
static inline void unmap_all_init(struct unmap_desc *unmap,
struct vma_iterator *vmi, struct vm_area_struct *vma)
{
unmap->mas = &vmi->mas;
unmap->first = vma;
unmap->pg_start = FIRST_USER_ADDRESS;
unmap->pg_end = USER_PGTABLES_CEILING;
unmap->vma_start = 0;
unmap->vma_end = ULONG_MAX;
unmap->tree_end = ULONG_MAX;
/* Resume the tree walk after the first vma on the second pass. */
unmap->tree_reset = vma->vm_end;
unmap->mm_wr_locked = false;
}
/*
 * unmap_pgtable_init() - Re-limit an already-populated @unmap so that the
 * vma walk and tree search cover the full user page table range.
 * @unmap: descriptor previously filled in (tree_reset must be valid).
 * @vmi: iterator whose position is reset to @unmap->tree_reset.
 *
 * ARM can have mappings outside of vmas.
 * See: e2cdef8c847b4 ("[PATCH] freepgt: free_pgtables from FIRST_USER_ADDRESS")
 *
 * ARM LPAE uses page table mappings beyond the USER_PGTABLES_CEILING
 * See: CONFIG_ARM_LPAE in arch/arm/include/asm/pgtable.h
 */
static inline void unmap_pgtable_init(struct unmap_desc *unmap,
struct vma_iterator *vmi)
{
/* Rewind the iterator before widening the search window. */
vma_iter_set(vmi, unmap->tree_reset);
unmap->vma_start = FIRST_USER_ADDRESS;
unmap->vma_end = USER_PGTABLES_CEILING;
unmap->tree_end = USER_PGTABLES_CEILING;
}
/*
 * UNMAP_STATE() - declare and initialize a struct unmap_desc named @name
 * for unmapping [@_vma_start, @_vma_end) beginning at @_vma.
 *
 * Page tables are freeable from the previous vma's end (or
 * FIRST_USER_ADDRESS when there is no @_prev) up to the next vma's start
 * (or USER_PGTABLES_CEILING when there is no @_next).  The mmap write
 * lock is assumed held (mm_wr_locked = true) — suitable for the
 * mmap_region() error-unwind path.
 */
#define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \
struct unmap_desc name = { \
.mas = &(_vmi)->mas, \
.first = _vma, \
.pg_start = _prev ? ((struct vm_area_struct *)_prev)->vm_end : \
FIRST_USER_ADDRESS, \
.pg_end = _next ? ((struct vm_area_struct *)_next)->vm_start : \
USER_PGTABLES_CEILING, \
.vma_start = _vma_start, \
.vma_end = _vma_end, \
.tree_end = _next ? \
((struct vm_area_struct *)_next)->vm_start : \
USER_PGTABLES_CEILING, \
.tree_reset = _vma->vm_end, \
.mm_wr_locked = true, \
}
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
{
return vmg->state == VMA_MERGE_ERROR_NOMEM;
@@ -243,8 +309,7 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma,
vma->vm_pgoff = desc->pgoff;
if (desc->vm_file != vma->vm_file)
vma_set_file(vma, desc->vm_file);
if (desc->vm_flags != vma->vm_flags)
vm_flags_set(vma, desc->vm_flags);
vma->flags = desc->vma_flags;
vma->vm_page_prot = desc->page_prot;
/* User-defined fields. */
@@ -262,9 +327,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
bool unlock);
void remove_vma(struct vm_area_struct *vma);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
void unmap_region(struct unmap_desc *unmap);
/**
* vma_modify_flags() - Perform any necessary split/merge in preparation for

View File

@@ -46,6 +46,7 @@
#include <linux/swap.h>
#include <linux/uprobes.h>
#include <linux/userfaultfd_k.h>
#include <linux/pgtable.h>
#include <asm/current.h>
#include <asm/tlb.h>

View File

@@ -343,19 +343,21 @@ static void flush_reclaim_state(struct scan_control *sc)
static bool can_demote(int nid, struct scan_control *sc,
struct mem_cgroup *memcg)
{
int demotion_nid;
struct pglist_data *pgdat = NODE_DATA(nid);
nodemask_t allowed_mask;
if (!numa_demotion_enabled)
if (!pgdat || !numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
demotion_nid = next_demotion_node(nid);
if (demotion_nid == NUMA_NO_NODE)
node_get_allowed_targets(pgdat, &allowed_mask);
if (nodes_empty(allowed_mask))
return false;
/* If demotion node isn't in the cgroup's mems_allowed, fall back */
return mem_cgroup_node_allowed(memcg, demotion_nid);
/* Filter out nodes that are not in cgroup's mems_allowed. */
mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
return !nodes_empty(allowed_mask);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -1017,9 +1019,10 @@ static struct folio *alloc_demote_folio(struct folio *src,
* Folios which are not demoted are left on @demote_folios.
*/
static unsigned int demote_folio_list(struct list_head *demote_folios,
struct pglist_data *pgdat)
struct pglist_data *pgdat,
struct mem_cgroup *memcg)
{
int target_nid = next_demotion_node(pgdat->node_id);
int target_nid;
unsigned int nr_succeeded;
nodemask_t allowed_mask;
@@ -1031,7 +1034,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
.nmask = &allowed_mask,
.reason = MR_DEMOTION,
};
@@ -1039,10 +1041,17 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
if (list_empty(demote_folios))
return 0;
if (target_nid == NUMA_NO_NODE)
node_get_allowed_targets(pgdat, &allowed_mask);
mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
if (nodes_empty(allowed_mask))
return 0;
node_get_allowed_targets(pgdat, &allowed_mask);
target_nid = next_demotion_node(pgdat->node_id, &allowed_mask);
if (target_nid == NUMA_NO_NODE)
/* No lower-tier nodes or nodes were hot-unplugged. */
return 0;
mtc.nid = target_nid;
/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1564,7 +1573,7 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
nr_demoted = demote_folio_list(&demote_folios, pgdat);
nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */