
Significant patch series in this pull request:

- The 4 patch series "mseal cleanups" from Lorenzo Stoakes performs some
   mseal cleanup with no intended functional change.
 
 - The 3 patch series "Optimizations for khugepaged" from David
   Hildenbrand improves khugepaged throughput by batching PTE operations
   for large folios.  This gain is mainly for arm64.
 
 - The 8 patch series "x86: enable EXECMEM_ROX_CACHE for ftrace and
   kprobes" from Mike Rapoport provides a bugfix, additional debug code and
   cleanups to the execmem code.
 
 - The 7 patch series "mm/shmem, swap: bugfix and improvement of mTHP
   swap in" from Kairui Song provides bugfixes, cleanups and performance
   improvements to the mTHP swapin code.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCaI+6HQAKCRDdBJ7gKXxA
 jv7lAQCAKE5dUhdZ0pOYbhBKTlDapQh2KqHrlV3QFcxXgknEoQD/c3gG01rY3fLh
 Cnf5l9+cdyfKxFniO48sUPx6IpriRg8=
 =HT5/
 -----END PGP SIGNATURE-----

Merge tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:
 "Significant patch series in this pull request:

   - "mseal cleanups" (Lorenzo Stoakes)

     Some mseal cleanup with no intended functional change.

   - "Optimizations for khugepaged" (David Hildenbrand)

     Improve khugepaged throughput by batching PTE operations for large
     folios. This gain is mainly for arm64.

   - "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes" (Mike Rapoport)

     A bugfix, additional debug code and cleanups to the execmem code.

   - "mm/shmem, swap: bugfix and improvement of mTHP swap in" (Kairui Song)

     Bugfixes, cleanups and performance improvements to the mTHP swapin
     code"

* tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (38 commits)
  mm: mempool: fix crash in mempool_free() for zero-minimum pools
  mm: correct type for vmalloc vm_flags fields
  mm/shmem, swap: fix major fault counting
  mm/shmem, swap: rework swap entry and index calculation for large swapin
  mm/shmem, swap: simplify swapin path and result handling
  mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO
  mm/shmem, swap: tidy up swap entry splitting
  mm/shmem, swap: tidy up THP swapin checks
  mm/shmem, swap: avoid redundant Xarray lookup during swapin
  x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations
  x86/kprobes: enable EXECMEM_ROX_CACHE for kprobes allocations
  execmem: drop writable parameter from execmem_fill_trapping_insns()
  execmem: add fallback for failures in vmalloc(VM_ALLOW_HUGE_VMAP)
  execmem: move execmem_force_rw() and execmem_restore_rox() before use
  execmem: rework execmem_cache_free()
  execmem: introduce execmem_alloc_rw()
  execmem: drop unused execmem_update_copy()
  mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped
  mm/rmap: add anon_vma lifetime debug check
  mm: remove mm/io-mapping.c
  ...
Linus Torvalds 2025-08-05 16:02:07 +03:00
commit da23ea194d
42 changed files with 1086 additions and 517 deletions

View File

@ -133,4 +133,3 @@ More Memory Management Functions
.. kernel-doc:: mm/mmu_notifier.c .. kernel-doc:: mm/mmu_notifier.c
.. kernel-doc:: mm/balloon_compaction.c .. kernel-doc:: mm/balloon_compaction.c
.. kernel-doc:: mm/huge_memory.c .. kernel-doc:: mm/huge_memory.c
.. kernel-doc:: mm/io-mapping.c

View File

@ -721,7 +721,7 @@ void mark_rodata_ro(void)
static void __init declare_vma(struct vm_struct *vma, static void __init declare_vma(struct vm_struct *vma,
void *va_start, void *va_end, void *va_start, void *va_end,
vm_flags_t vm_flags) unsigned long vm_flags)
{ {
phys_addr_t pa_start = __pa_symbol(va_start); phys_addr_t pa_start = __pa_symbol(va_start);
unsigned long size = va_end - va_start; unsigned long size = va_end - va_start;
@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init);
pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, unsigned int nr) pte_t *ptep, unsigned int nr)
{ {
pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0); pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
/* /*

View File

@ -120,7 +120,7 @@ struct its_array its_pages;
static void *__its_alloc(struct its_array *pages) static void *__its_alloc(struct its_array *pages)
{ {
void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
if (!page) if (!page)
return NULL; return NULL;
@ -237,7 +237,6 @@ static void *its_alloc(void)
if (!page) if (!page)
return NULL; return NULL;
execmem_make_temp_rw(page, PAGE_SIZE);
if (pages == &its_pages) if (pages == &its_pages)
set_memory_x((unsigned long)page, 1); set_memory_x((unsigned long)page, 1);

View File

@ -263,7 +263,7 @@ void arch_ftrace_update_code(int command)
static inline void *alloc_tramp(unsigned long size) static inline void *alloc_tramp(unsigned long size)
{ {
return execmem_alloc(EXECMEM_FTRACE, size); return execmem_alloc_rw(EXECMEM_FTRACE, size);
} }
static inline void tramp_free(void *tramp) static inline void tramp_free(void *tramp)
{ {

View File

@ -481,24 +481,6 @@ static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
return len; return len;
} }
/* Make page to RO mode when allocate it */
void *alloc_insn_page(void)
{
void *page;
page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!page)
return NULL;
/*
* TODO: Once additional kernel code protection mechanisms are set, ensure
* that the page was not maliciously altered and it is still zeroed.
*/
set_memory_rox((unsigned long)page, 1);
return page;
}
/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)

View File

@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void)
static struct execmem_info execmem_info __ro_after_init; static struct execmem_info execmem_info __ro_after_init;
#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) void execmem_fill_trapping_insns(void *ptr, size_t size)
{ {
/* fill memory with INT3 instructions */
if (writeable)
memset(ptr, INT3_INSN_OPCODE, size); memset(ptr, INT3_INSN_OPCODE, size);
else
text_poke_set(ptr, INT3_INSN_OPCODE, size);
} }
#endif #endif
@ -1102,7 +1098,21 @@ struct execmem_info __init *execmem_arch_setup(void)
.pgprot = pgprot, .pgprot = pgprot,
.alignment = MODULE_ALIGN, .alignment = MODULE_ALIGN,
}, },
[EXECMEM_KPROBES ... EXECMEM_BPF] = { [EXECMEM_KPROBES] = {
.flags = flags,
.start = start,
.end = MODULES_END,
.pgprot = PAGE_KERNEL_ROX,
.alignment = MODULE_ALIGN,
},
[EXECMEM_FTRACE] = {
.flags = flags,
.start = start,
.end = MODULES_END,
.pgprot = pgprot,
.alignment = MODULE_ALIGN,
},
[EXECMEM_BPF] = {
.flags = EXECMEM_KASAN_SHADOW, .flags = EXECMEM_KASAN_SHADOW,
.start = start, .start = start,
.end = MODULES_END, .end = MODULES_END,

View File

@ -60,27 +60,11 @@ enum execmem_range_flags {
* will trap * will trap
* @ptr: pointer to memory to fill * @ptr: pointer to memory to fill
* @size: size of the range to fill * @size: size of the range to fill
* @writable: is the memory poited by @ptr is writable or ROX
* *
* A hook for architecures to fill execmem ranges with invalid instructions. * A hook for architecures to fill execmem ranges with invalid instructions.
* Architectures that use EXECMEM_ROX_CACHE must implement this. * Architectures that use EXECMEM_ROX_CACHE must implement this.
*/ */
void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); void execmem_fill_trapping_insns(void *ptr, size_t size);
/**
* execmem_make_temp_rw - temporarily remap region with read-write
* permissions
* @ptr: address of the region to remap
* @size: size of the region to remap
*
* Remaps a part of the cached large page in the ROX cache in the range
* [@ptr, @ptr + @size) as writable and not executable. The caller must
* have exclusive ownership of this range and ensure nothing will try to
* execute code in this range.
*
* Return: 0 on success or negative error code on failure.
*/
int execmem_make_temp_rw(void *ptr, size_t size);
/** /**
* execmem_restore_rox - restore read-only-execute permissions * execmem_restore_rox - restore read-only-execute permissions
@ -95,7 +79,6 @@ int execmem_make_temp_rw(void *ptr, size_t size);
*/ */
int execmem_restore_rox(void *ptr, size_t size); int execmem_restore_rox(void *ptr, size_t size);
#else #else
static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
#endif #endif
@ -165,6 +148,28 @@ struct execmem_info *execmem_arch_setup(void);
*/ */
void *execmem_alloc(enum execmem_type type, size_t size); void *execmem_alloc(enum execmem_type type, size_t size);
/**
* execmem_alloc_rw - allocate writable executable memory
* @type: type of the allocation
* @size: how many bytes of memory are required
*
* Allocates memory that will contain executable code, either generated or
* loaded from kernel modules.
*
* Allocates memory that will contain data coupled with executable code,
* like data sections in kernel modules.
*
* Forces writable permissions on the allocated memory and the caller is
* responsible to manage the permissions afterwards.
*
* For architectures that use ROX cache the permissions will be set to R+W.
* For architectures that don't use ROX cache the default permissions for @type
* will be used as they must be writable.
*
* Return: a pointer to the allocated memory or %NULL
*/
void *execmem_alloc_rw(enum execmem_type type, size_t size);
/** /**
* execmem_free - free executable memory * execmem_free - free executable memory
* @ptr: pointer to the memory that should be freed * @ptr: pointer to the memory that should be freed
@ -185,19 +190,6 @@ DEFINE_FREE(execmem, void *, if (_T) execmem_free(_T));
struct vm_struct *execmem_vmap(size_t size); struct vm_struct *execmem_vmap(size_t size);
#endif #endif
/**
* execmem_update_copy - copy an update to executable memory
* @dst: destination address to update
* @src: source address containing the data
* @size: how many bytes of memory shold be copied
*
* Copy @size bytes from @src to @dst using text poking if the memory at
* @dst is read-only.
*
* Return: a pointer to @dst or NULL on error
*/
void *execmem_update_copy(void *dst, const void *src, size_t size);
/** /**
* execmem_is_rox - check if execmem is read-only * execmem_is_rox - check if execmem is read-only
* @type - the execmem type to check * @type - the execmem type to check
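
The kerneldoc above spells out the execmem_alloc_rw() contract: the memory comes back writable and the caller owns the permissions from then on. A minimal sketch of how a caller might combine it with the other helpers visible in this diff (execmem_is_rox(), execmem_restore_rox(), execmem_free()); load_text() and the plain memcpy() are illustrative, not part of this patch set:

/* Illustrative only: allocate writable, fill, then drop write permission
 * again if this range is backed by the ROX cache. */
static void *load_text(enum execmem_type type, const void *insns, size_t size)
{
        void *p = execmem_alloc_rw(type, size);         /* returned R+W */

        if (!p)
                return NULL;

        memcpy(p, insns, size);                         /* plain stores, no text_poke */

        if (execmem_is_rox(type) && execmem_restore_rox(p, size)) {
                execmem_free(p);
                return NULL;
        }
        /* For non-ROX ranges the caller still sets its final protections. */
        return p;
}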

View File

@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap)
kfree(iomap); kfree(iomap);
} }
int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
unsigned long addr, unsigned long pfn, unsigned long size);
#endif /* _LINUX_IO_MAPPING_H */ #endif /* _LINUX_IO_MAPPING_H */

View File

@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp);
#endif #endif
#ifdef CONFIG_64BIT #ifdef CONFIG_64BIT
/* VM is sealed, in vm_flags */ #define VM_SEALED_BIT 42
#define VM_SEALED _BITUL(63) #define VM_SEALED BIT(VM_SEALED_BIT)
#else
#define VM_SEALED VM_NONE
#endif #endif
/* Bits set in the VMA until the stack is in its final location */ /* Bits set in the VMA until the stack is in its final location */

View File

@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w);
#include <linux/tracepoint-defs.h> #include <linux/tracepoint-defs.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/cleanup.h> #include <linux/cleanup.h>
#include <linux/sched/mm.h>
#define MMAP_LOCK_INITIALIZER(name) \ #define MMAP_LOCK_INITIALIZER(name) \
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
* reused and attached to a different mm before we lock it. * reused and attached to a different mm before we lock it.
* Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
* detached. * detached.
*
* WARNING! The vma passed to this function cannot be used if the function
* fails to lock it because in certain cases RCU lock is dropped and then
* reacquired. Once RCU lock is dropped the vma can be concurently freed.
*/ */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
struct vm_area_struct *vma) struct vm_area_struct *vma)
@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
} }
rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
/*
* If vma got attached to another mm from under us, that mm is not
* stable and can be freed in the narrow window after vma->vm_refcnt
* is dropped and before rcuwait_wake_up(mm) is called. Grab it before
* releasing vma->vm_refcnt.
*/
if (unlikely(vma->vm_mm != mm)) {
/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
struct mm_struct *other_mm = vma->vm_mm;
/*
* __mmdrop() is a heavy operation and we don't need RCU
* protection here. Release RCU lock during these operations.
* We reinstate the RCU read lock as the caller expects it to
* be held when this function returns even on error.
*/
rcu_read_unlock();
mmgrab(other_mm);
vma_refcount_put(vma);
mmdrop(other_mm);
rcu_read_lock();
return NULL;
}
/* /*
* Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
* False unlocked result is impossible because we modify and check * False unlocked result is impossible because we modify and check

View File

@ -837,8 +837,6 @@ void set_page_writeback(struct page *page);
#define folio_start_writeback(folio) \ #define folio_start_writeback(folio) \
__folio_start_writeback(folio, false) __folio_start_writeback(folio, false)
#define folio_start_writeback_keepwrite(folio) \
__folio_start_writeback(folio, true)
static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline bool folio_test_head(const struct folio *folio)
{ {

View File

@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
} }
#endif #endif
/**
* get_and_clear_ptes - Clear present PTEs that map consecutive pages of
* the same folio, collecting dirty/accessed bits.
* @mm: Address space the pages are mapped into.
* @addr: Address the first page is mapped at.
* @ptep: Page table pointer for the first entry.
* @nr: Number of entries to clear.
*
* Use this instead of get_and_clear_full_ptes() if it is known that we don't
* need to clear the full mm, which is mostly the case.
*
* Note that PTE bits in the PTE range besides the PFN can differ. For example,
* some PTEs might be write-protected.
*
* Context: The caller holds the page table lock. The PTEs map consecutive
* pages that belong to the same folio. The PTEs are all in the same PMD.
*/
static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr)
{
return get_and_clear_full_ptes(mm, addr, ptep, nr, 0);
}
#ifndef clear_full_ptes #ifndef clear_full_ptes
/** /**
* clear_full_ptes - Clear present PTEs that map consecutive pages of the same * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
} }
#endif #endif
/**
* clear_ptes - Clear present PTEs that map consecutive pages of the same folio.
* @mm: Address space the pages are mapped into.
* @addr: Address the first page is mapped at.
* @ptep: Page table pointer for the first entry.
* @nr: Number of entries to clear.
*
* Use this instead of clear_full_ptes() if it is known that we don't need to
* clear the full mm, which is mostly the case.
*
* Note that PTE bits in the PTE range besides the PFN can differ. For example,
* some PTEs might be write-protected.
*
* Context: The caller holds the page table lock. The PTEs map consecutive
* pages that belong to the same folio. The PTEs are all in the same PMD.
*/
static inline void clear_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr)
{
clear_full_ptes(mm, addr, ptep, nr, 0);
}
/* /*
* If two threads concurrently fault at the same page, the thread that * If two threads concurrently fault at the same page, the thread that
* won the race updates the PTE and its local TLB/Cache. The other thread * won the race updates the PTE and its local TLB/Cache. The other thread
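
get_and_clear_ptes() and clear_ptes() added above are thin conveniences over the *_full_ptes() variants with full == 0; later hunks in this merge (mm/mremap.c, mm/rmap.c, arch/arm64) convert callers accordingly. A minimal sketch of a converted caller, assuming the preconditions stated in the kerneldoc (PTE lock held, all entries map consecutive pages of one folio, same PMD); the helper name is hypothetical:

/*
 * Hypothetical caller: clear the nr PTEs mapping one large folio while
 * keeping the accumulated dirty/accessed bits.
 *
 * Before this series the same operation was spelled:
 *     pte = get_and_clear_full_ptes(mm, addr, ptep, nr, 0);
 */
static pte_t clear_folio_batch(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep, unsigned int nr)
{
        return get_and_clear_ptes(mm, addr, ptep, nr);
}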

View File

@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
default: default:
VM_WARN_ON_ONCE(true); VM_WARN_ON_ONCE(true);
} }
/*
* Anon folios must have an associated live anon_vma as long as they're
* mapped into userspace.
* Note that the atomic_read() mainly does two things:
*
* 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
* check that the associated anon_vma has not yet been freed (subject
* to KASAN's usual limitations). This check will pass if the
* anon_vma's refcount has already dropped to 0 but an RCU grace
* period hasn't passed since then.
* 2. If the anon_vma has not yet been freed, it checks that the
* anon_vma still has a nonzero refcount (as opposed to being in the
* middle of an RCU delay for getting freed).
*/
if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
unsigned long mapping = (unsigned long)folio->mapping;
struct anon_vma *anon_vma;
anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
}
} }
/* /*

View File

@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) { for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]); long x = percpu_counter_sum(&mm->rss_stat[i]);
if (unlikely(x)) if (unlikely(x)) {
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
mm, resident_page_types[i], x); mm, resident_page_types[i], x,
current->comm,
task_pid_nr(current));
}
} }
if (mm_pgtables_bytes(mm)) if (mm_pgtables_bytes(mm))

View File

@ -1322,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
else else
execmem_type = EXECMEM_MODULE_TEXT; execmem_type = EXECMEM_MODULE_TEXT;
ptr = execmem_alloc(execmem_type, size); ptr = execmem_alloc_rw(execmem_type, size);
if (!ptr) if (!ptr)
return -ENOMEM; return -ENOMEM;
if (execmem_is_rox(execmem_type)) { mod->mem[type].is_rox = execmem_is_rox(execmem_type);
int err = execmem_make_temp_rw(ptr, size);
if (err) {
execmem_free(ptr);
return -ENOMEM;
}
mod->mem[type].is_rox = true;
}
/* /*
* The pointer to these blocks of memory are stored on the module * The pointer to these blocks of memory are stored on the module

View File

@ -1242,10 +1242,6 @@ config KMAP_LOCAL
config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
bool bool
# struct io_mapping based helper. Selected by drivers that need them
config IO_MAPPING
bool
config MEMFD_CREATE config MEMFD_CREATE
bool "Enable memfd_create() system call" if EXPERT bool "Enable memfd_create() system call" if EXPERT

View File

@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o

View File

@ -711,6 +711,10 @@ static void damos_va_migrate_dests_add(struct folio *folio,
target -= dests->weight_arr[i]; target -= dests->weight_arr[i];
} }
/* If the folio is already in the right node, don't do anything */
if (folio_nid(folio) == dests->node_id_arr[i])
return;
isolate: isolate:
if (!folio_isolate_lru(folio)) if (!folio_isolate_lru(folio))
return; return;

View File

@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init;
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
static void *execmem_vmalloc(struct execmem_range *range, size_t size, static void *execmem_vmalloc(struct execmem_range *range, size_t size,
pgprot_t pgprot, vm_flags_t vm_flags) pgprot_t pgprot, unsigned long vm_flags)
{ {
bool kasan = range->flags & EXECMEM_KASAN_SHADOW; bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size)
} }
#else #else
static void *execmem_vmalloc(struct execmem_range *range, size_t size, static void *execmem_vmalloc(struct execmem_range *range, size_t size,
pgprot_t pgprot, vm_flags_t vm_flags) pgprot_t pgprot, unsigned long vm_flags)
{ {
return vmalloc(size); return vmalloc(size);
} }
@ -93,8 +93,15 @@ struct execmem_cache {
struct mutex mutex; struct mutex mutex;
struct maple_tree busy_areas; struct maple_tree busy_areas;
struct maple_tree free_areas; struct maple_tree free_areas;
unsigned int pending_free_cnt; /* protected by mutex */
}; };
/* delay to schedule asynchronous free if fast path free fails */
#define FREE_DELAY (msecs_to_jiffies(10))
/* mark entries in busy_areas that should be freed asynchronously */
#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1))
static struct execmem_cache execmem_cache = { static struct execmem_cache execmem_cache = {
.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
@ -130,6 +137,27 @@ err_restore:
return err; return err;
} }
static int execmem_force_rw(void *ptr, size_t size)
{
unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long addr = (unsigned long)ptr;
int ret;
ret = set_memory_nx(addr, nr);
if (ret)
return ret;
return set_memory_rw(addr, nr);
}
int execmem_restore_rox(void *ptr, size_t size)
{
unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long addr = (unsigned long)ptr;
return set_memory_rox(addr, nr);
}
static void execmem_cache_clean(struct work_struct *work) static void execmem_cache_clean(struct work_struct *work)
{ {
struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *free_areas = &execmem_cache.free_areas;
@ -155,20 +183,17 @@ static void execmem_cache_clean(struct work_struct *work)
static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
static int execmem_cache_add(void *ptr, size_t size) static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask)
{ {
struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *free_areas = &execmem_cache.free_areas;
struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr; unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, free_areas, addr - 1, addr + 1); MA_STATE(mas, free_areas, addr - 1, addr + 1);
unsigned long lower, upper; unsigned long lower, upper;
void *area = NULL; void *area = NULL;
int err;
lower = addr; lower = addr;
upper = addr + size - 1; upper = addr + size - 1;
mutex_lock(mutex);
area = mas_walk(&mas); area = mas_walk(&mas);
if (area && mas.last == addr - 1) if (area && mas.last == addr - 1)
lower = mas.index; lower = mas.index;
@ -178,12 +203,14 @@ static int execmem_cache_add(void *ptr, size_t size)
upper = mas.last; upper = mas.last;
mas_set_range(&mas, lower, upper); mas_set_range(&mas, lower, upper);
err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); return mas_store_gfp(&mas, (void *)lower, gfp_mask);
mutex_unlock(mutex); }
if (err)
return err;
return 0; static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask)
{
guard(mutex)(&execmem_cache.mutex);
return execmem_cache_add_locked(ptr, size, gfp_mask);
} }
static bool within_range(struct execmem_range *range, struct ma_state *mas, static bool within_range(struct execmem_range *range, struct ma_state *mas,
@ -256,7 +283,7 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size) static int execmem_cache_populate(struct execmem_range *range, size_t size)
{ {
vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
struct vm_struct *vm; struct vm_struct *vm;
size_t alloc_size; size_t alloc_size;
int err = -ENOMEM; int err = -ENOMEM;
@ -264,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
alloc_size = round_up(size, PMD_SIZE); alloc_size = round_up(size, PMD_SIZE);
p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
if (!p) {
alloc_size = size;
p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
}
if (!p) if (!p)
return err; return err;
@ -272,13 +304,13 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
goto err_free_mem; goto err_free_mem;
/* fill memory with instructions that will trap */ /* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); execmem_fill_trapping_insns(p, alloc_size);
err = set_memory_rox((unsigned long)p, vm->nr_pages); err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err) if (err)
goto err_free_mem; goto err_free_mem;
err = execmem_cache_add(p, alloc_size); err = execmem_cache_add(p, alloc_size, GFP_KERNEL);
if (err) if (err)
goto err_reset_direct_map; goto err_reset_direct_map;
@ -307,57 +339,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
return __execmem_cache_alloc(range, size); return __execmem_cache_alloc(range, size);
} }
static inline bool is_pending_free(void *ptr)
{
return ((unsigned long)ptr & PENDING_FREE_MASK);
}
static inline void *pending_free_set(void *ptr)
{
return (void *)((unsigned long)ptr | PENDING_FREE_MASK);
}
static inline void *pending_free_clear(void *ptr)
{
return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK);
}
static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
{
size_t size = mas_range_len(mas);
int err;
err = execmem_force_rw(ptr, size);
if (err)
return err;
execmem_fill_trapping_insns(ptr, size);
execmem_restore_rox(ptr, size);
err = execmem_cache_add_locked(ptr, size, gfp_mask);
if (err)
return err;
mas_store_gfp(mas, NULL, gfp_mask);
return 0;
}
static void execmem_cache_free_slow(struct work_struct *work);
static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow);
static void execmem_cache_free_slow(struct work_struct *work)
{
struct maple_tree *busy_areas = &execmem_cache.busy_areas;
MA_STATE(mas, busy_areas, 0, ULONG_MAX);
void *area;
guard(mutex)(&execmem_cache.mutex);
if (!execmem_cache.pending_free_cnt)
return;
mas_for_each(&mas, area, ULONG_MAX) {
if (!is_pending_free(area))
continue;
area = pending_free_clear(area);
if (__execmem_cache_free(&mas, area, GFP_KERNEL))
continue;
execmem_cache.pending_free_cnt--;
}
if (execmem_cache.pending_free_cnt)
schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
else
schedule_work(&execmem_cache_clean_work);
}
static bool execmem_cache_free(void *ptr) static bool execmem_cache_free(void *ptr)
{ {
struct maple_tree *busy_areas = &execmem_cache.busy_areas; struct maple_tree *busy_areas = &execmem_cache.busy_areas;
struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr; unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, busy_areas, addr, addr); MA_STATE(mas, busy_areas, addr, addr);
size_t size;
void *area; void *area;
int err;
guard(mutex)(&execmem_cache.mutex);
mutex_lock(mutex);
area = mas_walk(&mas); area = mas_walk(&mas);
if (!area) { if (!area)
mutex_unlock(mutex);
return false; return false;
err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY);
if (err) {
/*
* mas points to exact slot we've got the area from, nothing
* else can modify the tree because of the mutex, so there
* won't be any allocations in mas_store_gfp() and it will just
* change the pointer.
*/
area = pending_free_set(area);
mas_store_gfp(&mas, area, GFP_KERNEL);
execmem_cache.pending_free_cnt++;
schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
return true;
} }
size = mas_range_len(&mas);
mas_store_gfp(&mas, NULL, GFP_KERNEL);
mutex_unlock(mutex);
execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
execmem_cache_add(ptr, size);
schedule_work(&execmem_cache_clean_work); schedule_work(&execmem_cache_clean_work);
return true; return true;
} }
int execmem_make_temp_rw(void *ptr, size_t size)
{
unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long addr = (unsigned long)ptr;
int ret;
ret = set_memory_nx(addr, nr);
if (ret)
return ret;
return set_memory_rw(addr, nr);
}
int execmem_restore_rox(void *ptr, size_t size)
{
unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long addr = (unsigned long)ptr;
return set_memory_rox(addr, nr);
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
/*
* when ROX cache is not used the permissions defined by architectures for
* execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must
* be writable anyway
*/
static inline int execmem_force_rw(void *ptr, size_t size)
{
return 0;
}
static void *execmem_cache_alloc(struct execmem_range *range, size_t size) static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{ {
return NULL; return NULL;
@ -373,9 +465,9 @@ void *execmem_alloc(enum execmem_type type, size_t size)
{ {
struct execmem_range *range = &execmem_info->ranges[type]; struct execmem_range *range = &execmem_info->ranges[type];
bool use_cache = range->flags & EXECMEM_ROX_CACHE; bool use_cache = range->flags & EXECMEM_ROX_CACHE;
vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
pgprot_t pgprot = range->pgprot; pgprot_t pgprot = range->pgprot;
void *p; void *p = NULL;
size = PAGE_ALIGN(size); size = PAGE_ALIGN(size);
@ -387,6 +479,21 @@ void *execmem_alloc(enum execmem_type type, size_t size)
return kasan_reset_tag(p); return kasan_reset_tag(p);
} }
void *execmem_alloc_rw(enum execmem_type type, size_t size)
{
void *p __free(execmem) = execmem_alloc(type, size);
int err;
if (!p)
return NULL;
err = execmem_force_rw(p, size);
if (err)
return NULL;
return no_free_ptr(p);
}
void execmem_free(void *ptr) void execmem_free(void *ptr)
{ {
/* /*
@ -399,11 +506,6 @@ void execmem_free(void *ptr)
vfree(ptr); vfree(ptr);
} }
void *execmem_update_copy(void *dst, const void *src, size_t size)
{
return text_poke_copy(dst, src, size);
}
bool execmem_is_rox(enum execmem_type type) bool execmem_is_rox(enum execmem_type type)
{ {
return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
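
The deferred-free path above never removes a busy area that cannot be freed immediately; it tags the stored pointer instead. Because every cached area is page aligned, bit PAGE_SHIFT - 1 of its address is always zero and can be borrowed as the "pending free" marker, which is all PENDING_FREE_MASK does. A standalone illustration of that tagging trick (plain userspace C, assuming 4 KiB pages; not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT      12                      /* assumed 4 KiB pages */
#define PENDING_FREE    (1UL << (PAGE_SHIFT - 1))

static void *pending_set(void *p)   { return (void *)((uintptr_t)p | PENDING_FREE); }
static void *pending_clear(void *p) { return (void *)((uintptr_t)p & ~PENDING_FREE); }
static int   is_pending(void *p)    { return ((uintptr_t)p & PENDING_FREE) != 0; }

int main(void)
{
        /* A page-aligned allocation has the low PAGE_SHIFT bits clear. */
        void *area = aligned_alloc(1UL << PAGE_SHIFT, 1UL << PAGE_SHIFT);
        void *tagged;

        if (!area)
                return 1;

        tagged = pending_set(area);
        assert(!is_pending(area) && is_pending(tagged));
        assert(pending_clear(tagged) == area);
        printf("area %p tagged %p\n", area, tagged);
        free(area);
        return 0;
}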

View File

@ -1391,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift, unsigned long align, unsigned long shift,
vm_flags_t vm_flags, unsigned long start, unsigned long vm_flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, unsigned long end, int node, gfp_t gfp_mask,
const void *caller); const void *caller);

View File

@ -1,30 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/io-mapping.h>
/**
* io_mapping_map_user - remap an I/O mapping to userspace
* @iomap: the source io_mapping
* @vma: user vma to map to
* @addr: target user address to start at
* @pfn: physical address of kernel memory
* @size: size of map area
*
* Note: this is only safe if the mm semaphore is held when called.
*/
int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
unsigned long addr, unsigned long pfn, unsigned long size)
{
vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
return -EINVAL;
pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
(pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
/* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
}
EXPORT_SYMBOL_GPL(io_mapping_map_user);

View File

@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object,
} }
static inline void poison_slab_object(struct kmem_cache *cache, void *object, static inline void poison_slab_object(struct kmem_cache *cache, void *object,
bool init, bool still_accessible) bool init)
{ {
void *tagged_object = object; void *tagged_object = object;
object = kasan_reset_tag(object); object = kasan_reset_tag(object);
/* RCU slabs could be legally used after free within the RCU period. */
if (unlikely(still_accessible))
return;
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
KASAN_SLAB_FREE, init); KASAN_SLAB_FREE, init);
@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
if (!kasan_arch_is_ready() || is_kfence_address(object)) if (!kasan_arch_is_ready() || is_kfence_address(object))
return false; return false;
poison_slab_object(cache, object, init, still_accessible); /*
* If this point is reached with an object that must still be
* accessible under RCU, we can't poison it; in that case, also skip the
* quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG
* has been disabled manually.
*
* Putting the object on the quarantine wouldn't help catch UAFs (since
* we can't poison it here), and it would mask bugs caused by
* SLAB_TYPESAFE_BY_RCU users not being careful enough about object
* reuse; so overall, putting the object into the quarantine here would
* be counterproductive.
*/
if (still_accessible)
return false;
poison_slab_object(cache, object, init);
/* /*
* If the object is put into quarantine, do not let slab put the object * If the object is put into quarantine, do not let slab put the object
@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
if (check_slab_allocation(slab->slab_cache, ptr, ip)) if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false; return false;
poison_slab_object(slab->slab_cache, ptr, false, false); poison_slab_object(slab->slab_cache, ptr, false);
return true; return true;
} }

View File

@ -700,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl, spinlock_t *ptl,
struct list_head *compound_pagelist) struct list_head *compound_pagelist)
{ {
unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp; struct folio *src, *tmp;
pte_t *_pte;
pte_t pteval; pte_t pteval;
pte_t *_pte;
unsigned int nr_ptes;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
_pte++, address += PAGE_SIZE) { address += nr_ptes * PAGE_SIZE) {
nr_ptes = 1;
pteval = ptep_get(_pte); pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
@ -722,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
struct page *src_page = pte_page(pteval); struct page *src_page = pte_page(pteval);
src = page_folio(src_page); src = page_folio(src_page);
if (!folio_test_large(src))
if (folio_test_large(src)) {
unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
} else {
release_pte_folio(src); release_pte_folio(src);
}
/* /*
* ptl mostly unnecessary, but preempt has to * ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats * be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte(). * inside folio_remove_rmap_pte().
*/ */
spin_lock(ptl); spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte); clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
folio_remove_rmap_pte(src, src_page, vma); folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl); spin_unlock(ptl);
free_folio_and_swap_cache(src); free_swap_cache(src);
folio_put_refs(src, nr_ptes);
} }
} }
@ -1492,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd) bool install_pmd)
{ {
int nr_mapped_ptes = 0, result = SCAN_FAIL;
unsigned int nr_batch_ptes;
struct mmu_notifier_range range; struct mmu_notifier_range range;
bool notified = false; bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK; unsigned long haddr = addr & HPAGE_PMD_MASK;
unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr); struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio; struct folio *folio;
pte_t *start_pte, *pte; pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd; pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl; spinlock_t *pml = NULL, *ptl;
int nr_ptes = 0, result = SCAN_FAIL;
int i; int i;
mmap_assert_locked(mm); mmap_assert_locked(mm);
@ -1614,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort; goto abort;
/* step 2: clear page table and adjust rmap */ /* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte; for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
pte += nr_batch_ptes) {
unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page; struct page *page;
pte_t ptent = ptep_get(pte); pte_t ptent = ptep_get(pte);
nr_batch_ptes = 1;
if (pte_none(ptent)) if (pte_none(ptent))
continue; continue;
/* /*
@ -1632,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort; goto abort;
} }
page = vm_normal_page(vma, addr, ptent); page = vm_normal_page(vma, addr, ptent);
if (folio_page(folio, i) != page) if (folio_page(folio, i) != page)
goto abort; goto abort;
nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
/* /*
* Must clear entry, or a racing truncate may re-remove it. * Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it. * TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only. * PTE dirty? Shmem page is already dirty; file is read-only.
*/ */
ptep_clear(mm, addr, pte); clear_ptes(mm, addr, pte, nr_batch_ptes);
folio_remove_rmap_pte(folio, page, vma); folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
nr_ptes++; nr_mapped_ptes += nr_batch_ptes;
} }
if (!pml) if (!pml)
spin_unlock(ptl); spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */ /* step 3: set proper refcount and mm_counters. */
if (nr_ptes) { if (nr_mapped_ptes) {
folio_ref_sub(folio, nr_ptes); folio_ref_sub(folio, nr_mapped_ptes);
add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
} }
/* step 4: remove empty page table */ /* step 4: remove empty page table */
@ -1684,10 +1704,10 @@ maybe_install_pmd:
: SCAN_SUCCEED; : SCAN_SUCCEED;
goto drop_folio; goto drop_folio;
abort: abort:
if (nr_ptes) { if (nr_mapped_ptes) {
flush_tlb_mm(mm); flush_tlb_mm(mm);
folio_ref_sub(folio, nr_ptes); folio_ref_sub(folio, nr_mapped_ptes);
add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
} }
unlock: unlock:
if (start_pte) if (start_pte)
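
The loops above now advance in folio-sized batches rather than one PTE at a time. A condensed sketch of that pattern, using only helpers that appear in this diff; the swap-cache release, the !large-folio path and the error handling of the real __collapse_huge_page_copy_succeeded() are left out:

/*
 * Condensed sketch of the batched collapse step: walk HPAGE_PMD_NR page
 * table entries and handle each run of PTEs that maps one large folio
 * with a single clear_ptes()/folio_remove_rmap_ptes() call.
 */
static void collapse_clear_ptes(struct vm_area_struct *vma, pte_t *pte,
                                unsigned long address, unsigned long end,
                                spinlock_t *ptl)
{
        unsigned int nr_ptes;
        pte_t *_pte;

        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte += nr_ptes, address += nr_ptes * PAGE_SIZE) {
                pte_t pteval = ptep_get(_pte);
                unsigned int max_nr = (end - address) >> PAGE_SHIFT;
                struct page *page;
                struct folio *folio;

                nr_ptes = 1;
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval)))
                        continue;

                page = pte_page(pteval);
                folio = page_folio(page);
                if (folio_test_large(folio))
                        nr_ptes = folio_pte_batch(folio, _pte, pteval, max_nr);

                spin_lock(ptl);
                clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
                folio_remove_rmap_ptes(folio, page, nr_ptes, vma);
                spin_unlock(ptl);

                folio_put_refs(folio, nr_ptes);
        }
}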

View File

@ -19,6 +19,7 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/ksm.h> #include <linux/ksm.h>
@ -1256,6 +1257,74 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
&guard_remove_walk_ops, NULL); &guard_remove_walk_ops, NULL);
} }
#ifdef CONFIG_64BIT
/* Does the madvise operation result in discarding of mapped data? */
static bool is_discard(int behavior)
{
switch (behavior) {
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
case MADV_REMOVE:
case MADV_DONTFORK:
case MADV_WIPEONFORK:
case MADV_GUARD_INSTALL:
return true;
}
return false;
}
/*
* We are restricted from madvise()'ing mseal()'d VMAs only in very particular
* circumstances - discarding of data from read-only anonymous SEALED mappings.
*
* This is because users cannot trivally discard data from these VMAs, and may
* only do so via an appropriate madvise() call.
*/
static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
{
struct vm_area_struct *vma = madv_behavior->vma;
/* If the VMA isn't sealed we're good. */
if (!vma_is_sealed(vma))
return true;
/* For a sealed VMA, we only care about discard operations. */
if (!is_discard(madv_behavior->behavior))
return true;
/*
* We explicitly permit all file-backed mappings, whether MAP_SHARED or
* MAP_PRIVATE.
*
* The latter causes some complications. Because now, one can mmap()
* read/write a MAP_PRIVATE mapping, write to it, then mprotect()
* read-only, mseal() and a discard will be permitted.
*
* However, in order to avoid issues with potential use of madvise(...,
* MADV_DONTNEED) of mseal()'d .text mappings we, for the time being,
* permit this.
*/
if (!vma_is_anonymous(vma))
return true;
/* If the user could write to the mapping anyway, then this is fine. */
if ((vma->vm_flags & VM_WRITE) &&
arch_vma_access_permitted(vma, /* write= */ true,
/* execute= */ false, /* foreign= */ false))
return true;
/* Otherwise, we are not permitted to perform this operation. */
return false;
}
#else
static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
{
return true;
}
#endif
/* /*
* Apply an madvise behavior to a region of a vma. madvise_update_vma * Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own * will handle splitting a vm area into separate areas, each area with its own
@ -1269,7 +1338,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
struct madvise_behavior_range *range = &madv_behavior->range; struct madvise_behavior_range *range = &madv_behavior->range;
int error; int error;
if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) if (unlikely(!can_madvise_modify(madv_behavior)))
return -EPERM; return -EPERM;
switch (behavior) { switch (behavior) {
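
can_madvise_modify() above only refuses discard operations on sealed, read-only, anonymous mappings. A small userspace illustration of that rule, using the raw syscall in case libc has no mseal() wrapper (the syscall number is assumed, adjust per architecture); MADV_DONTNEED on such a mapping is expected to fail with EPERM:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_mseal
#define __NR_mseal 462          /* assumed syscall number, adjust per arch */
#endif

int main(void)
{
        size_t len = 4096;
        void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        if (syscall(__NR_mseal, p, len, 0) != 0) {
                perror("mseal");        /* kernel without mseal() support */
                return 1;
        }

        /* Discarding data from a sealed read-only anonymous mapping is refused. */
        if (madvise(p, len, MADV_DONTNEED) != 0)
                printf("MADV_DONTNEED on sealed RO anon mapping: %s\n",
                       strerror(errno));        /* expected: EPERM */

        return 0;
}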

View File

@ -837,11 +837,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct mm_walk *walk) struct mm_walk *walk)
{ {
struct hwpoison_walk *hwp = walk->private; struct hwpoison_walk *hwp = walk->private;
pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
struct hstate *h = hstate_vma(walk->vma); struct hstate *h = hstate_vma(walk->vma);
spinlock_t *ptl;
pte_t pte;
int ret;
return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), ptl = huge_pte_lock(h, walk->mm, ptep);
pte = huge_ptep_get(walk->mm, addr, ptep);
ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
hwp->pfn, &hwp->tk); hwp->pfn, &hwp->tk);
spin_unlock(ptl);
return ret;
} }
#else #else
#define hwpoison_hugetlb_range NULL #define hwpoison_hugetlb_range NULL

View File

@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
static __always_inline void add_element(mempool_t *pool, void *element) static __always_inline void add_element(mempool_t *pool, void *element)
{ {
BUG_ON(pool->curr_nr >= pool->min_nr); BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element); poison_element(pool, element);
if (kasan_poison_element(pool, element)) if (kasan_poison_element(pool, element))
pool->elements[pool->curr_nr++] = element; pool->elements[pool->curr_nr++] = element;
@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
pool->alloc = alloc_fn; pool->alloc = alloc_fn;
pool->free = free_fn; pool->free = free_fn;
init_waitqueue_head(&pool->wait); init_waitqueue_head(&pool->wait);
/*
pool->elements = kmalloc_array_node(min_nr, sizeof(void *), * max() used here to ensure storage for at least 1 element to support
* zero minimum pool
*/
pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *),
gfp_mask, node_id); gfp_mask, node_id);
if (!pool->elements) if (!pool->elements)
return -ENOMEM; return -ENOMEM;
/* /*
* First pre-allocate the guaranteed number of buffers. * First pre-allocate the guaranteed number of buffers,
* also pre-allocate 1 element for zero minimum pool.
*/ */
while (pool->curr_nr < pool->min_nr) { while (pool->curr_nr < max(1, pool->min_nr)) {
void *element; void *element;
element = pool->alloc(gfp_mask, pool->pool_data); element = pool->alloc(gfp_mask, pool->pool_data);
@ -555,20 +559,12 @@ void mempool_free(void *element, mempool_t *pool)
* wake-up path of previous test. This explicit check ensures the * wake-up path of previous test. This explicit check ensures the
* allocation of element when both min_nr and curr_nr are 0, and * allocation of element when both min_nr and curr_nr are 0, and
* any active waiters are properly awakened. * any active waiters are properly awakened.
*
* Inline the same logic as previous test, add_element() cannot be
* directly used here since it has BUG_ON to deny if min_nr equals
* curr_nr, so here picked rest of add_element() to use without
* BUG_ON check.
*/ */
if (unlikely(pool->min_nr == 0 && if (unlikely(pool->min_nr == 0 &&
READ_ONCE(pool->curr_nr) == 0)) { READ_ONCE(pool->curr_nr) == 0)) {
spin_lock_irqsave(&pool->lock, flags); spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr == 0)) { if (likely(pool->curr_nr == 0)) {
/* Inline the logic of add_element() */ add_element(pool, element);
poison_element(pool, element);
if (kasan_poison_element(pool, element))
pool->elements[pool->curr_nr++] = element;
spin_unlock_irqrestore(&pool->lock, flags); spin_unlock_irqrestore(&pool->lock, flags);
if (wq_has_sleeper(&pool->wait)) if (wq_has_sleeper(&pool->wait))
wake_up(&pool->wait); wake_up(&pool->wait);
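
The changes above make add_element() and mempool_free() agree on pools created with min_nr == 0: storage for one element is kept so a freed element can be parked for any waiter. A minimal sketch of such a zero-minimum pool, assuming obj_cache is a kmem_cache created elsewhere:

/* Sketch: a pool that guarantees nothing up front (min_nr == 0) but still
 * lets mempool_free() park one element for sleepers in mempool_alloc(). */
static struct kmem_cache *obj_cache;    /* assumed: created with kmem_cache_create() */
static mempool_t obj_pool;

static int obj_pool_setup(void)
{
        return mempool_init(&obj_pool, 0 /* min_nr */, mempool_alloc_slab,
                            mempool_free_slab, obj_cache);
}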

View File

@ -29,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
#ifdef CONFIG_HUGETLB_PAGE #ifdef CONFIG_HUGETLB_PAGE
unsigned char present; unsigned char present;
unsigned char *vec = walk->private; unsigned char *vec = walk->private;
spinlock_t *ptl;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
/* /*
* Hugepages under user process are always in RAM and never * Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked. * swapped out, but theoretically it needs to be checked.
@ -38,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
for (; addr != end; vec++, addr += PAGE_SIZE) for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present; *vec = present;
walk->private = vec; walk->private = vec;
spin_unlock(ptl);
#else #else
BUG(); BUG();
#endif #endif

View File

@ -164,8 +164,7 @@ retry:
*/ */
/* Check if the vma we locked is the right one. */ /* Check if the vma we locked is the right one. */
if (unlikely(vma->vm_mm != mm || if (unlikely(address < vma->vm_start || address >= vma->vm_end))
address < vma->vm_start || address >= vma->vm_end))
goto inval_end_read; goto inval_end_read;
rcu_read_unlock(); rcu_read_unlock();
@ -236,11 +235,8 @@ retry:
goto fallback; goto fallback;
} }
/* /* Verify the vma is not behind the last search position. */
* Verify the vma we locked belongs to the same address space and it's if (unlikely(from_addr >= vma->vm_end))
* not behind of the last search position.
*/
if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end))
goto fallback_unlock; goto fallback_unlock;
/* /*

View File

@ -766,7 +766,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
unsigned long charged = 0; unsigned long charged = 0;
int error; int error;
if (!can_modify_vma(vma)) if (vma_is_sealed(vma))
return -EPERM; return -EPERM;
if (newflags == oldflags) { if (newflags == oldflags) {

View File

@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
old_pte, max_nr_ptes); old_pte, max_nr_ptes);
force_flush = true; force_flush = true;
} }
pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
pte = move_pte(pte, old_addr, new_addr); pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte); pte = move_soft_dirty_pte(pte);
@ -1651,7 +1651,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
return -EFAULT; return -EFAULT;
/* If mseal()'d, mremap() is prohibited. */ /* If mseal()'d, mremap() is prohibited. */
if (!can_modify_vma(vma)) if (vma_is_sealed(vma))
return -EPERM; return -EPERM;
/* Align to hugetlb page size, if required. */ /* Align to hugetlb page size, if required. */

View File

@ -11,148 +11,74 @@
#include <linux/mman.h> #include <linux/mman.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/sched.h> #include <linux/sched.h>
#include "internal.h" #include "internal.h"
static inline void set_vma_sealed(struct vm_area_struct *vma) /*
{ * mseal() disallows an input range which contain unmapped ranges (VMA holes).
vm_flags_set(vma, VM_SEALED); *
} * It disallows unmapped regions from start to end whether they exist at the
* start, in the middle, or at the end of the range, or any combination thereof.
static bool is_madv_discard(int behavior) *
{ * This is because after sealng a range, there's nothing to stop memory mapping
switch (behavior) { * of ranges in the remaining gaps later, meaning that the user might then
case MADV_FREE: * wrongly consider the entirety of the mseal()'d range to be sealed when it
case MADV_DONTNEED: * in fact isn't.
case MADV_DONTNEED_LOCKED:
case MADV_REMOVE:
case MADV_DONTFORK:
case MADV_WIPEONFORK:
case MADV_GUARD_INSTALL:
return true;
}
return false;
}
static bool is_ro_anon(struct vm_area_struct *vma)
{
/* check anonymous mapping. */
if (vma->vm_file || vma->vm_flags & VM_SHARED)
return false;
/*
* check for non-writable:
* PROT=RO or PKRU is not writeable.
*/ */
if (!(vma->vm_flags & VM_WRITE) ||
!arch_vma_access_permitted(vma, true, false, false))
return true;
return false;
}
/* /*
* Check if a vma is allowed to be modified by madvise. * Does the [start, end) range contain any unmapped memory?
*
* We ensure that:
* - start is part of a valid VMA.
* - end is part of a valid VMA.
* - no gap (unallocated memory) exists between start and end.
*/ */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) static bool range_contains_unmapped(struct mm_struct *mm,
{ unsigned long start, unsigned long end)
if (!is_madv_discard(behavior))
return true;
if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
return false;
/* Allow by default. */
return true;
}
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, vm_flags_t newflags)
{
int ret = 0;
vm_flags_t oldflags = vma->vm_flags;
if (newflags == oldflags)
goto out;
vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
}
set_vma_sealed(vma);
out:
*prev = vma;
return ret;
}
/*
* Check for do_mseal:
* 1> start is part of a valid vma.
* 2> end is part of a valid vma.
* 3> No gap (unallocated address) between start and end.
* 4> map is sealable.
*/
static int check_mm_seal(unsigned long start, unsigned long end)
{ {
struct vm_area_struct *vma; struct vm_area_struct *vma;
unsigned long nstart = start; unsigned long prev_end = start;
VMA_ITERATOR(vmi, current->mm, start); VMA_ITERATOR(vmi, current->mm, start);
/* going through each vma to check. */
for_each_vma_range(vmi, vma, end) { for_each_vma_range(vmi, vma, end) {
if (vma->vm_start > nstart) if (vma->vm_start > prev_end)
/* unallocated memory found. */ return true;
return -ENOMEM;
if (vma->vm_end >= end) prev_end = vma->vm_end;
return 0;
nstart = vma->vm_end;
} }
return -ENOMEM; return prev_end < end;
} }
/* static int mseal_apply(struct mm_struct *mm,
* Apply sealing. unsigned long start, unsigned long end)
*/
static int apply_mm_seal(unsigned long start, unsigned long end)
{ {
unsigned long nstart;
struct vm_area_struct *vma, *prev; struct vm_area_struct *vma, *prev;
unsigned long curr_start = start;
VMA_ITERATOR(vmi, mm, start);
VMA_ITERATOR(vmi, current->mm, start); /* We know there are no gaps so this will be non-NULL. */
vma = vma_iter_load(&vmi); vma = vma_iter_load(&vmi);
/*
* Note: check_mm_seal should already checked ENOMEM case.
* so vma should not be null, same for the other ENOMEM cases.
*/
prev = vma_prev(&vmi); prev = vma_prev(&vmi);
if (start > vma->vm_start) if (start > vma->vm_start)
prev = vma; prev = vma;
nstart = start;
for_each_vma_range(vmi, vma, end) { for_each_vma_range(vmi, vma, end) {
int error; unsigned long curr_end = MIN(vma->vm_end, end);
unsigned long tmp;
vm_flags_t newflags;
newflags = vma->vm_flags | VM_SEALED; if (!(vma->vm_flags & VM_SEALED)) {
tmp = vma->vm_end; vma = vma_modify_flags(&vmi, prev, vma,
if (tmp > end) curr_start, curr_end,
tmp = end; vma->vm_flags | VM_SEALED);
error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (IS_ERR(vma))
if (error) return PTR_ERR(vma);
return error; vm_flags_set(vma, VM_SEALED);
nstart = vma_iter_end(&vmi); }
prev = vma;
curr_start = curr_end;
} }
return 0; return 0;
@ -240,14 +166,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
if (mmap_write_lock_killable(mm)) if (mmap_write_lock_killable(mm))
return -EINTR; return -EINTR;
/* if (range_contains_unmapped(mm, start, end)) {
* First pass, this helps to avoid ret = -ENOMEM;
* partial sealing in case of error in input address range,
* e.g. ENOMEM error.
*/
ret = check_mm_seal(start, end);
if (ret)
goto out; goto out;
}
/* /*
* Second pass, this should success, unless there are errors * Second pass, this should success, unless there are errors
@ -255,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
* reaching the max supported VMAs, however, those cases shall * reaching the max supported VMAs, however, those cases shall
* be rare. * be rare.
*/ */
ret = apply_mm_seal(start, end); ret = mseal_apply(mm, start, end);
out: out:
mmap_write_unlock(current->mm); mmap_write_unlock(mm);
return ret; return ret;
} }

View File

@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask, unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, vm_flags_t vm_flags, int node, pgprot_t prot, unsigned long vm_flags, int node,
const void *caller) const void *caller)
{ {
return __vmalloc_noprof(size, gfp_mask); return __vmalloc_noprof(size, gfp_mask);

View File

@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
flush_cache_range(vma, address, end_addr); flush_cache_range(vma, address, end_addr);
/* Nuke the page table entry. */ /* Nuke the page table entry. */
pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
/* /*
* We clear the PTE but do not flush so potentially * We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio. * a remote CPU could still be writing to the folio.

View File

@ -512,15 +512,27 @@ static int shmem_replace_entry(struct address_space *mapping,
/* /*
 * Sometimes, before we decide whether to proceed or to fail, we must check * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread. * that an entry was not already brought back or split by a racing thread.
 * *
 * Checking folio is not enough: by the time a swapcache folio is locked, it * Checking folio is not enough: by the time a swapcache folio is locked, it
 * might be reused, and again be swapcache, using the same swap as before. * might be reused, and again be swapcache, using the same swap as before.
 * Returns the swap entry's order if it still presents, else returns -1.
 */ */
static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
pgoff_t index, swp_entry_t swap) swp_entry_t swap)
{ {
return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); XA_STATE(xas, &mapping->i_pages, index);
int ret = -1;
void *entry;
rcu_read_lock();
do {
entry = xas_load(&xas);
if (entry == swp_to_radix_entry(swap))
ret = xas_get_order(&xas);
} while (xas_retry(&xas, entry));
rcu_read_unlock();
return ret;
} }
/* /*
@ -891,7 +903,9 @@ static int shmem_add_to_page_cache(struct folio *folio,
pgoff_t index, void *expected, gfp_t gfp) pgoff_t index, void *expected, gfp_t gfp)
{ {
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
long nr = folio_nr_pages(folio); unsigned long nr = folio_nr_pages(folio);
swp_entry_t iter, swap;
void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@ -903,14 +917,25 @@ static int shmem_add_to_page_cache(struct folio *folio,
gfp &= GFP_RECLAIM_MASK; gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp); folio_throttle_swaprate(folio, gfp);
swap = radix_to_swp_entry(expected);
do { do {
iter = swap;
xas_lock_irq(&xas); xas_lock_irq(&xas);
if (expected != xas_find_conflict(&xas)) { xas_for_each_conflict(&xas, entry) {
/*
* The range must either be empty, or filled with
* expected swap entries. Shmem swap entries are never
* partially freed without split of both entry and
* folio, so there shouldn't be any holes.
*/
if (!expected || entry != swp_to_radix_entry(iter)) {
xas_set_err(&xas, -EEXIST); xas_set_err(&xas, -EEXIST);
goto unlock; goto unlock;
} }
if (expected && xas_find_conflict(&xas)) { iter.val += 1 << xas_get_order(&xas);
}
if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST); xas_set_err(&xas, -EEXIST);
goto unlock; goto unlock;
} }
@ -1992,30 +2017,47 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
swp_entry_t entry, int order, gfp_t gfp) swp_entry_t entry, int order, gfp_t gfp)
{ {
struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_inode_info *info = SHMEM_I(inode);
int nr_pages = 1 << order;
struct folio *new; struct folio *new;
gfp_t alloc_gfp;
void *shadow; void *shadow;
int nr_pages;
/* /*
* We have arrived here because our zones are constrained, so don't * We have arrived here because our zones are constrained, so don't
* limit chance of success with further cpuset and node constraints. * limit chance of success with further cpuset and node constraints.
*/ */
gfp &= ~GFP_CONSTRAINT_MASK; gfp &= ~GFP_CONSTRAINT_MASK;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { alloc_gfp = gfp;
gfp_t huge_gfp = vma_thp_gfp_mask(vma); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (WARN_ON_ONCE(order))
return ERR_PTR(-EINVAL);
} else if (order) {
/*
* If uffd is active for the vma, we need per-page fault
* fidelity to maintain the uffd semantics, then fallback
* to swapin order-0 folio, as well as for zswap case.
* Any existing sub folio in the swap cache also blocks
* mTHP swapin.
*/
if ((vma && unlikely(userfaultfd_armed(vma))) ||
!zswap_never_enabled() ||
non_swapcache_batch(entry, nr_pages) != nr_pages)
goto fallback;
gfp = limit_gfp_mask(huge_gfp, gfp); alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
}
retry:
new = shmem_alloc_folio(alloc_gfp, order, info, index);
if (!new) {
new = ERR_PTR(-ENOMEM);
goto fallback;
} }
new = shmem_alloc_folio(gfp, order, info, index);
if (!new)
return ERR_PTR(-ENOMEM);
nr_pages = folio_nr_pages(new);
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
gfp, entry)) { alloc_gfp, entry)) {
folio_put(new); folio_put(new);
return ERR_PTR(-ENOMEM); new = ERR_PTR(-ENOMEM);
goto fallback;
} }
/* /*
@ -2030,7 +2072,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
*/ */
if (swapcache_prepare(entry, nr_pages)) { if (swapcache_prepare(entry, nr_pages)) {
folio_put(new); folio_put(new);
return ERR_PTR(-EEXIST); new = ERR_PTR(-EEXIST);
/* Try smaller folio to avoid cache conflict */
goto fallback;
} }
__folio_set_locked(new); __folio_set_locked(new);
@ -2044,6 +2088,15 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
folio_add_lru(new); folio_add_lru(new);
swap_read_folio(new, NULL); swap_read_folio(new, NULL);
return new; return new;
fallback:
/* Order 0 swapin failed, nothing to fallback to, abort */
if (!order)
return new;
entry.val += index - round_down(index, nr_pages);
alloc_gfp = gfp;
nr_pages = 1;
order = 0;
goto retry;
} }
/* /*
@ -2249,7 +2302,7 @@ unlock:
if (xas_error(&xas)) if (xas_error(&xas))
return xas_error(&xas); return xas_error(&xas);
return entry_order; return 0;
} }
/* /*
@ -2266,133 +2319,109 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_inode_info *info = SHMEM_I(inode);
swp_entry_t swap, index_entry;
struct swap_info_struct *si; struct swap_info_struct *si;
struct folio *folio = NULL; struct folio *folio = NULL;
bool skip_swapcache = false; bool skip_swapcache = false;
swp_entry_t swap; int error, nr_pages, order;
int error, nr_pages, order, split_order; pgoff_t offset;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop); index_entry = radix_to_swp_entry(*foliop);
swap = index_entry;
*foliop = NULL; *foliop = NULL;
if (is_poisoned_swp_entry(swap)) if (is_poisoned_swp_entry(index_entry))
return -EIO; return -EIO;
si = get_swap_device(swap); si = get_swap_device(index_entry);
if (!si) { order = shmem_confirm_swap(mapping, index, index_entry);
if (!shmem_confirm_swap(mapping, index, swap)) if (unlikely(!si)) {
if (order < 0)
return -EEXIST; return -EEXIST;
else else
return -EINVAL; return -EINVAL;
} }
if (unlikely(order < 0)) {
/* Look it up and read it in.. */ put_swap_device(si);
folio = swap_cache_get_folio(swap, NULL, 0); return -EEXIST;
order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
int nr_pages = 1 << order;
bool fallback_order0 = false;
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
} }
/* /* index may point to the middle of a large entry, get the sub entry */
* If uffd is active for the vma, we need per-page fault if (order) {
* fidelity to maintain the uffd semantics, then fallback offset = index - round_down(index, 1 << order);
* to swapin order-0 folio, as well as for zswap case.
* Any existing sub folio in the swap cache also blocks
* mTHP swapin.
*/
if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
!zswap_never_enabled() ||
non_swapcache_batch(swap, nr_pages) != nr_pages))
fallback_order0 = true;
/* Skip swapcache for synchronous device. */
if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
if (!IS_ERR(folio)) {
skip_swapcache = true;
goto alloced;
}
/*
* Fallback to swapin order-0 folio unless the swap entry
* already exists.
*/
error = PTR_ERR(folio);
folio = NULL;
if (error == -EEXIST)
goto failed;
}
/*
* Now swap device can only swap in order 0 folio, then we
* should split the large swap entry stored in the pagecache
* if necessary.
*/
split_order = shmem_split_large_entry(inode, index, swap, gfp);
if (split_order < 0) {
error = split_order;
goto failed;
}
/*
* If the large swap entry has already been split, it is
* necessary to recalculate the new swap entry based on
* the old order alignment.
*/
if (split_order > 0) {
pgoff_t offset = index - round_down(index, 1 << split_order);
swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
} }
/* Here we actually start the io */ /* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
/* Direct swapin skipping swap cache & readahead */
folio = shmem_swap_alloc_folio(inode, vma, index,
index_entry, order, gfp);
if (IS_ERR(folio)) {
error = PTR_ERR(folio);
folio = NULL;
goto failed;
}
skip_swapcache = true;
} else {
/* Cached swapin only supports order 0 folio */
folio = shmem_swapin_cluster(swap, gfp, info, index); folio = shmem_swapin_cluster(swap, gfp, info, index);
if (!folio) { if (!folio) {
error = -ENOMEM; error = -ENOMEM;
goto failed; goto failed;
} }
} else if (order != folio_order(folio)) { }
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
}
if (order > folio_order(folio)) {
/* /*
* Swap readahead may swap in order 0 folios into swapcache * Swapin may get smaller folios due to various reasons:
* It may fallback to order 0 due to memory pressure or race,
* swap readahead may swap in order 0 folios into swapcache
* asynchronously, while the shmem mapping can still stores * asynchronously, while the shmem mapping can still stores
* large swap entries. In such cases, we should split the * large swap entries. In such cases, we should split the
* large swap entry to prevent possible data corruption. * large swap entry to prevent possible data corruption.
*/ */
split_order = shmem_split_large_entry(inode, index, swap, gfp); error = shmem_split_large_entry(inode, index, index_entry, gfp);
if (split_order < 0) { if (error)
folio_put(folio); goto failed_nolock;
folio = NULL;
error = split_order;
goto failed;
} }
/* /*
* If the large swap entry has already been split, it is * If the folio is large, round down swap and index by folio size.
* necessary to recalculate the new swap entry based on * No matter what race occurs, the swap layer ensures we either get
* the old order alignment. * a valid folio that has its swap entry aligned by size, or a
* temporarily invalid one which we'll abort very soon and retry.
*
* shmem_add_to_page_cache ensures the whole range contains expected
* entries and prevents any corruption, so any race split is fine
* too, it will succeed as long as the entries are still there.
*/ */
if (split_order > 0) { nr_pages = folio_nr_pages(folio);
pgoff_t offset = index - round_down(index, 1 << split_order); if (nr_pages > 1) {
swap.val = round_down(swap.val, nr_pages);
swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); index = round_down(index, nr_pages);
}
} }
alloced: /*
/* We have to do this with folio locked to prevent races */ * We have to do this with the folio locked to prevent races.
* The shmem_confirm_swap below only checks if the first swap
* entry matches the folio, that's enough to ensure the folio
* is not used outside of shmem, as shmem swap entries
* and swap cache folios are never partially freed.
*/
folio_lock(folio); folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) || if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val || shmem_confirm_swap(mapping, index, swap) < 0 ||
!shmem_confirm_swap(mapping, index, swap) || folio->swap.val != swap.val) {
xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
error = -EEXIST; error = -EEXIST;
goto unlock; goto unlock;
} }
@ -2415,8 +2444,7 @@ alloced:
goto failed; goto failed;
} }
error = shmem_add_to_page_cache(folio, mapping, error = shmem_add_to_page_cache(folio, mapping, index,
round_down(index, nr_pages),
swp_to_radix_entry(swap), gfp); swp_to_radix_entry(swap), gfp);
if (error) if (error)
goto failed; goto failed;
@ -2439,18 +2467,19 @@ alloced:
*foliop = folio; *foliop = folio;
return 0; return 0;
failed: failed:
if (!shmem_confirm_swap(mapping, index, swap)) if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST; error = -EEXIST;
if (error == -EIO) if (error == -EIO)
shmem_set_folio_swapin_error(inode, index, folio, swap, shmem_set_folio_swapin_error(inode, index, folio, swap,
skip_swapcache); skip_swapcache);
unlock: unlock:
if (skip_swapcache) if (folio)
swapcache_clear(si, swap, folio_nr_pages(folio));
if (folio) {
folio_unlock(folio); folio_unlock(folio);
failed_nolock:
if (skip_swapcache)
swapcache_clear(si, folio->swap, folio_nr_pages(folio));
if (folio)
folio_put(folio); folio_put(folio);
}
put_swap_device(si); put_swap_device(si);
return error; return error;
@ -5960,8 +5989,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio; struct folio *folio;
int error; int error;
error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
gfp, NULL, NULL); &folio, SGP_CACHE, gfp, NULL, NULL);
if (error) if (error)
return ERR_PTR(error); return ERR_PTR(error);
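
The swapin path above relies on two bits of index arithmetic: a fault that
lands in the middle of a large swap entry is redirected to the right sub-entry
(index - round_down(index, 1 << order)), and when a large folio is swapped in,
both the page index and the swap offset are rounded down to the folio size
before insertion. A stand-alone sketch of just that arithmetic, with made-up
example values and a local helper rather than the kernel's round_down() macro:

#include <stdio.h>

/* Power-of-two round down, mirroring what the kernel macro does. */
static unsigned long round_down_ul(unsigned long x, unsigned long align)
{
	return x & ~(align - 1);
}

int main(void)
{
	unsigned long index = 1034;		/* faulting page index (example) */
	int order = 4;				/* large swap entry spans 16 pages */
	unsigned long nr = 1UL << order;
	unsigned long entry_off = 4096;		/* swap offset of the large entry */

	/* Sub-entry actually needed for this fault. */
	unsigned long sub = index - round_down_ul(index, nr);
	unsigned long swap_off = entry_off + sub;

	printf("fault index %lu -> sub-entry %lu, swap offset %lu\n",
	       index, sub, swap_off);

	/* If a large folio comes back, index and swap offset are aligned
	 * to the folio boundary before shmem_add_to_page_cache(). */
	printf("aligned index %lu, aligned swap offset %lu\n",
	       round_down_ul(index, nr), round_down_ul(swap_off, nr));
	return 0;
}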

View File

@ -1351,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
} }
/* Don't bother splitting the VMA if we can't unmap it anyway */ /* Don't bother splitting the VMA if we can't unmap it anyway */
if (!can_modify_vma(vms->vma)) { if (vma_is_sealed(vms->vma)) {
error = -EPERM; error = -EPERM;
goto start_split_failed; goto start_split_failed;
} }
@ -1371,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
for_each_vma_range(*(vms->vmi), next, vms->end) { for_each_vma_range(*(vms->vmi), next, vms->end) {
long nrpages; long nrpages;
if (!can_modify_vma(next)) { if (vma_is_sealed(next)) {
error = -EPERM; error = -EPERM;
goto modify_vma_failed; goto modify_vma_failed;
} }

View File

@ -559,38 +559,15 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
} }
#ifdef CONFIG_64BIT #ifdef CONFIG_64BIT
static inline bool vma_is_sealed(struct vm_area_struct *vma) static inline bool vma_is_sealed(struct vm_area_struct *vma)
{ {
return (vma->vm_flags & VM_SEALED); return (vma->vm_flags & VM_SEALED);
} }
/*
* check if a vma is sealed for modification.
* return true, if modification is allowed.
*/
static inline bool can_modify_vma(struct vm_area_struct *vma)
{
if (unlikely(vma_is_sealed(vma)))
return false;
return true;
}
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
#else #else
static inline bool vma_is_sealed(struct vm_area_struct *vma)
static inline bool can_modify_vma(struct vm_area_struct *vma)
{ {
return true; return false;
} }
static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
return true;
}
#endif #endif
#if defined(CONFIG_STACK_GROWSUP) #if defined(CONFIG_STACK_GROWSUP)

View File

@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs)
cs->nr_evicted, cs->nr_recently_evicted); cs->nr_evicted, cs->nr_recently_evicted);
} }
enum file_type {
FILE_MMAP,
FILE_SHMEM
};
bool write_exactly(int fd, size_t filesize) bool write_exactly(int fd, size_t filesize)
{ {
int random_fd = open("/dev/urandom", O_RDONLY); int random_fd = open("/dev/urandom", O_RDONLY);
@ -201,8 +206,20 @@ out1:
out: out:
return ret; return ret;
} }
const char *file_type_str(enum file_type type)
{
switch (type) {
case FILE_SHMEM:
return "shmem";
case FILE_MMAP:
return "mmap";
default:
return "unknown";
}
}
bool test_cachestat_shmem(void)
bool run_cachestat_test(enum file_type type)
{ {
size_t PS = sysconf(_SC_PAGESIZE); size_t PS = sysconf(_SC_PAGESIZE);
size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */
@ -212,27 +229,50 @@ bool test_cachestat_shmem(void)
char *filename = "tmpshmcstat"; char *filename = "tmpshmcstat";
struct cachestat cs; struct cachestat cs;
bool ret = true; bool ret = true;
int fd;
unsigned long num_pages = compute_len / PS; unsigned long num_pages = compute_len / PS;
int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); if (type == FILE_SHMEM)
fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
else
fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd < 0) { if (fd < 0) {
ksft_print_msg("Unable to create shmem file.\n"); ksft_print_msg("Unable to create %s file.\n",
file_type_str(type));
ret = false; ret = false;
goto out; goto out;
} }
if (ftruncate(fd, filesize)) { if (ftruncate(fd, filesize)) {
ksft_print_msg("Unable to truncate shmem file.\n"); ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type));
ret = false; ret = false;
goto close_fd; goto close_fd;
} }
switch (type) {
case FILE_SHMEM:
if (!write_exactly(fd, filesize)) { if (!write_exactly(fd, filesize)) {
ksft_print_msg("Unable to write to shmem file.\n"); ksft_print_msg("Unable to write to file.\n");
ret = false; ret = false;
goto close_fd; goto close_fd;
} }
break;
case FILE_MMAP:
char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
ksft_print_msg("mmap failed.\n");
ret = false;
goto close_fd;
}
for (int i = 0; i < filesize; i++)
map[i] = 'A';
break;
default:
ksft_print_msg("Unsupported file type.\n");
ret = false;
goto close_fd;
}
syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0);
if (syscall_ret) { if (syscall_ret) {
@ -308,12 +348,18 @@ int main(void)
break; break;
} }
if (test_cachestat_shmem()) if (run_cachestat_test(FILE_SHMEM))
ksft_test_result_pass("cachestat works with a shmem file\n"); ksft_test_result_pass("cachestat works with a shmem file\n");
else { else {
ksft_test_result_fail("cachestat fails with a shmem file\n"); ksft_test_result_fail("cachestat fails with a shmem file\n");
ret = 1; ret = 1;
} }
if (run_cachestat_test(FILE_MMAP))
ksft_test_result_pass("cachestat works with a mmap file\n");
else {
ksft_test_result_fail("cachestat fails with a mmap file\n");
ret = 1;
}
return ret; return ret;
} }
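
run_cachestat_test() above drives the cachestat() syscall against both a shmem
file and a regular file populated through a shared mmap(). For reference, a
stripped-down sketch of the same call outside the kselftest harness; the two
structs mirror the uapi definitions, and the fallback __NR_cachestat value is
the assumed x86_64 number:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_cachestat
#define __NR_cachestat 451	/* assumption: x86_64 syscall number */
#endif

struct cachestat_range {
	uint64_t off;
	uint64_t len;	/* 0 means "to the end of the file" */
};

struct cachestat {
	uint64_t nr_cache;
	uint64_t nr_dirty;
	uint64_t nr_writeback;
	uint64_t nr_evicted;
	uint64_t nr_recently_evicted;
};

int main(int argc, char **argv)
{
	struct cachestat_range range = { 0, 0 };
	struct cachestat cs;
	int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

	if (fd < 0)
		return 1;
	if (syscall(__NR_cachestat, fd, &range, &cs, 0) == 0)
		printf("cached %llu dirty %llu writeback %llu evicted %llu\n",
		       (unsigned long long)cs.nr_cache,
		       (unsigned long long)cs.nr_dirty,
		       (unsigned long long)cs.nr_writeback,
		       (unsigned long long)cs.nr_evicted);
	close(fd);
	return 0;
}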

View File

@ -21,6 +21,7 @@ on-fault-limit
transhuge-stress transhuge-stress
pagemap_ioctl pagemap_ioctl
pfnmap pfnmap
process_madv
*.tmp* *.tmp*
protection_keys protection_keys
protection_keys_32 protection_keys_32

View File

@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += pagemap_ioctl
TEST_GEN_FILES += pfnmap TEST_GEN_FILES += pfnmap
TEST_GEN_FILES += process_madv
TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += uffd-stress TEST_GEN_FILES += uffd-stress

View File

@ -0,0 +1,344 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#define _GNU_SOURCE
#include "../kselftest_harness.h"
#include <errno.h>
#include <setjmp.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <linux/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sched.h>
#include "vm_util.h"
#include "../pidfd/pidfd.h"
FIXTURE(process_madvise)
{
unsigned long page_size;
pid_t child_pid;
int remote_pidfd;
int pidfd;
};
FIXTURE_SETUP(process_madvise)
{
self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
self->pidfd = PIDFD_SELF;
self->remote_pidfd = -1;
self->child_pid = -1;
};
FIXTURE_TEARDOWN_PARENT(process_madvise)
{
/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
if (self->child_pid > 0) {
kill(self->child_pid, SIGKILL);
waitpid(self->child_pid, NULL, 0);
}
if (self->remote_pidfd >= 0)
close(self->remote_pidfd);
}
static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
size_t vlen, int advice, unsigned int flags)
{
return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
}
/*
* This test uses PIDFD_SELF to target the current process. The main
* goal is to verify the basic behavior of process_madvise() with
* a vector of non-contiguous memory ranges, not its cross-process
* capabilities.
*/
TEST_F(process_madvise, basic)
{
const unsigned long pagesize = self->page_size;
const int madvise_pages = 4;
struct iovec vec[madvise_pages];
int pidfd = self->pidfd;
ssize_t ret;
char *map;
/*
* Create a single large mapping. We will pick pages from this
* mapping to advise on. This ensures we test non-contiguous iovecs.
*/
map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (map == MAP_FAILED)
SKIP(return, "mmap failed, not enough memory.\n");
/* Fill the entire region with a known pattern. */
memset(map, 'A', pagesize * 10);
/*
* Setup the iovec to point to 4 non-contiguous pages
* within the mapping.
*/
vec[0].iov_base = &map[0 * pagesize];
vec[0].iov_len = pagesize;
vec[1].iov_base = &map[3 * pagesize];
vec[1].iov_len = pagesize;
vec[2].iov_base = &map[5 * pagesize];
vec[2].iov_len = pagesize;
vec[3].iov_base = &map[8 * pagesize];
vec[3].iov_len = pagesize;
ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
if (ret == -1 && errno == EPERM)
SKIP(return,
"process_madvise() unsupported or permission denied, try running as root.\n");
else if (errno == EINVAL)
SKIP(return,
"process_madvise() unsupported or parameter invalid, please check arguments.\n");
/* The call should succeed and report the total bytes processed. */
ASSERT_EQ(ret, madvise_pages * pagesize);
/* Check that advised pages are now zero. */
for (int i = 0; i < madvise_pages; i++) {
char *advised_page = (char *)vec[i].iov_base;
/* Content must be 0, not 'A'. */
ASSERT_EQ(*advised_page, '\0');
}
/* Check that an un-advised page in between is still 'A'. */
char *unadvised_page = &map[1 * pagesize];
for (int i = 0; i < pagesize; i++)
ASSERT_EQ(unadvised_page[i], 'A');
/* Cleanup. */
ASSERT_EQ(munmap(map, pagesize * 10), 0);
}
/*
* This test deterministically validates process_madvise() with MADV_COLLAPSE
* on a remote process; other advice values are difficult to verify reliably.
*
* It checks that a memory region in a child process can be collapsed
* remotely, focusing on the remote process_madvise() result and verifying
* only addresses and lengths; the correctness of MADV_COLLAPSE itself is
* covered by the khugepaged selftests.
*/
TEST_F(process_madvise, remote_collapse)
{
const unsigned long pagesize = self->page_size;
long huge_page_size;
int pipe_info[2];
ssize_t ret;
struct iovec vec;
struct child_info {
pid_t pid;
void *map_addr;
} info;
huge_page_size = read_pmd_pagesize();
if (huge_page_size <= 0)
SKIP(return, "Could not determine a valid huge page size.\n");
ASSERT_EQ(pipe(pipe_info), 0);
self->child_pid = fork();
ASSERT_NE(self->child_pid, -1);
if (self->child_pid == 0) {
char *map;
size_t map_size = 2 * huge_page_size;
close(pipe_info[0]);
map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_NE(map, MAP_FAILED);
/* Fault in as small pages */
for (size_t i = 0; i < map_size; i += pagesize)
map[i] = 'A';
/* Send info and pause */
info.pid = getpid();
info.map_addr = map;
ret = write(pipe_info[1], &info, sizeof(info));
ASSERT_EQ(ret, sizeof(info));
close(pipe_info[1]);
pause();
exit(0);
}
close(pipe_info[1]);
/* Receive child info */
ret = read(pipe_info[0], &info, sizeof(info));
if (ret <= 0) {
waitpid(self->child_pid, NULL, 0);
SKIP(return, "Failed to read child info from pipe.\n");
}
ASSERT_EQ(ret, sizeof(info));
close(pipe_info[0]);
self->child_pid = info.pid;
self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
ASSERT_GE(self->remote_pidfd, 0);
vec.iov_base = info.map_addr;
vec.iov_len = huge_page_size;
ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
0);
if (ret == -1) {
if (errno == EINVAL)
SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
else if (errno == EPERM)
SKIP(return,
"No process_madvise() permissions, try running as root.\n");
return;
}
ASSERT_EQ(ret, huge_page_size);
}
/*
* Test process_madvise() with a pidfd for a process that has already
* exited to ensure correct error handling.
*/
TEST_F(process_madvise, exited_process_pidfd)
{
const unsigned long pagesize = self->page_size;
struct iovec vec;
char *map;
ssize_t ret;
map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
0);
if (map == MAP_FAILED)
SKIP(return, "mmap failed, not enough memory.\n");
vec.iov_base = map;
vec.iov_len = pagesize;
/*
* Using a pidfd for a process that has already exited should fail
* with ESRCH.
*/
self->child_pid = fork();
ASSERT_NE(self->child_pid, -1);
if (self->child_pid == 0)
exit(0);
self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
ASSERT_GE(self->remote_pidfd, 0);
/* Wait for the child to ensure it has terminated. */
waitpid(self->child_pid, NULL, 0);
ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
0);
ASSERT_EQ(ret, -1);
ASSERT_EQ(errno, ESRCH);
}
/*
* Test process_madvise() with bad pidfds to ensure correct error
* handling.
*/
TEST_F(process_madvise, bad_pidfd)
{
const unsigned long pagesize = self->page_size;
struct iovec vec;
char *map;
ssize_t ret;
map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
0);
if (map == MAP_FAILED)
SKIP(return, "mmap failed, not enough memory.\n");
vec.iov_base = map;
vec.iov_len = pagesize;
/* Using an invalid fd number (-1) should fail with EBADF. */
ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
ASSERT_EQ(ret, -1);
ASSERT_EQ(errno, EBADF);
/*
* Using a valid fd that is not a pidfd (e.g. stdin) should fail
* with EBADF.
*/
ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
ASSERT_EQ(ret, -1);
ASSERT_EQ(errno, EBADF);
}
/*
* Test that process_madvise() rejects vlen > UIO_MAXIOV.
* The kernel should return -EINVAL when the number of iovecs exceeds 1024.
*/
TEST_F(process_madvise, invalid_vlen)
{
const unsigned long pagesize = self->page_size;
int pidfd = self->pidfd;
struct iovec vec;
char *map;
ssize_t ret;
map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
0);
if (map == MAP_FAILED)
SKIP(return, "mmap failed, not enough memory.\n");
vec.iov_base = map;
vec.iov_len = pagesize;
ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
ASSERT_EQ(ret, -1);
ASSERT_EQ(errno, EINVAL);
/* Cleanup. */
ASSERT_EQ(munmap(map, pagesize), 0);
}
/*
* Test process_madvise() with an invalid flag value. Currently, only a flag
* value of 0 is supported. This test is reserved for the future, e.g., if
* synchronous flags are added.
*/
TEST_F(process_madvise, flag)
{
const unsigned long pagesize = self->page_size;
unsigned int invalid_flag;
int pidfd = self->pidfd;
struct iovec vec;
char *map;
ssize_t ret;
map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
0);
if (map == MAP_FAILED)
SKIP(return, "mmap failed, not enough memory.\n");
vec.iov_base = map;
vec.iov_len = pagesize;
invalid_flag = 0x80000000;
ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
ASSERT_EQ(ret, -1);
ASSERT_EQ(errno, EINVAL);
/* Cleanup. */
ASSERT_EQ(munmap(map, pagesize), 0);
}
TEST_HARNESS_MAIN
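
The fixture above exercises process_madvise() through the kselftest harness.
Outside the harness the call pattern reduces to the sketch below. Note that
the return value is the number of bytes advised, which may be less than the
sum of the iovec lengths; the tests only assert the full-success case.
Self-targeting with MADV_DONTNEED needs a recent kernel (and may otherwise
fail with EPERM or EINVAL, as the tests anticipate), and the syscall numbers
are taken from the libc headers:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	size_t ps = (size_t)sysconf(_SC_PAGESIZE);
	char *map = mmap(NULL, 4 * ps, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int pidfd = syscall(__NR_pidfd_open, getpid(), 0);
	struct iovec vec[2];
	ssize_t ret;

	if (map == MAP_FAILED || pidfd < 0)
		return 1;
	memset(map, 'A', 4 * ps);

	/* Two non-contiguous single-page ranges, as in the "basic" test. */
	vec[0].iov_base = map;
	vec[0].iov_len = ps;
	vec[1].iov_base = map + 2 * ps;
	vec[1].iov_len = ps;

	ret = syscall(__NR_process_madvise, pidfd, vec, 2, MADV_DONTNEED, 0);
	if (ret < 0)
		perror("process_madvise");
	else if ((size_t)ret < 2 * ps)
		fprintf(stderr, "partially advised: %zd of %zu bytes\n",
			ret, 2 * ps);
	else
		printf("advised %zd bytes, map[0] is now %d\n", ret, map[0]);

	close(pidfd);
	return 0;
}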

View File

@ -65,6 +65,8 @@ separated by spaces:
test pagemap_scan IOCTL test pagemap_scan IOCTL
- pfnmap - pfnmap
tests for VM_PFNMAP handling tests for VM_PFNMAP handling
- process_madv
test for process_madv
- cow - cow
test copy-on-write semantics test copy-on-write semantics
- thp - thp
@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate CATEGORY="madv_populate" run_test ./madv_populate
# PROCESS_MADV test
CATEGORY="process_madv" run_test ./process_madv
CATEGORY="vma_merge" run_test ./merge CATEGORY="vma_merge" run_test ./merge
if [ -x ./memfd_secret ] if [ -x ./memfd_secret ]

View File

@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr;
#define CAP_IPC_LOCK 14 #define CAP_IPC_LOCK 14
#ifdef CONFIG_64BIT #ifdef CONFIG_64BIT
/* VM is sealed, in vm_flags */ #define VM_SEALED_BIT 42
#define VM_SEALED _BITUL(63) #define VM_SEALED BIT(VM_SEALED_BIT)
#else
#define VM_SEALED VM_NONE
#endif #endif
#define FIRST_USER_ADDRESS 0UL #define FIRST_USER_ADDRESS 0UL