From 5c241ed8d031693dadf33dd98ed2e7cc363e9b66 Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Mon, 28 Jul 2025 15:52:59 +0800
Subject: [PATCH 01/38] mm/shmem, swap: improve cached mTHP handling and fix potential hang

The current swap-in code assumes that, when a swap entry in the shmem
mapping is order 0, its cached folios (if present) must be order 0 too,
which turns out not to always be correct.

The problem is that shmem_split_large_entry() is called before verifying
that the folio will eventually be swapped in. One possible race is:

    CPU1                                   CPU2
    shmem_swapin_folio
    /* swap in of order > 0 swap entry S1 */
      folio = swap_cache_get_folio
      /* folio = NULL */
      order = xa_get_order
      /* order > 0 */
      folio = shmem_swap_alloc_folio
      /* mTHP alloc failure, folio = NULL */
      <... Interrupted ...>
                                           shmem_swapin_folio
                                           /* S1 is swapped in */
                                           shmem_writeout
                                           /* S1 is swapped out, folio cached */
      shmem_split_large_entry(..., S1)
      /* S1 is split, but the folio covering it has order > 0 now */

Now any following swap-in of S1 will hang: `xa_get_order` returns 0, but
folio lookup returns a folio with order > 0, so the
`xa_get_order(&mapping->i_pages, index) != folio_order(folio)` check is
always true, causing swap-in to keep returning -EEXIST. This is fragile.

So fix this up by allowing a larger folio to be seen in the swap cache,
and by checking that the whole shmem mapping range covered by the swap-in
holds the expected swap values when inserting the folio. Also drop the
now-redundant tree walks before the insertion.

This actually improves performance, as it avoids two redundant Xarray
tree walks in the hot path. The only side effect is that, in the failure
path, shmem may redundantly reallocate a few folios, causing temporary,
slight memory pressure.

Worth noting, it may seem that checking the order and value before
inserting would help reduce lock contention, but that is not true. The
swap cache layer ensures that a raced swap-in will either see a swap
cache folio or fail to do the swap-in (the SWAP_HAS_CACHE bit is set even
if the swap cache is bypassed), so holding the folio lock and checking
the folio flag is already good enough to avoid the lock contention. The
chance that a folio passes the swap entry value check while the shmem
mapping slot has changed should be very low.
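
As a rough illustration of the range check described above, the sketch
below models swap entries as plain consecutive integers; the function and
variable names are invented for the sketch and are not kernel APIs:

	/*
	 * entries[i] is the value found in the i-th conflicting mapping slot,
	 * order[i] is the order of that entry.  The range is acceptable only
	 * if it is completely filled with the expected, consecutive values.
	 */
	static bool range_matches(const unsigned long *entries,
				  const unsigned int *order, int n,
				  unsigned long expected_first, unsigned long nr)
	{
		unsigned long iter = expected_first;
		int i;

		for (i = 0; i < n; i++) {
			if (entries[i] != iter)
				return false;	/* hole or foreign entry -> -EEXIST */
			iter += 1UL << order[i];	/* skip the slots this entry covers */
		}
		return iter - expected_first == nr;	/* whole range covered */
	}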
Link: https://lkml.kernel.org/r/20250728075306.12704-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250728075306.12704-2-ryncsn@gmail.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Kairui Song Reviewed-by: Kemeng Shi Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Dev Jain Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7570a24e0ae4..1d0fd266c29b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -891,7 +891,9 @@ static int shmem_add_to_page_cache(struct folio *folio, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); - long nr = folio_nr_pages(folio); + unsigned long nr = folio_nr_pages(folio); + swp_entry_t iter, swap; + void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -903,14 +905,25 @@ static int shmem_add_to_page_cache(struct folio *folio, gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); + swap = radix_to_swp_entry(expected); do { + iter = swap; xas_lock_irq(&xas); - if (expected != xas_find_conflict(&xas)) { - xas_set_err(&xas, -EEXIST); - goto unlock; + xas_for_each_conflict(&xas, entry) { + /* + * The range must either be empty, or filled with + * expected swap entries. Shmem swap entries are never + * partially freed without split of both entry and + * folio, so there shouldn't be any holes. + */ + if (!expected || entry != swp_to_radix_entry(iter)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + iter.val += 1 << xas_get_order(&xas); } - if (expected && xas_find_conflict(&xas)) { + if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } @@ -2359,7 +2372,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } - } else if (order != folio_order(folio)) { + } else if (order > folio_order(folio)) { /* * Swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores @@ -2384,15 +2397,23 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } + } else if (order < folio_order(folio)) { + swap.val = round_down(swap.val, 1 << folio_order(folio)); + index = round_down(index, 1 << folio_order(folio)); } alloced: - /* We have to do this with folio locked to prevent races */ + /* + * We have to do this with the folio locked to prevent races. + * The shmem_confirm_swap below only checks if the first swap + * entry matches the folio, that's enough to ensure the folio + * is not used outside of shmem, as shmem swap entries + * and swap cache folios are never partially freed. 
+ */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || - folio->swap.val != swap.val || !shmem_confirm_swap(mapping, index, swap) || - xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { + folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } From 8d58d65621118fdca3ed6a0b3d658ba7e0e5153c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 31 Jul 2025 09:53:43 +0800 Subject: [PATCH 02/38] mm: shmem: fix the shmem large folio allocation for the i915 driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs"), we extend the 'huge=' option to allow any sized large folios for tmpfs, which means tmpfs will allow getting a highest order hint based on the size of write() and fallocate() paths, and then will try each allowable large order. However, when the i915 driver allocates shmem memory, it doesn't provide hint information about the size of the large folio to be allocated, resulting in the inability to allocate PMD-sized shmem, which in turn affects GPU performance. Patryk added: : In my tests, the performance drop ranges from a few percent up to 13% : in Unigine Superposition under heavy memory usage on the CPU Core Ultra : 155H with the Xe 128 EU GPU. Other users have reported performance : impact up to 30% on certain workloads. Please find more in the : regressions reports: : https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/14645 : https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13845 : : I believe the change should be backported to all active kernel branches : after version 6.12. To fix this issue, we can use the inode's size as a write size hint in shmem_read_folio_gfp() to help allocate PMD-sized large folios. Link: https://lkml.kernel.org/r/f7e64e99a3a87a8144cc6b2f1dddf7a89c12ce44.1753926601.git.baolin.wang@linux.alibaba.com Fixes: acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") Signed-off-by: Baolin Wang Reported-by: Patryk Kowalczyk Reported-by: Ville Syrjälä Tested-by: Patryk Kowalczyk Suggested-by: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1d0fd266c29b..5e9ec28fab85 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5981,8 +5981,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, - gfp, NULL, NULL); + error = shmem_get_folio_gfp(inode, index, i_size_read(inode), + &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); From b50e37889f9f343b772d9162d00105bb7a26c2f5 Mon Sep 17 00:00:00 2001 From: wang lian Date: Mon, 21 Jul 2025 19:46:14 +0800 Subject: [PATCH 03/38] selftests/mm: add process_madvise() tests Add tests for process_madvise(), focusing on verifying behavior under various conditions including valid usage and error cases. 
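
For reference, a minimal user-space invocation of the interface under
test could look like the sketch below. It is illustrative only: it
targets the caller's own mm through pidfd_open() rather than PIDFD_SELF,
assumes a kernel that accepts MADV_DONTNEED through process_madvise()
for the caller's own mm, and keeps error handling to a minimum:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		char *map = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		int pidfd = syscall(__NR_pidfd_open, getpid(), 0);
		struct iovec vec = { .iov_base = map, .iov_len = psz };
		ssize_t ret;

		memset(map, 'A', 4 * psz);
		/* Advise away one page of our own address space. */
		ret = syscall(__NR_process_madvise, pidfd, &vec, 1,
			      MADV_DONTNEED, 0);
		if (ret < 0)
			perror("process_madvise");
		else
			printf("advised %zd bytes, first byte is now %d\n",
			       ret, map[0]);
		return 0;
	}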
[lianux.mm@gmail.com: v7] Link: https://lkml.kernel.org/r/20250729113109.12272-1-lianux.mm@gmail.com Link: https://lkml.kernel.org/r/20250729113109.12272-1-lianux.mm@gmail.com Link: https://lkml.kernel.org/r/20250721114614.40996-1-lianux.mm@gmail.com Signed-off-by: wang lian Suggested-by: Lorenzo Stoakes Suggested-by: David Hildenbrand Suggested-by: Zi Yan Suggested-by: Mark Brown Acked-by: SeongJae Park Reviewed-by: Zi Yan Tested-by: Zi Yan Cc: Christian Brauner Cc: Jann Horn Cc: Kairui Song Cc: Liam Howlett Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/process_madv.c | 344 ++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 5 + 4 files changed, 351 insertions(+) create mode 100644 tools/testing/selftests/mm/process_madv.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index f2dafa0b700b..e7b23a8a05fe 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -21,6 +21,7 @@ on-fault-limit transhuge-stress pagemap_ioctl pfnmap +process_madv *.tmp* protection_keys protection_keys_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index ae6f994d3add..d13b3cef2a2b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += pfnmap +TEST_GEN_FILES += process_madv TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c new file mode 100644 index 000000000000..471cae8427f1 --- /dev/null +++ b/tools/testing/selftests/mm/process_madv.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vm_util.h" + +#include "../pidfd/pidfd.h" + +FIXTURE(process_madvise) +{ + unsigned long page_size; + pid_t child_pid; + int remote_pidfd; + int pidfd; +}; + +FIXTURE_SETUP(process_madvise) +{ + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); + self->pidfd = PIDFD_SELF; + self->remote_pidfd = -1; + self->child_pid = -1; +}; + +FIXTURE_TEARDOWN_PARENT(process_madvise) +{ + /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } + + if (self->remote_pidfd >= 0) + close(self->remote_pidfd); +} + +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t vlen, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); +} + +/* + * This test uses PIDFD_SELF to target the current process. The main + * goal is to verify the basic behavior of process_madvise() with + * a vector of non-contiguous memory ranges, not its cross-process + * capabilities. + */ +TEST_F(process_madvise, basic) +{ + const unsigned long pagesize = self->page_size; + const int madvise_pages = 4; + struct iovec vec[madvise_pages]; + int pidfd = self->pidfd; + ssize_t ret; + char *map; + + /* + * Create a single large mapping. We will pick pages from this + * mapping to advise on. 
This ensures we test non-contiguous iovecs. + */ + map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + /* Fill the entire region with a known pattern. */ + memset(map, 'A', pagesize * 10); + + /* + * Setup the iovec to point to 4 non-contiguous pages + * within the mapping. + */ + vec[0].iov_base = &map[0 * pagesize]; + vec[0].iov_len = pagesize; + vec[1].iov_base = &map[3 * pagesize]; + vec[1].iov_len = pagesize; + vec[2].iov_base = &map[5 * pagesize]; + vec[2].iov_len = pagesize; + vec[3].iov_base = &map[8 * pagesize]; + vec[3].iov_len = pagesize; + + ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); + if (ret == -1 && errno == EPERM) + SKIP(return, + "process_madvise() unsupported or permission denied, try running as root.\n"); + else if (errno == EINVAL) + SKIP(return, + "process_madvise() unsupported or parameter invalid, please check arguments.\n"); + + /* The call should succeed and report the total bytes processed. */ + ASSERT_EQ(ret, madvise_pages * pagesize); + + /* Check that advised pages are now zero. */ + for (int i = 0; i < madvise_pages; i++) { + char *advised_page = (char *)vec[i].iov_base; + + /* Content must be 0, not 'A'. */ + ASSERT_EQ(*advised_page, '\0'); + } + + /* Check that an un-advised page in between is still 'A'. */ + char *unadvised_page = &map[1 * pagesize]; + + for (int i = 0; i < pagesize; i++) + ASSERT_EQ(unadvised_page[i], 'A'); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize * 10), 0); +} + +/* + * This test deterministically validates process_madvise() with MADV_COLLAPSE + * on a remote process, other advices are difficult to verify reliably. + * + * The test verifies that a memory region in a child process, + * focus on process_madv remote result, only check addresses and lengths. + * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged. 
+ */ +TEST_F(process_madvise, remote_collapse) +{ + const unsigned long pagesize = self->page_size; + long huge_page_size; + int pipe_info[2]; + ssize_t ret; + struct iovec vec; + + struct child_info { + pid_t pid; + void *map_addr; + } info; + + huge_page_size = read_pmd_pagesize(); + if (huge_page_size <= 0) + SKIP(return, "Could not determine a valid huge page size.\n"); + + ASSERT_EQ(pipe(pipe_info), 0); + + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) { + char *map; + size_t map_size = 2 * huge_page_size; + + close(pipe_info[0]); + + map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Fault in as small pages */ + for (size_t i = 0; i < map_size; i += pagesize) + map[i] = 'A'; + + /* Send info and pause */ + info.pid = getpid(); + info.map_addr = map; + ret = write(pipe_info[1], &info, sizeof(info)); + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[1]); + + pause(); + exit(0); + } + + close(pipe_info[1]); + + /* Receive child info */ + ret = read(pipe_info[0], &info, sizeof(info)); + if (ret <= 0) { + waitpid(self->child_pid, NULL, 0); + SKIP(return, "Failed to read child info from pipe.\n"); + } + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[0]); + self->child_pid = info.pid; + + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(self->remote_pidfd, 0); + + vec.iov_base = info.map_addr; + vec.iov_len = huge_page_size; + + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, + 0); + if (ret == -1) { + if (errno == EINVAL) + SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); + else if (errno == EPERM) + SKIP(return, + "No process_madvise() permissions, try running as root.\n"); + return; + } + + ASSERT_EQ(ret, huge_page_size); +} + +/* + * Test process_madvise() with a pidfd for a process that has already + * exited to ensure correct error handling. + */ +TEST_F(process_madvise, exited_process_pidfd) +{ + const unsigned long pagesize = self->page_size; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + /* + * Using a pidfd for a process that has already exited should fail + * with ESRCH. + */ + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) + exit(0); + + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(self->remote_pidfd, 0); + + /* Wait for the child to ensure it has terminated. */ + waitpid(self->child_pid, NULL, 0); + + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED, + 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ESRCH); +} + +/* + * Test process_madvise() with bad pidfds to ensure correct error + * handling. + */ +TEST_F(process_madvise, bad_pidfd) +{ + const unsigned long pagesize = self->page_size; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + /* Using an invalid fd number (-1) should fail with EBADF. */ + ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); + + /* + * Using a valid fd that is not a pidfd (e.g. stdin) should fail + * with EBADF. 
+ */ + ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); +} + +/* + * Test that process_madvise() rejects vlen > UIO_MAXIOV. + * The kernel should return -EINVAL when the number of iovecs exceeds 1024. + */ +TEST_F(process_madvise, invalid_vlen) +{ + const unsigned long pagesize = self->page_size; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize), 0); +} + +/* + * Test process_madvise() with an invalid flag value. Currently, only a flag + * value of 0 is supported. This test is reserved for the future, e.g., if + * synchronous flags are added. + */ +TEST_F(process_madvise, flag) +{ + const unsigned long pagesize = self->page_size; + unsigned int invalid_flag; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + invalid_flag = 0x80000000; + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index a38c984103ce..471e539d82b8 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -65,6 +65,8 @@ separated by spaces: test pagemap_scan IOCTL - pfnmap tests for VM_PFNMAP handling +- process_madv + test for process_madv - cow test copy-on-write semantics - thp @@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +# PROCESS_MADV test +CATEGORY="process_madv" run_test ./process_madv + CATEGORY="vma_merge" run_test ./merge if [ -x ./memfd_secret ] From d171b10b2d7b067c16d79e1d069a23a34f088d23 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 22 Jul 2025 11:22:30 -0700 Subject: [PATCH 04/38] mm/page-flags: remove folio_start_writeback_keepwrite() Commit cd57b77197a4 ("ext4: Convert ext4_bio_write_page() to use a folio) removed set_page_writeback_keepwrite() which was the last/only caller of folio_start_writeback_keepwrite(). 
Link: https://lkml.kernel.org/r/20250722182230.2114587-1-joannelkoong@gmail.com Signed-off-by: Joanne Koong Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8e4d6eda8a8d..8d3fa3a91ce4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -837,8 +837,6 @@ void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) -#define folio_start_writeback_keepwrite(folio) \ - __folio_start_writeback(folio, true) static __always_inline bool folio_test_head(const struct folio *folio) { From 56bdf83de7f1151d141e1d020e19cc1c56ff0db4 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 23 Jul 2025 16:59:19 +0200 Subject: [PATCH 05/38] kasan: skip quarantine if object is still accessible under RCU Currently, enabling KASAN masks bugs where a lockless lookup path gets a pointer to a SLAB_TYPESAFE_BY_RCU object that might concurrently be recycled and is insufficiently careful about handling recycled objects: KASAN puts freed objects in SLAB_TYPESAFE_BY_RCU slabs onto its quarantine queues, even when it can't actually detect UAF in these objects, and the quarantine prevents fast recycling. When I introduced CONFIG_SLUB_RCU_DEBUG, my intention was that enabling CONFIG_SLUB_RCU_DEBUG should cause KASAN to mark such objects as freed after an RCU grace period and put them on the quarantine, while disabling CONFIG_SLUB_RCU_DEBUG should allow such objects to be reused immediately; but that hasn't actually been working. I discovered such a UAF bug involving SLAB_TYPESAFE_BY_RCU yesterday; I could only trigger this bug in a KASAN build by disabling CONFIG_SLUB_RCU_DEBUG and applying this patch. Link: https://lkml.kernel.org/r/20250723-kasan-tsbrcu-noquarantine-v1-1-846c8645976c@google.com Signed-off-by: Jann Horn Acked-by: Vlastimil Babka Reviewed-by: Alexander Potapenko Acked-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitriy Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/common.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index ed4873e18c75..9142964ab9c9 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object, } static inline void poison_slab_object(struct kmem_cache *cache, void *object, - bool init, bool still_accessible) + bool init) { void *tagged_object = object; object = kasan_reset_tag(object); - /* RCU slabs could be legally used after free within the RCU period. */ - if (unlikely(still_accessible)) - return; - kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), KASAN_SLAB_FREE, init); @@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; - poison_slab_object(cache, object, init, still_accessible); + /* + * If this point is reached with an object that must still be + * accessible under RCU, we can't poison it; in that case, also skip the + * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG + * has been disabled manually. 
+ * + * Putting the object on the quarantine wouldn't help catch UAFs (since + * we can't poison it here), and it would mask bugs caused by + * SLAB_TYPESAFE_BY_RCU users not being careful enough about object + * reuse; so overall, putting the object into the quarantine here would + * be counterproductive. + */ + if (still_accessible) + return false; + + poison_slab_object(cache, object, init); /* * If the object is put into quarantine, do not let slab put the object @@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) if (check_slab_allocation(slab->slab_cache, ptr, ip)) return false; - poison_slab_object(slab->slab_cache, ptr, false, false); + poison_slab_object(slab->slab_cache, ptr, false); return true; } From 881388f34338197f4ea3adf4d08dc6374c3420c8 Mon Sep 17 00:00:00 2001 From: Xuanye Liu Date: Wed, 23 Jul 2025 18:09:00 +0800 Subject: [PATCH 06/38] mm: add process info to bad rss-counter warning Enhance the debugging information in check_mm() by including the process name and PID when reporting bad rss-counter states. This helps identify which process is associated with the memory accounting issue. Link: https://lkml.kernel.org/r/20250723100901.1909683-1-liuqiye2025@163.com Signed-off-by: Xuanye Liu Acked-by: SeongJae Park Cc: Ben Segall Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..f799d128b968 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (unlikely(x)) - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", - mm, resident_page_types[i], x); + if (unlikely(x)) { + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", + mm, resident_page_types[i], x, + current->comm, + task_pid_nr(current)); + } } if (mm_pgtables_bytes(mm)) From d6a511dea45ce3e851326b6bdc63f827ebb3e765 Mon Sep 17 00:00:00 2001 From: Suresh K C Date: Wed, 9 Jul 2025 23:16:57 +0530 Subject: [PATCH 07/38] selftests: cachestat: add tests for mmap, refactor and enhance mmap test for cachestat validation Add a cohesive test case that verifies cachestat behavior with memory-mapped files using mmap(). Also refactor the test logic to reduce redundancy, improve error reporting, and clarify failure messages for both shmem and mmap file types. 
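
As background, a minimal (illustrative) cachestat() call on a regular
file might look like the sketch below. The two structures mirror the
<linux/mman.h> UAPI definitions and are redeclared locally only so the
sketch stands alone; a zero-length range means "to the end of the file":

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_cachestat
	#define __NR_cachestat 451	/* not present in older libc headers */
	#endif

	struct cachestat_range { uint64_t off, len; };
	struct cachestat {
		uint64_t nr_cache, nr_dirty, nr_writeback,
			 nr_evicted, nr_recently_evicted;
	};

	int main(int argc, char **argv)
	{
		const char *path = argc > 1 ? argv[1] : "/etc/hostname";
		int fd = open(path, O_RDONLY);
		char buf[64];
		struct cachestat_range range = { 0, 0 };
		struct cachestat cs;

		if (fd < 0 || read(fd, buf, sizeof(buf)) < 0)
			return 1;
		if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {
			perror("cachestat");
			return 1;
		}
		printf("cached=%llu dirty=%llu writeback=%llu\n",
		       (unsigned long long)cs.nr_cache,
		       (unsigned long long)cs.nr_dirty,
		       (unsigned long long)cs.nr_writeback);
		return 0;
	}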
[akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250709174657.6916-1-suresh.k.chandrappa@gmail.com Signed-off-by: Suresh K C Reviewed-by: Joshua Hahn Tested-by: Nhat Pham Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/cachestat/test_cachestat.c | 62 ++++++++++++++++--- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index 632ab44737ec..c952640f163b 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs) cs->nr_evicted, cs->nr_recently_evicted); } +enum file_type { + FILE_MMAP, + FILE_SHMEM +}; + bool write_exactly(int fd, size_t filesize) { int random_fd = open("/dev/urandom", O_RDONLY); @@ -201,8 +206,20 @@ out1: out: return ret; } +const char *file_type_str(enum file_type type) +{ + switch (type) { + case FILE_SHMEM: + return "shmem"; + case FILE_MMAP: + return "mmap"; + default: + return "unknown"; + } +} -bool test_cachestat_shmem(void) + +bool run_cachestat_test(enum file_type type) { size_t PS = sysconf(_SC_PAGESIZE); size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ @@ -212,27 +229,50 @@ bool test_cachestat_shmem(void) char *filename = "tmpshmcstat"; struct cachestat cs; bool ret = true; + int fd; unsigned long num_pages = compute_len / PS; - int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + if (type == FILE_SHMEM) + fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + else + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd < 0) { - ksft_print_msg("Unable to create shmem file.\n"); + ksft_print_msg("Unable to create %s file.\n", + file_type_str(type)); ret = false; goto out; } if (ftruncate(fd, filesize)) { - ksft_print_msg("Unable to truncate shmem file.\n"); + ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type)); ret = false; goto close_fd; } + switch (type) { + case FILE_SHMEM: + if (!write_exactly(fd, filesize)) { + ksft_print_msg("Unable to write to file.\n"); + ret = false; + goto close_fd; + } + break; + case FILE_MMAP: + char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); - if (!write_exactly(fd, filesize)) { - ksft_print_msg("Unable to write to shmem file.\n"); + if (map == MAP_FAILED) { + ksft_print_msg("mmap failed.\n"); + ret = false; + goto close_fd; + } + for (int i = 0; i < filesize; i++) + map[i] = 'A'; + break; + default: + ksft_print_msg("Unsupported file type.\n"); ret = false; goto close_fd; } - syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); if (syscall_ret) { @@ -308,12 +348,18 @@ int main(void) break; } - if (test_cachestat_shmem()) + if (run_cachestat_test(FILE_SHMEM)) ksft_test_result_pass("cachestat works with a shmem file\n"); else { ksft_test_result_fail("cachestat fails with a shmem file\n"); ret = 1; } + if (run_cachestat_test(FILE_MMAP)) + ksft_test_result_pass("cachestat works with a mmap file\n"); + else { + ksft_test_result_fail("cachestat fails with a mmap file\n"); + ret = 1; + } return ret; } From dee3ab621f2bab8e58e343bee0302d66c9b035ef Mon Sep 17 00:00:00 2001 From: Bijan Tabatabai Date: Fri, 25 Jul 2025 11:33:00 -0500 Subject: [PATCH 08/38] mm/damon/vaddr: skip isolating folios already in destination nid damos_va_migrate_dests_add() determines the node a folio should be in based on the struct damos_migrate_dests associated with the 
migration scheme and adds the folio to the linked list corresponding to that node so it can be migrated later. Currently, folios are isolated and added to the list even if they are already in the node they should be in. In using damon weighted interleave more, I've found that the overhead of needlessly adding these folios to the migration lists can be quite high. The overhead comes from isolating folios and placing them in the migration lists inside of damos_va_migrate_dests_add(), as well as the cost of handling those folios in damon_migrate_pages(). This patch eliminates that overhead by simply avoiding the addition of folios that are already in their intended location to the migration list. To show the benefit of this patch, we start the test workload and start a DAMON instance attached to that workload with a migrate_hot scheme that has one dest field sending data to the local node. This way, we are only measuring the overheads of the scheme, and not the cost of migrating pages, since data will be allocated to the local node by default. I tested with two workloads: the embedding reduction workload used in [1] and a microbenchmark that allocates 20GB of data then sleeps, which is similar to the memory usage of the embedding reduction workload. The time taken in damos_va_migrate_dests_add() and damon_migrate_pages() each aggregation interval is shown below. Before this patch: damos_va_migrate_dests_add damon_migrate_pages microbenchmark ~2ms ~3ms embedding reduction ~1s ~3s After this patch: damos_va_migrate_dests_add damon_migrate_pages microbenchmark 0us ~40us embedding reduction 0us ~100us I did not do an in depth analysis for why things are much slower in the embedding reduction workload than the microbenchmark. However, I assume it's because the embedding reduction workload oversaturates the bandwidth of the local memory node, increasing the memory access latency, and in turn making the pointer chasing involved in iterating through a linked list much slower. Regardless of that, this patch results in a significant speedup. [1] https://lore.kernel.org/damon/20250709005952.17776-1-bijan311@gmail.com/ Link: https://lkml.kernel.org/r/20250725163300.4602-1-bijan311@gmail.com Fixes: 19c1dc15c859 ("mm/damon/vaddr: use damos->migrate_dests in migrate_{hot,cold}") Signed-off-by: Bijan Tabatabai Reviewed-by: SeongJae Park Reviewed-by: Raghavendra K T Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 94af19c4dfed..87e825349bdf 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -711,6 +711,10 @@ static void damos_va_migrate_dests_add(struct folio *folio, target -= dests->weight_arr[i]; } + /* If the folio is already in the right node, don't do anything */ + if (folio_nid(folio) == dests->node_id_arr[i]) + return; + isolate: if (!folio_isolate_lru(folio)) return; From f225b34f1e6c81c50e48f6207ddb6d290be1b932 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 09:29:41 +0100 Subject: [PATCH 09/38] mm/mseal: always define VM_SEALED Patch series "mseal cleanups", v4. Perform a number of cleanups to the mseal logic. Firstly, VM_SEALED is treated differently from every other VMA flag, it really doesn't make sense to do this, so we start by making this consistent with everything else. Next we place the madvise logic where it belongs - in mm/madvise.c. It really makes no sense to abstract this elsewhere. 
In doing so, we go to great lengths to explain very clearly the previously very confusing logic as to what sealed mappings are impacted here. In doing so, we retain existing logic regarding treatment of madvise() discard operations for a sealed, read-only MAP_PRIVATE file-backed mapping. This is something we likely need to revisit. We then abstract out and explain the 'are there are any gaps in this range in the mm?' check being performed as a prerequisite to mseal being performed. Finally, we simplify the actual mseal logic which is really quite straightforward. No functional change is intended. This patch (of 4): There is no reason to treat VM_SEALED in a special way, in each other case in which a VMA flag is unavailable due to configuration, we simply assign that flag to VM_NONE, so make VM_SEALED consistent with all other VMA flags in this respect. Additionally, use the next available bit for VM_SEALED, 42, rather than arbitrarily putting it at 63 and update the declaration to match all other VMA flags. No functional change intended. Link: https://lkml.kernel.org/r/cover.1753431105.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/aeb398a77029b6e7377cd944328bc9bbc3c90537.1753431105.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand Cc: Jann Horn Cc: Jeff Xu Cc: Kees Cook Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- tools/testing/vma/vma_internal.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e3a4c5b78ff..ceaa780a703a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 991022e9e0d3..0fe52fd6782b 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr; #define CAP_IPC_LOCK 14 #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif #define FIRST_USER_ADDRESS 0UL From d0b47a6866f1047247061f3a38f12a981825b265 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 09:29:42 +0100 Subject: [PATCH 10/38] mm/mseal: update madvise() logic The madvise() logic is inexplicably performed in mm/mseal.c - this ought to be located in mm/madvise.c. Additionally can_modify_vma_madv() is inconsistently named and, in combination with is_ro_anon(), is very confusing logic. Put a static function in mm/madvise.c instead - can_madvise_modify() - that spells out exactly what's happening. Also explicitly check for an anon VMA. Also add commentary to explain what's going on. Essentially - we disallow discarding of data in mseal()'d mappings in instances where the user couldn't otherwise write to that data. 
We retain the existing behaviour here regarding MAP_PRIVATE mappings of file-backed mappings, which entails some complexity - while this, strictly speaking - appears to violate mseal() semantics, it may interact badly with users which expect to be able to madvise(MADV_DONTNEED) .text mappings for instance. We may revisit this at a later date. No functional change intended. Link: https://lkml.kernel.org/r/492a98d9189646e92c8f23f4cce41ed323fe01df.1753431105.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand Cc: Jann Horn Cc: Jeff Xu Cc: Kees Cook Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++- mm/mseal.c | 49 ------------------------------------ mm/vma.h | 7 ------ 3 files changed, 70 insertions(+), 57 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index bb80fc5ea08f..7f9af2dbd044 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1256,6 +1257,74 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior) &guard_remove_walk_ops, NULL); } +#ifdef CONFIG_64BIT +/* Does the madvise operation result in discarding of mapped data? */ +static bool is_discard(int behavior) +{ + switch (behavior) { + case MADV_FREE: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_REMOVE: + case MADV_DONTFORK: + case MADV_WIPEONFORK: + case MADV_GUARD_INSTALL: + return true; + } + + return false; +} + +/* + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular + * circumstances - discarding of data from read-only anonymous SEALED mappings. + * + * This is because users cannot trivally discard data from these VMAs, and may + * only do so via an appropriate madvise() call. + */ +static bool can_madvise_modify(struct madvise_behavior *madv_behavior) +{ + struct vm_area_struct *vma = madv_behavior->vma; + + /* If the VMA isn't sealed we're good. */ + if (can_modify_vma(vma)) + return true; + + /* For a sealed VMA, we only care about discard operations. */ + if (!is_discard(madv_behavior->behavior)) + return true; + + /* + * We explicitly permit all file-backed mappings, whether MAP_SHARED or + * MAP_PRIVATE. + * + * The latter causes some complications. Because now, one can mmap() + * read/write a MAP_PRIVATE mapping, write to it, then mprotect() + * read-only, mseal() and a discard will be permitted. + * + * However, in order to avoid issues with potential use of madvise(..., + * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, + * permit this. + */ + if (!vma_is_anonymous(vma)) + return true; + + /* If the user could write to the mapping anyway, then this is fine. */ + if ((vma->vm_flags & VM_WRITE) && + arch_vma_access_permitted(vma, /* write= */ true, + /* execute= */ false, /* foreign= */ false)) + return true; + + /* Otherwise, we are not permitted to perform this operation. */ + return false; +} +#else +static bool can_madvise_modify(struct madvise_behavior *madv_behavior) +{ + return true; +} +#endif + /* * Apply an madvise behavior to a region of a vma. 
madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own @@ -1269,7 +1338,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) struct madvise_behavior_range *range = &madv_behavior->range; int error; - if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) + if (unlikely(!can_madvise_modify(madv_behavior))) return -EPERM; switch (behavior) { diff --git a/mm/mseal.c b/mm/mseal.c index c27197ac04e8..1308e88ab184 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include "internal.h" @@ -21,54 +20,6 @@ static inline void set_vma_sealed(struct vm_area_struct *vma) vm_flags_set(vma, VM_SEALED); } -static bool is_madv_discard(int behavior) -{ - switch (behavior) { - case MADV_FREE: - case MADV_DONTNEED: - case MADV_DONTNEED_LOCKED: - case MADV_REMOVE: - case MADV_DONTFORK: - case MADV_WIPEONFORK: - case MADV_GUARD_INSTALL: - return true; - } - - return false; -} - -static bool is_ro_anon(struct vm_area_struct *vma) -{ - /* check anonymous mapping. */ - if (vma->vm_file || vma->vm_flags & VM_SHARED) - return false; - - /* - * check for non-writable: - * PROT=RO or PKRU is not writeable. - */ - if (!(vma->vm_flags & VM_WRITE) || - !arch_vma_access_permitted(vma, true, false, false)) - return true; - - return false; -} - -/* - * Check if a vma is allowed to be modified by madvise. - */ -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) -{ - if (!is_madv_discard(behavior)) - return true; - - if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) - return false; - - /* Allow by default. */ - return true; -} - static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) diff --git a/mm/vma.h b/mm/vma.h index acdcc515c459..85db5e880fcc 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -577,8 +577,6 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) return true; } -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); - #else static inline bool can_modify_vma(struct vm_area_struct *vma) @@ -586,11 +584,6 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) return true; } -static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) -{ - return true; -} - #endif #if defined(CONFIG_STACK_GROWSUP) From 8b2914162aa3a56062d4b7c716149946672d48a6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 09:29:43 +0100 Subject: [PATCH 11/38] mm/mseal: small cleanups Drop the wholly unnecessary set_vma_sealed() helper(), which is used only once, and place VMA_ITERATOR() declarations in the correct place. Retain vma_is_sealed(), and use it instead of the confusingly named can_modify_vma(), so it's abundantly clear what's being tested, rather then a nebulous sense of 'can the VMA be modified'. No functional change intended. Link: https://lkml.kernel.org/r/98cf28d04583d632a6eb698e9ad23733bb6af26b.1753431105.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand Acked-by: Jeff Xu Cc: Jann Horn Cc: Kees Cook Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/mseal.c | 9 +-------- mm/vma.c | 4 ++-- mm/vma.h | 20 ++------------------ 6 files changed, 8 insertions(+), 31 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 7f9af2dbd044..35ed4ab0d7c5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1287,7 +1287,7 @@ static bool can_madvise_modify(struct madvise_behavior *madv_behavior) struct vm_area_struct *vma = madv_behavior->vma; /* If the VMA isn't sealed we're good. */ - if (can_modify_vma(vma)) + if (!vma_is_sealed(vma)) return true; /* For a sealed VMA, we only care about discard operations. */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 2ddd37b2f462..78bded7acf79 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -766,7 +766,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long charged = 0; int error; - if (!can_modify_vma(vma)) + if (vma_is_sealed(vma)) return -EPERM; if (newflags == oldflags) { diff --git a/mm/mremap.c b/mm/mremap.c index e15cf2e444c7..ac39845e9718 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1651,7 +1651,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) return -EFAULT; /* If mseal()'d, mremap() is prohibited. */ - if (!can_modify_vma(vma)) + if (vma_is_sealed(vma)) return -EPERM; /* Align to hugetlb page size, if required. */ diff --git a/mm/mseal.c b/mm/mseal.c index 1308e88ab184..adbcc65e9660 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -15,11 +15,6 @@ #include #include "internal.h" -static inline void set_vma_sealed(struct vm_area_struct *vma) -{ - vm_flags_set(vma, VM_SEALED); -} - static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) @@ -36,7 +31,7 @@ static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; } - set_vma_sealed(vma); + vm_flags_set(vma, VM_SEALED); out: *prev = vma; return ret; @@ -53,7 +48,6 @@ static int check_mm_seal(unsigned long start, unsigned long end) { struct vm_area_struct *vma; unsigned long nstart = start; - VMA_ITERATOR(vmi, current->mm, start); /* going through each vma to check. */ @@ -78,7 +72,6 @@ static int apply_mm_seal(unsigned long start, unsigned long end) { unsigned long nstart; struct vm_area_struct *vma, *prev; - VMA_ITERATOR(vmi, current->mm, start); vma = vma_iter_load(&vmi); diff --git a/mm/vma.c b/mm/vma.c index fc502b741dcf..75fd2759964b 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1351,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, } /* Don't bother splitting the VMA if we can't unmap it anyway */ - if (!can_modify_vma(vms->vma)) { + if (vma_is_sealed(vms->vma)) { error = -EPERM; goto start_split_failed; } @@ -1371,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; - if (!can_modify_vma(next)) { + if (vma_is_sealed(next)) { error = -EPERM; goto modify_vma_failed; } diff --git a/mm/vma.h b/mm/vma.h index 85db5e880fcc..b123a9cdedb0 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -559,31 +559,15 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, } #ifdef CONFIG_64BIT - static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } - -/* - * check if a vma is sealed for modification. 
- * return true, if modification is allowed. - */ -static inline bool can_modify_vma(struct vm_area_struct *vma) -{ - if (unlikely(vma_is_sealed(vma))) - return false; - - return true; -} - #else - -static inline bool can_modify_vma(struct vm_area_struct *vma) +static inline bool vma_is_sealed(struct vm_area_struct *vma) { - return true; + return false; } - #endif #if defined(CONFIG_STACK_GROWSUP) From 530e090964130d538dfa74874012ca461ef692fa Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 09:29:44 +0100 Subject: [PATCH 12/38] mm/mseal: simplify and rename VMA gap check The check_mm_seal() function is doing something general - checking whether a range contains only VMAs (or rather that it does NOT contain any unmapped regions). So rename this function to range_contains_unmapped(). Additionally simplify the logic, we are simply checking whether the last vma->vm_end has either a VMA starting after it or ends before the end parameter. This check is rather dubious, so it is sensible to keep it local to mm/mseal.c as at a later stage it may be removed, and we don't want any other mm code to perform such a check. No functional change intended. [lorenzo.stoakes@oracle.com: add comment explaining why we disallow gaps on mseal()] Link: https://lkml.kernel.org/r/d85b3d55-09dc-43ba-8204-b48267a96751@lucifer.local Link: https://lkml.kernel.org/r/dd50984eff1e242b5f7f0f070a3360ef760e06b8.1753431105.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: David Hildenbrand Acked-by: Jeff Xu Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Kees Cook Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mseal.c | 51 ++++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/mm/mseal.c b/mm/mseal.c index adbcc65e9660..d140f569c4c3 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -38,31 +38,40 @@ out: } /* - * Check for do_mseal: - * 1> start is part of a valid vma. - * 2> end is part of a valid vma. - * 3> No gap (unallocated address) between start and end. - * 4> map is sealable. + * mseal() disallows an input range which contain unmapped ranges (VMA holes). + * + * It disallows unmapped regions from start to end whether they exist at the + * start, in the middle, or at the end of the range, or any combination thereof. + * + * This is because after sealng a range, there's nothing to stop memory mapping + * of ranges in the remaining gaps later, meaning that the user might then + * wrongly consider the entirety of the mseal()'d range to be sealed when it + * in fact isn't. */ -static int check_mm_seal(unsigned long start, unsigned long end) + +/* + * Does the [start, end) range contain any unmapped memory? + * + * We ensure that: + * - start is part of a valid VMA. + * - end is part of a valid VMA. + * - no gap (unallocated memory) exists between start and end. + */ +static bool range_contains_unmapped(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct vm_area_struct *vma; - unsigned long nstart = start; + unsigned long prev_end = start; VMA_ITERATOR(vmi, current->mm, start); - /* going through each vma to check. */ for_each_vma_range(vmi, vma, end) { - if (vma->vm_start > nstart) - /* unallocated memory found. 
*/ - return -ENOMEM; + if (vma->vm_start > prev_end) + return true; - if (vma->vm_end >= end) - return 0; - - nstart = vma->vm_end; + prev_end = vma->vm_end; } - return -ENOMEM; + return prev_end < end; } /* @@ -184,14 +193,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - /* - * First pass, this helps to avoid - * partial sealing in case of error in input address range, - * e.g. ENOMEM error. - */ - ret = check_mm_seal(start, end); - if (ret) + if (range_contains_unmapped(mm, start, end)) { + ret = -ENOMEM; goto out; + } /* * Second pass, this should success, unless there are errors From 6c2da14ae1e0a0146587381594559027bd46c059 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 09:29:45 +0100 Subject: [PATCH 13/38] mm/mseal: rework mseal apply logic The logic can be simplified - firstly by renaming the inconsistently named apply_mm_seal() to mseal_apply(). We then wrap mseal_fixup() into the main loop as the logic is simple enough to not require it, equally it isn't a hugely pleasant pattern in mprotect() etc. so it's not something we want to perpetuate. We eliminate the need for invoking vma_iter_end() on each loop by directly determining if the VMA was merged - the only thing we need concern ourselves with is whether the start/end of the (gapless) range are offset into VMAs. This refactoring also avoids the rather horrid 'pass pointer to prev around' pattern used in mprotect() et al. No functional change intended. Link: https://lkml.kernel.org/r/ddfa4376ce29f19a589d7dc8c92cb7d4f7605a4c.1753431105.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Acked-by: David Hildenbrand Acked-by: Jeff Xu Cc: Jann Horn Cc: Kees Cook Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mseal.c | 65 ++++++++++++++++-------------------------------------- 1 file changed, 19 insertions(+), 46 deletions(-) diff --git a/mm/mseal.c b/mm/mseal.c index d140f569c4c3..e5b205562d2e 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -15,28 +15,6 @@ #include #include "internal.h" -static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, - unsigned long end, vm_flags_t newflags) -{ - int ret = 0; - vm_flags_t oldflags = vma->vm_flags; - - if (newflags == oldflags) - goto out; - - vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto out; - } - - vm_flags_set(vma, VM_SEALED); -out: - *prev = vma; - return ret; -} - /* * mseal() disallows an input range which contain unmapped ranges (VMA holes). * @@ -74,38 +52,33 @@ static bool range_contains_unmapped(struct mm_struct *mm, return prev_end < end; } -/* - * Apply sealing. - */ -static int apply_mm_seal(unsigned long start, unsigned long end) +static int mseal_apply(struct mm_struct *mm, + unsigned long start, unsigned long end) { - unsigned long nstart; struct vm_area_struct *vma, *prev; - VMA_ITERATOR(vmi, current->mm, start); + unsigned long curr_start = start; + VMA_ITERATOR(vmi, mm, start); + /* We know there are no gaps so this will be non-NULL. */ vma = vma_iter_load(&vmi); - /* - * Note: check_mm_seal should already checked ENOMEM case. - * so vma should not be null, same for the other ENOMEM cases. 
- */ prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - nstart = start; for_each_vma_range(vmi, vma, end) { - int error; - unsigned long tmp; - vm_flags_t newflags; + unsigned long curr_end = MIN(vma->vm_end, end); - newflags = vma->vm_flags | VM_SEALED; - tmp = vma->vm_end; - if (tmp > end) - tmp = end; - error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); - if (error) - return error; - nstart = vma_iter_end(&vmi); + if (!(vma->vm_flags & VM_SEALED)) { + vma = vma_modify_flags(&vmi, prev, vma, + curr_start, curr_end, + vma->vm_flags | VM_SEALED); + if (IS_ERR(vma)) + return PTR_ERR(vma); + vm_flags_set(vma, VM_SEALED); + } + + prev = vma; + curr_start = curr_end; } return 0; @@ -204,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) * reaching the max supported VMAs, however, those cases shall * be rare. */ - ret = apply_mm_seal(start, end); + ret = mseal_apply(mm, start, end); out: - mmap_write_unlock(current->mm); + mmap_write_unlock(mm); return ret; } From 9109bd52559b44a66e4dbde69d0dd36f3e4dcae8 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Fri, 25 Jul 2025 11:31:12 +0800 Subject: [PATCH 14/38] mm/memory-failure: hold PTL in hwpoison_hugetlb_range Hold PTL in hwpoison_hugetlb_range() to avoid operating on stale page, as hwpoison_pte_range() have done. This change is not known to address any issues which users have experienced. Link: https://lkml.kernel.org/r/20250725033112.2690158-1-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand Cc: Andrei Vagin Cc: Andrii Nakryiko Cc: Baolin Wang Cc: Brahmajit Das Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Rientjes Cc: Dev Jain Cc: Hugh Dickins Cc: Joern Engel Cc: Kefeng Wang Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Ryan Roberts Cc: Thiago Jung Bauermann Signed-off-by: Andrew Morton --- mm/memory-failure.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9e2cff199934..f0f0b23dcf2d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -837,11 +837,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, struct mm_walk *walk) { struct hwpoison_walk *hwp = walk->private; - pte_t pte = huge_ptep_get(walk->mm, addr, ptep); struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t pte; + int ret; - return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), - hwp->pfn, &hwp->tk); + ptl = huge_pte_lock(h, walk->mm, ptep); + pte = huge_ptep_get(walk->mm, addr, ptep); + ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); + spin_unlock(ptl); + return ret; } #else #define hwpoison_hugetlb_range NULL From 1623717b057f904d558eb0489fbd592a18750c1e Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 24 Jul 2025 17:09:58 +0800 Subject: [PATCH 15/38] mm/mincore: hold PTL in mincore_hugetlb Hold PTL in mincore_hugetlb() to avoid operating on stale page, as mincore_pte_range() have done. 
Link: https://lkml.kernel.org/r/20250724090958.455887-4-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand Cc: Andrei Vagin Cc: Andrii Nakryiko Cc: Baolin Wang Cc: Brahmajit Das Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Rientjes Cc: Dev Jain Cc: Hugh Dickins Cc: Joern Engel Cc: Kefeng Wang Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Ryan Roberts Cc: Thiago Jung Bauermann Signed-off-by: Andrew Morton --- mm/mincore.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/mincore.c b/mm/mincore.c index 42d6c9c8da86..10dabefc3acc 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -29,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, #ifdef CONFIG_HUGETLB_PAGE unsigned char present; unsigned char *vec = walk->private; + spinlock_t *ptl; + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); /* * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. @@ -38,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; + spin_unlock(ptl); #else BUG(); #endif From 3dfde97800e06882960cc926d2c428f2128b7c70 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Jul 2025 10:52:59 +0530 Subject: [PATCH 16/38] mm: add get_and_clear_ptes() and clear_ptes() Patch series "Optimizations for khugepaged", v4. If the underlying folio mapped by the ptes is large, we can process those ptes in a batch using folio_pte_batch(). For arm64 specifically, this results in a 16x reduction in the number of ptep_get() calls, since on a contig block, ptep_get() on arm64 will iterate through all 16 entries to collect a/d bits. Next, ptep_clear() will cause a TLBI for every contig block in the range via contpte_try_unfold(). Instead, use clear_ptes() to only do the TLBI at the first and last contig block of the range. For split folios, there will be no pte batching; the batch size returned by folio_pte_batch() will be 1. For pagetable split folios, the ptes will still point to the same large folio; for arm64, this results in the optimization described above, and for other arches, a minor improvement is expected due to a reduction in the number of function calls and batching atomic operations. This patch (of 3): Let's add variants to be used where "full" does not apply -- which will be the majority of cases in the future. "full" really only applies if we are about to tear down a full MM. Use get_and_clear_ptes() in existing code, clear_ptes() users will be added next. 
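
Roughly, the intended call pattern at a converted call site looks like
the fragment below (illustration only, not a complete function; it
assumes kernel context with the page table lock held, as the helpers'
kerneldoc requires):

	/* 'pte' points at the first of up to 'max_nr' entries mapping one
	 * large folio; 'ptent' is the value read from it. */
	nr = folio_pte_batch(folio, pte, ptent, max_nr);

	/* Clear 'nr' entries and collect their dirty/accessed bits in one
	 * go.  We are not tearing down the whole mm here, so the new
	 * non-"full" variant replaces passing full = 0 to
	 * get_and_clear_full_ptes(). */
	ptent = get_and_clear_ptes(mm, addr, pte, nr);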
Link: https://lkml.kernel.org/r/20250724052301.23844-2-dev.jain@arm.com Signed-off-by: David Hildenbrand Signed-off-by: Dev Jain Reviewed-by: Baolin Wang Reviewed-by: Barry Song Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 +- include/linux/pgtable.h | 45 +++++++++++++++++++++++++++++++++++++++++ mm/mremap.c | 2 +- mm/rmap.c | 2 +- 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index abd9725796e9..20a89ab97dc5 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init); pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0); + pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e3b99920be05..4c035637eeb7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, } #endif +/** + * get_and_clear_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of get_and_clear_full_ptes() if it is known that we don't + * need to clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); +} + #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same @@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, } #endif +/** + * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of clear_full_ptes() if it is known that we don't need to + * clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + clear_full_ptes(mm, addr, ptep, nr, 0); +} + /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. 
The other thread diff --git a/mm/mremap.c b/mm/mremap.c index ac39845e9718..677a4d744df9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc, old_pte, max_nr_ptes); force_flush = true; } - pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); + pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes); pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); diff --git a/mm/rmap.c b/mm/rmap.c index f93ce27132ab..568198e9efc2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, flush_cache_range(vma, address, end_addr); /* Nuke the page table entry. */ - pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); + pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. From 4ea3594a47412f9dd20fbda0dc70b0cbec9cba43 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 24 Jul 2025 10:53:00 +0530 Subject: [PATCH 17/38] khugepaged: optimize __collapse_huge_page_copy_succeeded() by PTE batching Use PTE batching to batch process PTEs mapping the same large folio. An improvement is expected due to batching refcount-mapcount manipulation on the folios, and for arm64 which supports contig mappings, the number of TLB flushes is also reduced. Link: https://lkml.kernel.org/r/20250724052301.23844-3-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Cc: Barry Song Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a55fb1dcd224..f23e943506bc 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -700,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { + unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; - pte_t *_pte; pte_t pteval; + pte_t *_pte; + unsigned int nr_ptes; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, + address += nr_ptes * PAGE_SIZE) { + nr_ptes = 1; pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); @@ -722,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct page *src_page = pte_page(pteval); src = page_folio(src_page); - if (!folio_test_large(src)) + + if (folio_test_large(src)) { + unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; + + nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); + } else { release_pte_folio(src); + } + /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). 
*/ spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - folio_remove_rmap_pte(src, src_page, vma); + clear_ptes(vma->vm_mm, address, _pte, nr_ptes); + folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); - free_folio_and_swap_cache(src); + free_swap_cache(src); + folio_put_refs(src, nr_ptes); } } From 22d0229093b92db2fe6ca6ba946bad1f246024e8 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 24 Jul 2025 10:53:01 +0530 Subject: [PATCH 18/38] khugepaged: optimize collapse_pte_mapped_thp() by PTE batching Use PTE batching to batch process PTEs mapping the same large folio. An improvement is expected due to batching mapcount manipulation on the folios, and for arm64 which supports contig mappings, the number of TLB flushes is also reduced. Note that we do not need to make a change to the check "if (folio_page(folio, i) != page)"; if i'th page of the folio is equal to the first page of our batch, then i + 1, .... i + nr_batch_ptes - 1 pages of the folio will be equal to the corresponding pages of our batch mapping consecutive pages. Link: https://lkml.kernel.org/r/20250724052301.23844-4-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Barry Song Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f23e943506bc..374a6a5193a7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1503,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { + int nr_mapped_ptes = 0, result = SCAN_FAIL; + unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; - int nr_ptes = 0, result = SCAN_FAIL; int i; mmap_assert_locked(mm); @@ -1625,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; /* step 2: clear page table and adjust rmap */ - for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; + i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, + pte += nr_batch_ptes) { + unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); + nr_batch_ptes = 1; + if (pte_none(ptent)) continue; /* @@ -1643,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; } page = vm_normal_page(vma, addr, ptent); + if (folio_page(folio, i) != page) goto abort; + nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); + /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ - ptep_clear(mm, addr, pte); - folio_remove_rmap_pte(folio, page, vma); - nr_ptes++; + clear_ptes(mm, addr, pte, nr_batch_ptes); + folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); + nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. 
*/ - if (nr_ptes) { - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + if (nr_mapped_ptes) { + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ @@ -1695,10 +1704,10 @@ maybe_install_pmd: : SCAN_SUCCEED; goto drop_folio; abort: - if (nr_ptes) { + if (nr_mapped_ptes) { flush_tlb_mm(mm); - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) From 9a4f90e246615d1f42a9b907deb9b4c0a418d996 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Jul 2025 15:29:01 +0100 Subject: [PATCH 19/38] mm: remove mm/io-mapping.c This is dead code, which was used from commit b739f125e4eb ("i915: use io_mapping_map_user") but reverted a month later by commit 0e4fe0c9f2f9 ("Revert "i915: use io_mapping_map_user"") back in 2021. Since then nobody has used it, so remove it. [akpm@linux-foundation.org: update Documentation/core-api/mm-api.rst, per Vlastimil] Link: https://lkml.kernel.org/r/20250725142901.81502-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 1 - include/linux/io-mapping.h | 3 --- mm/Kconfig | 4 ---- mm/Makefile | 1 - mm/io-mapping.c | 30 ------------------------------ 5 files changed, 39 deletions(-) delete mode 100644 mm/io-mapping.c diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index af8151db88b2..24970b91ac15 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -139,4 +139,3 @@ More Memory Management Functions .. kernel-doc:: mm/mmu_notifier.c .. kernel-doc:: mm/balloon_compaction.c .. kernel-doc:: mm/huge_memory.c -.. kernel-doc:: mm/io-mapping.c diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 7376c1df9c90..c16353cc6e3c 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap) kfree(iomap); } -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size); - #endif /* _LINUX_IO_MAPPING_H */ diff --git a/mm/Kconfig b/mm/Kconfig index d5d4eca947a6..e443fe8cd6cf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1242,10 +1242,6 @@ config KMAP_LOCAL config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY bool -# struct io_mapping based helper. 
Selected by drivers that need them -config IO_MAPPING - bool - config MEMFD_CREATE bool "Enable memfd_create() system call" if EXPERT diff --git a/mm/Makefile b/mm/Makefile index 1a7a11d4933d..ef54aa615d9d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o -obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/io-mapping.c b/mm/io-mapping.c deleted file mode 100644 index d3586e95c12c..000000000000 --- a/mm/io-mapping.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include -#include - -/** - * io_mapping_map_user - remap an I/O mapping to userspace - * @iomap: the source io_mapping - * @vma: user vma to map to - * @addr: target user address to start at - * @pfn: physical address of kernel memory - * @size: size of map area - * - * Note: this is only safe if the mm semaphore is held when called. - */ -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size) -{ - vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; - - if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) - return -EINVAL; - - pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); - - /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */ - return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); -} -EXPORT_SYMBOL_GPL(io_mapping_map_user); From a222439e1e273fa0f4e37ce17aeb109f3e91824f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 25 Jul 2025 14:16:24 +0200 Subject: [PATCH 20/38] mm/rmap: add anon_vma lifetime debug check If an anon folio is mapped into userspace, its anon_vma must be alive, otherwise rmap walks can hit UAF. There have been syzkaller reports a few months ago[1][2] of UAF in rmap walks that seems to indicate that there can be pages with elevated mapcount whose anon_vma has already been freed, but I think we never figured out what the cause is; and syzkaller only hit these UAFs when memory pressure randomly caused reclaim to rmap-walk the affected pages, so it of course didn't manage to create a reproducer. Add a VM_WARN_ON_FOLIO() when we add/remove mappings of anonymous folios to hopefully catch such issues more reliably. 
[1] https://lore.kernel.org/r/67abaeaf.050a0220.110943.0041.GAE@google.com [2] https://lore.kernel.org/r/67a76f33.050a0220.3d72c.0028.GAE@google.com Link: https://lkml.kernel.org/r/20250725-anonvma-uaf-debug-v2-1-bc3c7e5ba5b1@google.com Signed-off-by: Jann Horn Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Harry Yoo Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/rmap.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 20803fcb49a7..6cd020eea37a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, default: VM_WARN_ON_ONCE(true); } + + /* + * Anon folios must have an associated live anon_vma as long as they're + * mapped into userspace. + * Note that the atomic_read() mainly does two things: + * + * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to + * check that the associated anon_vma has not yet been freed (subject + * to KASAN's usual limitations). This check will pass if the + * anon_vma's refcount has already dropped to 0 but an RCU grace + * period hasn't passed since then. + * 2. If the anon_vma has not yet been freed, it checks that the + * anon_vma still has a nonzero refcount (as opposed to being in the + * middle of an RCU delay for getting freed). + */ + if (folio_test_anon(folio) && !folio_test_ksm(folio)) { + unsigned long mapping = (unsigned long)folio->mapping; + struct anon_vma *anon_vma; + + anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON); + VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio); + } } /* From 9bbffee67ffd16360179327b57f3b1245579ef08 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 28 Jul 2025 10:53:55 -0700 Subject: [PATCH 21/38] mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped By inducing delays in the right places, Jann Horn created a reproducer for a hard to hit UAF issue that became possible after VMAs were allowed to be recycled by adding SLAB_TYPESAFE_BY_RCU to their cache. Race description is borrowed from Jann's discovery report: lock_vma_under_rcu() looks up a VMA locklessly with mas_walk() under rcu_read_lock(). At that point, the VMA may be concurrently freed, and it can be recycled by another process. vma_start_read() then increments the vma->vm_refcnt (if it is in an acceptable range), and if this succeeds, vma_start_read() can return a recycled VMA. In this scenario where the VMA has been recycled, lock_vma_under_rcu() will then detect the mismatching ->vm_mm pointer and drop the VMA through vma_end_read(), which calls vma_refcount_put(). vma_refcount_put() drops the refcount and then calls rcuwait_wake_up() using a copy of vma->vm_mm. This is wrong: It implicitly assumes that the caller is keeping the VMA's mm alive, but in this scenario the caller has no relation to the VMA's mm, so the rcuwait_wake_up() can cause UAF. The diagram depicting the race: T1 T2 T3 == == == lock_vma_under_rcu mas_walk mmap vma_start_read __refcount_inc_not_zero_limited_acquire munmap __vma_enter_locked refcount_add_not_zero vma_end_read vma_refcount_put __refcount_dec_and_test rcuwait_wait_event rcuwait_wake_up [UAF] Note that rcuwait_wait_event() in T3 does not block because refcount was already dropped by T1. At this point T3 can exit and free the mm causing UAF in T1. 
To avoid this we move vma->vm_mm verification into vma_start_read() and grab vma->vm_mm to stabilize it before vma_refcount_put() operation. [surenb@google.com: v3] Link: https://lkml.kernel.org/r/20250729145709.2731370-1-surenb@google.com Link: https://lkml.kernel.org/r/20250728175355.2282375-1-surenb@google.com Fixes: 3104138517fc ("mm: make vma cache SLAB_TYPESAFE_BY_RCU") Signed-off-by: Suren Baghdasaryan Reported-by: Jann Horn Closes: https://lore.kernel.org/all/CAG48ez0-deFbVH=E3jbkWx=X3uVbd8nWeo6kbJPQ0KoUD+m2tA@mail.gmail.com/ Reviewed-by: Vlastimil Babka Acked-by: Lorenzo Stoakes Cc: Jann Horn Cc: Liam Howlett Cc: Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 30 ++++++++++++++++++++++++++++++ mm/mmap_lock.c | 10 +++------- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 1f4f44951abe..11a078de9150 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w); #include #include #include +#include #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), @@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. + * + * WARNING! The vma passed to this function cannot be used if the function + * fails to lock it because in certain cases RCU lock is dropped and then + * reacquired. Once RCU lock is dropped the vma can be concurently freed. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) @@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + if (unlikely(vma->vm_mm != mm)) { + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *other_mm = vma->vm_mm; + + /* + * __mmdrop() is a heavy operation and we don't need RCU + * protection here. Release RCU lock during these operations. + * We reinstate the RCU read lock as the caller expects it to + * be held when this function returns even on error. + */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + rcu_read_lock(); + return NULL; + } + /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 729fb7d0dd59..b006cec8e6fe 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -164,8 +164,7 @@ retry: */ /* Check if the vma we locked is the right one. */ - if (unlikely(vma->vm_mm != mm || - address < vma->vm_start || address >= vma->vm_end)) + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); @@ -236,11 +235,8 @@ retry: goto fallback; } - /* - * Verify the vma we locked belongs to the same address space and it's - * not behind of the last search position. - */ - if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end)) + /* Verify the vma is not behind the last search position. 
*/ + if (unlikely(from_addr >= vma->vm_end)) goto fallback_unlock; /* From fcd90ad31e29d0b403f3a074a64cd7f0876175dd Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:23 +0300 Subject: [PATCH 22/38] execmem: drop unused execmem_update_copy() Patch series "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes", v3. These patches enable use of EXECMEM_ROX_CACHE for ftrace and kprobes allocations on x86. They also include some ground work in execmem. Since the execmem model for caching large ROX pages changed from the initial assumption that the memory that is allocated from ROX cache is always ROX to the current state where memory can be temporarily made RW and then restored to ROX, we can stop using text poking to update it. This also saves the hassle of trying lock text_mutex in execmem_cache_free() when kprobes already hold that mutex. This patch (of 8): The execmem_update_copy() that used text poking was required when memory allocated from ROX cache was always read-only. Since now its permissions can be switched to read-write there is no need in a function that updates memory with text poking. Remove it. Link: https://lkml.kernel.org/r/20250713071730.4117334-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20250713071730.4117334-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/execmem.h | 13 ------------- mm/execmem.c | 5 ----- 2 files changed, 18 deletions(-) diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 3be35680a54f..734fbe83d98e 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -185,19 +185,6 @@ DEFINE_FREE(execmem, void *, if (_T) execmem_free(_T)); struct vm_struct *execmem_vmap(size_t size); #endif -/** - * execmem_update_copy - copy an update to executable memory - * @dst: destination address to update - * @src: source address containing the data - * @size: how many bytes of memory shold be copied - * - * Copy @size bytes from @src to @dst using text poking if the memory at - * @dst is read-only. - * - * Return: a pointer to @dst or NULL on error - */ -void *execmem_update_copy(void *dst, const void *src, size_t size); - /** * execmem_is_rox - check if execmem is read-only * @type - the execmem type to check diff --git a/mm/execmem.c b/mm/execmem.c index 627e6cf64f4f..aac211bc88c5 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -399,11 +399,6 @@ void execmem_free(void *ptr) vfree(ptr); } -void *execmem_update_copy(void *dst, const void *src, size_t size) -{ - return text_poke_copy(dst, src, size); -} - bool execmem_is_rox(enum execmem_type type) { return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); From 838955f64ae7582f009a3538889bb9244f37ab26 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:24 +0300 Subject: [PATCH 23/38] execmem: introduce execmem_alloc_rw() Some callers of execmem_alloc() require the memory to be temporarily writable even when it is allocated from ROX cache. These callers use execemem_make_temp_rw() right after the call to execmem_alloc(). Wrap this sequence in execmem_alloc_rw() API. 
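A hedged before/after sketch of a caller (error handling condensed and the execmem_is_rox() check omitted for brevity; the real conversions are in the diff below):

	/* before: allocate, then force the ROX-cache memory writable */
	ptr = execmem_alloc(EXECMEM_MODULE_TEXT, size);
	if (ptr && execmem_make_temp_rw(ptr, size)) {
		execmem_free(ptr);
		ptr = NULL;
	}

	/* after: one call, the memory comes back writable or not at all */
	ptr = execmem_alloc_rw(EXECMEM_MODULE_TEXT, size);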
Link: https://lkml.kernel.org/r/20250713071730.4117334-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Acked-by: Peter Zijlstra (Intel) Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- arch/x86/kernel/alternative.c | 3 +-- include/linux/execmem.h | 38 ++++++++++++++++++++--------------- kernel/module/main.c | 13 ++---------- mm/execmem.c | 27 ++++++++++++++++++++++++- 4 files changed, 51 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ea1d984166cd..526a5fef93ab 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -120,7 +120,7 @@ struct its_array its_pages; static void *__its_alloc(struct its_array *pages) { - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE); if (!page) return NULL; @@ -237,7 +237,6 @@ static void *its_alloc(void) if (!page) return NULL; - execmem_make_temp_rw(page, PAGE_SIZE); if (pages == &its_pages) set_memory_x((unsigned long)page, 1); diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 734fbe83d98e..8b61b05da7d5 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -67,21 +67,6 @@ enum execmem_range_flags { */ void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); -/** - * execmem_make_temp_rw - temporarily remap region with read-write - * permissions - * @ptr: address of the region to remap - * @size: size of the region to remap - * - * Remaps a part of the cached large page in the ROX cache in the range - * [@ptr, @ptr + @size) as writable and not executable. The caller must - * have exclusive ownership of this range and ensure nothing will try to - * execute code in this range. - * - * Return: 0 on success or negative error code on failure. - */ -int execmem_make_temp_rw(void *ptr, size_t size); - /** * execmem_restore_rox - restore read-only-execute permissions * @ptr: address of the region to remap @@ -95,7 +80,6 @@ int execmem_make_temp_rw(void *ptr, size_t size); */ int execmem_restore_rox(void *ptr, size_t size); #else -static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif @@ -165,6 +149,28 @@ struct execmem_info *execmem_arch_setup(void); */ void *execmem_alloc(enum execmem_type type, size_t size); +/** + * execmem_alloc_rw - allocate writable executable memory + * @type: type of the allocation + * @size: how many bytes of memory are required + * + * Allocates memory that will contain executable code, either generated or + * loaded from kernel modules. + * + * Allocates memory that will contain data coupled with executable code, + * like data sections in kernel modules. + * + * Forces writable permissions on the allocated memory and the caller is + * responsible to manage the permissions afterwards. + * + * For architectures that use ROX cache the permissions will be set to R+W. + * For architectures that don't use ROX cache the default permissions for @type + * will be used as they must be writable. 
+ * + * Return: a pointer to the allocated memory or %NULL + */ +void *execmem_alloc_rw(enum execmem_type type, size_t size); + /** * execmem_free - free executable memory * @ptr: pointer to the memory that should be freed diff --git a/kernel/module/main.c b/kernel/module/main.c index 413ac6ea3702..d009326ef7bb 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1292,20 +1292,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) else execmem_type = EXECMEM_MODULE_TEXT; - ptr = execmem_alloc(execmem_type, size); + ptr = execmem_alloc_rw(execmem_type, size); if (!ptr) return -ENOMEM; - if (execmem_is_rox(execmem_type)) { - int err = execmem_make_temp_rw(ptr, size); - - if (err) { - execmem_free(ptr); - return -ENOMEM; - } - - mod->mem[type].is_rox = true; - } + mod->mem[type].is_rox = execmem_is_rox(execmem_type); /* * The pointer to these blocks of memory are stored on the module diff --git a/mm/execmem.c b/mm/execmem.c index aac211bc88c5..d0bf0123bce4 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -336,7 +336,7 @@ static bool execmem_cache_free(void *ptr) return true; } -int execmem_make_temp_rw(void *ptr, size_t size) +static int execmem_force_rw(void *ptr, size_t size) { unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long addr = (unsigned long)ptr; @@ -358,6 +358,16 @@ int execmem_restore_rox(void *ptr, size_t size) } #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ +/* + * when ROX cache is not used the permissions defined by architectures for + * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must + * be writable anyway + */ +static inline int execmem_force_rw(void *ptr, size_t size) +{ + return 0; +} + static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { return NULL; @@ -387,6 +397,21 @@ void *execmem_alloc(enum execmem_type type, size_t size) return kasan_reset_tag(p); } +void *execmem_alloc_rw(enum execmem_type type, size_t size) +{ + void *p __free(execmem) = execmem_alloc(type, size); + int err; + + if (!p) + return NULL; + + err = execmem_force_rw(p, size); + if (err) + return NULL; + + return no_free_ptr(p); +} + void execmem_free(void *ptr) { /* From 187fd8521dd8b202cbacd7af57f4301da4d5b52d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:25 +0300 Subject: [PATCH 24/38] execmem: rework execmem_cache_free() Currently execmem_cache_free() ignores potential allocation failures that may happen in execmem_cache_add(). Besides, it uses text poking to fill the memory with trapping instructions before returning it to cache although it would be more efficient to make that memory writable, update it using memcpy and then restore ROX protection. Rework execmem_cache_free() so that in case of an error it will defer freeing of the memory to a delayed work. With this the happy fast path will now change permissions to RW, fill the memory with trapping instructions using memcpy, restore ROX permissions, add the memory back to the free cache and clear the relevant entry in busy_areas. If any step in the fast path fails, the entry in busy_areas will be marked as pending_free. These entries will be handled by a delayed work and freed asynchronously. To make the fast path faster, use __GFP_NORETRY for memory allocations and let asynchronous handler try harder with GFP_KERNEL. 
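Condensed, hedged sketch of the resulting free path (locking and maple tree setup elided; the names follow the patch below):

	/*
	 * Fast path: make the area RW, fill it with trapping instructions,
	 * restore ROX and move it from busy_areas back to free_areas.
	 * __GFP_NORETRY keeps this path from stalling under memory pressure.
	 */
	err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY);
	if (err) {
		/*
		 * Slow path: tag the busy_areas entry as pending and let the
		 * delayed work retry later with plain GFP_KERNEL.
		 */
		mas_store_gfp(&mas, pending_free_set(area), GFP_KERNEL);
		execmem_cache.pending_free_cnt++;
		schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
	}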
Link: https://lkml.kernel.org/r/20250713071730.4117334-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/execmem.c | 125 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 23 deletions(-) diff --git a/mm/execmem.c b/mm/execmem.c index d0bf0123bce4..52b06ccf614a 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -93,8 +93,15 @@ struct execmem_cache { struct mutex mutex; struct maple_tree busy_areas; struct maple_tree free_areas; + unsigned int pending_free_cnt; /* protected by mutex */ }; +/* delay to schedule asynchronous free if fast path free fails */ +#define FREE_DELAY (msecs_to_jiffies(10)) + +/* mark entries in busy_areas that should be freed asynchronously */ +#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1)) + static struct execmem_cache execmem_cache = { .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, @@ -155,20 +162,17 @@ static void execmem_cache_clean(struct work_struct *work) static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); -static int execmem_cache_add(void *ptr, size_t size) +static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) { struct maple_tree *free_areas = &execmem_cache.free_areas; - struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, free_areas, addr - 1, addr + 1); unsigned long lower, upper; void *area = NULL; - int err; lower = addr; upper = addr + size - 1; - mutex_lock(mutex); area = mas_walk(&mas); if (area && mas.last == addr - 1) lower = mas.index; @@ -178,12 +182,14 @@ static int execmem_cache_add(void *ptr, size_t size) upper = mas.last; mas_set_range(&mas, lower, upper); - err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); - mutex_unlock(mutex); - if (err) - return err; + return mas_store_gfp(&mas, (void *)lower, gfp_mask); +} - return 0; +static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) +{ + guard(mutex)(&execmem_cache.mutex); + + return execmem_cache_add_locked(ptr, size, gfp_mask); } static bool within_range(struct execmem_range *range, struct ma_state *mas, @@ -278,7 +284,7 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) if (err) goto err_free_mem; - err = execmem_cache_add(p, alloc_size); + err = execmem_cache_add(p, alloc_size, GFP_KERNEL); if (err) goto err_reset_direct_map; @@ -307,29 +313,102 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) return __execmem_cache_alloc(range, size); } +static inline bool is_pending_free(void *ptr) +{ + return ((unsigned long)ptr & PENDING_FREE_MASK); +} + +static inline void *pending_free_set(void *ptr) +{ + return (void *)((unsigned long)ptr | PENDING_FREE_MASK); +} + +static inline void *pending_free_clear(void *ptr) +{ + return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK); +} + +static int execmem_force_rw(void *ptr, size_t size); + +static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) +{ + size_t size = mas_range_len(mas); + int err; + + err = execmem_force_rw(ptr, size); + if (err) + return err; + + execmem_fill_trapping_insns(ptr, size, /* writable = */ true); + execmem_restore_rox(ptr, size); + + err = execmem_cache_add_locked(ptr, size, gfp_mask); + if (err) + return err; + + mas_store_gfp(mas, NULL, gfp_mask); + return 0; +} + +static void 
execmem_cache_free_slow(struct work_struct *work); +static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow); + +static void execmem_cache_free_slow(struct work_struct *work) +{ + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + MA_STATE(mas, busy_areas, 0, ULONG_MAX); + void *area; + + guard(mutex)(&execmem_cache.mutex); + + if (!execmem_cache.pending_free_cnt) + return; + + mas_for_each(&mas, area, ULONG_MAX) { + if (!is_pending_free(area)) + continue; + + area = pending_free_clear(area); + if (__execmem_cache_free(&mas, area, GFP_KERNEL)) + continue; + + execmem_cache.pending_free_cnt--; + } + + if (execmem_cache.pending_free_cnt) + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); + else + schedule_work(&execmem_cache_clean_work); +} + static bool execmem_cache_free(void *ptr) { struct maple_tree *busy_areas = &execmem_cache.busy_areas; - struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, busy_areas, addr, addr); - size_t size; void *area; + int err; + + guard(mutex)(&execmem_cache.mutex); - mutex_lock(mutex); area = mas_walk(&mas); - if (!area) { - mutex_unlock(mutex); + if (!area) return false; + + err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY); + if (err) { + /* + * mas points to exact slot we've got the area from, nothing + * else can modify the tree because of the mutex, so there + * won't be any allocations in mas_store_gfp() and it will just + * change the pointer. + */ + area = pending_free_set(area); + mas_store_gfp(&mas, area, GFP_KERNEL); + execmem_cache.pending_free_cnt++; + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); + return true; } - size = mas_range_len(&mas); - - mas_store_gfp(&mas, NULL, GFP_KERNEL); - mutex_unlock(mutex); - - execmem_fill_trapping_insns(ptr, size, /* writable = */ false); - - execmem_cache_add(ptr, size); schedule_work(&execmem_cache_clean_work); From 888b5a847ba9650f454cd0842ccf8497268da959 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:26 +0300 Subject: [PATCH 25/38] execmem: move execmem_force_rw() and execmem_restore_rox() before use to avoid static declarations. 
Link: https://lkml.kernel.org/r/20250713071730.4117334-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/execmem.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/mm/execmem.c b/mm/execmem.c index 52b06ccf614a..c99b299b113c 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -137,6 +137,27 @@ err_restore: return err; } +static int execmem_force_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + static void execmem_cache_clean(struct work_struct *work) { struct maple_tree *free_areas = &execmem_cache.free_areas; @@ -328,8 +349,6 @@ static inline void *pending_free_clear(void *ptr) return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK); } -static int execmem_force_rw(void *ptr, size_t size); - static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) { size_t size = mas_range_len(mas); @@ -415,27 +434,6 @@ static bool execmem_cache_free(void *ptr) return true; } -static int execmem_force_rw(void *ptr, size_t size) -{ - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long addr = (unsigned long)ptr; - int ret; - - ret = set_memory_nx(addr, nr); - if (ret) - return ret; - - return set_memory_rw(addr, nr); -} - -int execmem_restore_rox(void *ptr, size_t size) -{ - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long addr = (unsigned long)ptr; - - return set_memory_rox(addr, nr); -} - #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ /* * when ROX cache is not used the permissions defined by architectures for From 3bd4e0ac61b2fd87d64572e866f58940d1d5fbdf Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:27 +0300 Subject: [PATCH 26/38] execmem: add fallback for failures in vmalloc(VM_ALLOW_HUGE_VMAP) When execmem populates ROX cache it uses vmalloc(VM_ALLOW_HUGE_VMAP). Although vmalloc falls back to allocating base pages if high order allocation fails, it may happen that it still cannot allocate enough memory. Right now ROX cache is only used by modules and in majority of cases the allocations happen at boot time when there's plenty of free memory, but upcoming enabling ROX cache for ftrace and kprobes would mean that execmem allocations can happen when the system is under memory pressure and a failure to allocate large page worth of memory becomes more likely. Fallback to regular vmalloc() if vmalloc(VM_ALLOW_HUGE_VMAP) fails. 
Link: https://lkml.kernel.org/r/20250713071730.4117334-6-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/execmem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/execmem.c b/mm/execmem.c index c99b299b113c..9abf76a63a79 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -291,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) alloc_size = round_up(size, PMD_SIZE); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); + if (!p) { + alloc_size = size; + p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); + } + if (!p) return err; @@ -462,7 +467,7 @@ void *execmem_alloc(enum execmem_type type, size_t size) bool use_cache = range->flags & EXECMEM_ROX_CACHE; vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; - void *p; + void *p = NULL; size = PAGE_ALIGN(size); From ab674b6871b049aab2e86d1d7375526368ed175a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:28 +0300 Subject: [PATCH 27/38] execmem: drop writable parameter from execmem_fill_trapping_insns() After update of execmem_cache_free() that made memory writable before updating it, there is no need to update read only memory, so the writable parameter to execmem_fill_trapping_insns() is not needed. Drop it. Link: https://lkml.kernel.org/r/20250713071730.4117334-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- arch/x86/mm/init.c | 8 ++------ include/linux/execmem.h | 3 +-- mm/execmem.c | 4 ++-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7456df985d96..dbc63f0d538f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void) static struct execmem_info execmem_info __ro_after_init; #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) +void execmem_fill_trapping_insns(void *ptr, size_t size) { - /* fill memory with INT3 instructions */ - if (writeable) - memset(ptr, INT3_INSN_OPCODE, size); - else - text_poke_set(ptr, INT3_INSN_OPCODE, size); + memset(ptr, INT3_INSN_OPCODE, size); } #endif diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 8b61b05da7d5..7de229134e30 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -60,12 +60,11 @@ enum execmem_range_flags { * will trap * @ptr: pointer to memory to fill * @size: size of the range to fill - * @writable: is the memory poited by @ptr is writable or ROX * * A hook for architecures to fill execmem ranges with invalid instructions. * Architectures that use EXECMEM_ROX_CACHE must implement this. 
*/ -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); +void execmem_fill_trapping_insns(void *ptr, size_t size); /** * execmem_restore_rox - restore read-only-execute permissions diff --git a/mm/execmem.c b/mm/execmem.c index 9abf76a63a79..1785d7f435e4 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -304,7 +304,7 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) goto err_free_mem; /* fill memory with instructions that will trap */ - execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); + execmem_fill_trapping_insns(p, alloc_size); err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) @@ -363,7 +363,7 @@ static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) if (err) return err; - execmem_fill_trapping_insns(ptr, size, /* writable = */ true); + execmem_fill_trapping_insns(ptr, size); execmem_restore_rox(ptr, size); err = execmem_cache_add_locked(ptr, size, gfp_mask); From 36de1e4238c1243866eaec515ef59972c490367f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:29 +0300 Subject: [PATCH 28/38] x86/kprobes: enable EXECMEM_ROX_CACHE for kprobes allocations x86::alloc_insn_page() always allocates ROX memory. Instead of overriding this method, add EXECMEM_KPROBES entry in execmem_info with pgprot set to PAGE_KERNEL_ROX and use ROX cache when configuration and CPU features allow it. Link: https://lkml.kernel.org/r/20250713071730.4117334-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu (Google) Cc: Daniel Gomez Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- arch/x86/kernel/kprobes/core.c | 18 ------------------ arch/x86/mm/init.c | 9 ++++++++- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 47cb8eb138ba..6079d15dab8c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -481,24 +481,6 @@ static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, return len; } -/* Make page to RO mode when allocate it */ -void *alloc_insn_page(void) -{ - void *page; - - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); - if (!page) - return NULL; - - /* - * TODO: Once additional kernel code protection mechanisms are set, ensure - * that the page was not maliciously altered and it is still zeroed. - */ - set_memory_rox((unsigned long)page, 1); - - return page; -} - /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dbc63f0d538f..442fafd8ff52 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1098,7 +1098,14 @@ struct execmem_info __init *execmem_arch_setup(void) .pgprot = pgprot, .alignment = MODULE_ALIGN, }, - [EXECMEM_KPROBES ... EXECMEM_BPF] = { + [EXECMEM_KPROBES] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL_ROX, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_FTRACE ... EXECMEM_BPF] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, From 5d79c2be508143559c65ace445e7a951ef92881b Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:30 +0300 Subject: [PATCH 29/38] x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations For the most part ftrace uses text poking and can handle ROX memory. 
The only place that requires writable memory is create_trampoline() that updates the allocated memory and in the end makes it ROX. Use execmem_alloc_rw() in x86::ftrace::alloc_tramp() and enable ROX cache for EXECMEM_FTRACE when configuration and CPU features allow that. Link: https://lkml.kernel.org/r/20250713071730.4117334-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt (Google) Cc: Daniel Gomez Cc: Masami Hiramatsu (Google) Cc: Petr Pavlu Signed-off-by: Andrew Morton --- arch/x86/kernel/ftrace.c | 2 +- arch/x86/mm/init.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 252e82bcfd2f..4450acec9390 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -263,7 +263,7 @@ void arch_ftrace_update_code(int command) static inline void *alloc_tramp(unsigned long size) { - return execmem_alloc(EXECMEM_FTRACE, size); + return execmem_alloc_rw(EXECMEM_FTRACE, size); } static inline void tramp_free(void *tramp) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 442fafd8ff52..bb57e93b4caf 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1105,7 +1105,14 @@ struct execmem_info __init *execmem_arch_setup(void) .pgprot = PAGE_KERNEL_ROX, .alignment = MODULE_ALIGN, }, - [EXECMEM_FTRACE ... EXECMEM_BPF] = { + [EXECMEM_FTRACE] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = pgprot, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_BPF] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, From 0cfc0e7e3d062b93e9eec6828de000981cdfb152 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:00 +0800 Subject: [PATCH 30/38] mm/shmem, swap: avoid redundant Xarray lookup during swapin Patch series "mm/shmem, swap: bugfix and improvement of mTHP swap in", v6. The current THP swapin path have several problems. It may potentially hang, may cause redundant faults due to false positive swap cache lookup, and it issues redundant Xarray walks. !CONFIG_TRANSPARENT_HUGEPAGE builds may also contain unnecessary THP checks. This series fixes all of the mentioned issues, the code should be more robust and prepared for the swap table series. Now 4 walks is reduced to 3 (get order & confirm, confirm, insert folio), !CONFIG_TRANSPARENT_HUGEPAGE build overhead is also minimized, and comes with a sanity check now. The performance is slightly better after this series, sequential swap in of 24G data from ZRAM, using transparent_hugepage_tmpfs=always (24 samples each): Before: avg: 10.66s, stddev: 0.04 After patch 1: avg: 10.58s, stddev: 0.04 After patch 2: avg: 10.65s, stddev: 0.05 After patch 3: avg: 10.65s, stddev: 0.04 After patch 4: avg: 10.67s, stddev: 0.04 After patch 5: avg: 9.79s, stddev: 0.04 After patch 6: avg: 9.79s, stddev: 0.05 After patch 7: avg: 9.78s, stddev: 0.05 After patch 8: avg: 9.79s, stddev: 0.04 Several patches improve the performance by a little, which is about ~8% faster in total. 
Build kernel test showed very slightly improvement, testing with make -j48 with defconfig in a 768M memcg also using ZRAM as swap, and transparent_hugepage_tmpfs=always (6 test runs): Before: avg: 3334.66s, stddev: 43.76 After patch 1: avg: 3349.77s, stddev: 18.55 After patch 2: avg: 3325.01s, stddev: 42.96 After patch 3: avg: 3354.58s, stddev: 14.62 After patch 4: avg: 3336.24s, stddev: 32.15 After patch 5: avg: 3325.13s, stddev: 22.14 After patch 6: avg: 3285.03s, stddev: 38.95 After patch 7: avg: 3287.32s, stddev: 26.37 After patch 8: avg: 3295.87s, stddev: 46.24 This patch (of 7): Currently shmem calls xa_get_order to get the swap radix entry order, requiring a full tree walk. This can be easily combined with the swap entry value checking (shmem_confirm_swap) to avoid the duplicated lookup and abort early if the entry is gone already. Which should improve the performance. Link: https://lkml.kernel.org/r/20250728075306.12704-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250728075306.12704-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Kemeng Shi Reviewed-by: Dev Jain Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/shmem.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 5e9ec28fab85..6e00d2ec40a9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -512,15 +512,27 @@ static int shmem_replace_entry(struct address_space *mapping, /* * Sometimes, before we decide whether to proceed or to fail, we must check - * that an entry was not already brought back from swap by a racing thread. + * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. + * Returns the swap entry's order if it still presents, else returns -1. */ -static bool shmem_confirm_swap(struct address_space *mapping, - pgoff_t index, swp_entry_t swap) +static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, + swp_entry_t swap) { - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); + XA_STATE(xas, &mapping->i_pages, index); + int ret = -1; + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (entry == swp_to_radix_entry(swap)) + ret = xas_get_order(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + return ret; } /* @@ -2293,16 +2305,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EIO; si = get_swap_device(swap); - if (!si) { - if (!shmem_confirm_swap(mapping, index, swap)) + order = shmem_confirm_swap(mapping, index, swap); + if (unlikely(!si)) { + if (order < 0) return -EEXIST; else return -EINVAL; } + if (unlikely(order < 0)) { + put_swap_device(si); + return -EEXIST; + } /* Look it up and read it in.. 
*/ folio = swap_cache_get_folio(swap, NULL, 0); - order = xa_get_order(&mapping->i_pages, index); if (!folio) { int nr_pages = 1 << order; bool fallback_order0 = false; @@ -2412,7 +2428,7 @@ alloced: */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || - !shmem_confirm_swap(mapping, index, swap) || + shmem_confirm_swap(mapping, index, swap) < 0 || folio->swap.val != swap.val) { error = -EEXIST; goto unlock; @@ -2460,7 +2476,7 @@ alloced: *foliop = folio; return 0; failed: - if (!shmem_confirm_swap(mapping, index, swap)) + if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap, From c262ffd72c8539d16ada8641a6348c5a88f0c542 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:01 +0800 Subject: [PATCH 31/38] mm/shmem, swap: tidy up THP swapin checks Move all THP swapin related checks under CONFIG_TRANSPARENT_HUGEPAGE, so they will be trimmed off by the compiler if not needed. And add a WARN if shmem sees a order > 0 entry when CONFIG_TRANSPARENT_HUGEPAGE is disabled, that should never happen unless things went very wrong. There should be no observable feature change except the new added WARN. Link: https://lkml.kernel.org/r/20250728075306.12704-4-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/shmem.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 6e00d2ec40a9..88058d53ae55 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2017,26 +2017,38 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); + int nr_pages = 1 << order; struct folio *new; void *shadow; - int nr_pages; /* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { - gfp_t huge_gfp = vma_thp_gfp_mask(vma); + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (WARN_ON_ONCE(order)) + return ERR_PTR(-EINVAL); + } else if (order) { + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(entry, nr_pages) != nr_pages) + return ERR_PTR(-EINVAL); - gfp = limit_gfp_mask(huge_gfp, gfp); + gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } new = shmem_alloc_folio(gfp, order, info, index); if (!new) return ERR_PTR(-ENOMEM); - nr_pages = folio_nr_pages(new); if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, gfp, entry)) { folio_put(new); @@ -2320,9 +2332,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { - int nr_pages = 1 << order; - bool fallback_order0 = false; - /* Or update major stats only when swapin succeeds?? 
*/ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -2330,20 +2339,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } - /* - * If uffd is active for the vma, we need per-page fault - * fidelity to maintain the uffd semantics, then fallback - * to swapin order-0 folio, as well as for zswap case. - * Any existing sub folio in the swap cache also blocks - * mTHP swapin. - */ - if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled() || - non_swapcache_batch(swap, nr_pages) != nr_pages)) - fallback_order0 = true; - /* Skip swapcache for synchronous device. */ - if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); if (!IS_ERR(folio)) { skip_swapcache = true; From 91ab656ece137c368a3189dfd42f8c9203a6285c Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:02 +0800 Subject: [PATCH 32/38] mm/shmem, swap: tidy up swap entry splitting Instead of keeping different paths for splitting the entry before the swapin starts, move the entry splitting to after the swapin has put the folio in the swap cache (or set the SWAP_HAS_CACHE bit). This way we only need one place and one unified way to split the large entry. Whenever swapin brings in a folio smaller than the shmem swap entry, split the entry and recalculate the entry and index for verification. This removes duplicated code and function calls, reduces LOC, and makes the split less racy, as it is now guarded by the swap cache, so there is a lower chance of repeated faults due to a raced split. The compiler is also able to optimize the code further: bloat-o-meter results with GCC 14: With DEBUG_SECTION_MISMATCH (-fno-inline-functions-called-once): ./scripts/bloat-o-meter mm/shmem.o.old mm/shmem.o add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-143 (-143) Function old new delta shmem_swapin_folio 2358 2215 -143 Total: Before=32933, After=32790, chg -0.43% With !DEBUG_SECTION_MISMATCH: add/remove: 0/1 grow/shrink: 1/0 up/down: 1069/-749 (320) Function old new delta shmem_swapin_folio 2871 3940 +1069 shmem_split_large_entry.isra 749 - -749 Total: Before=32806, After=33126, chg +0.98% Since shmem_split_large_entry is only called in one place now, the compiler will either generate more compact code or inline it for better performance. Link: https://lkml.kernel.org/r/20250728075306.12704-5-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/shmem.c | 56 ++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 88058d53ae55..5e10e27e8b73 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2303,14 +2303,16 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? 
vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swap, index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; - swp_entry_t swap; int error, nr_pages, order, split_order; + pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); - swap = radix_to_swp_entry(*foliop); + index_entry = radix_to_swp_entry(*foliop); + swap = index_entry; *foliop = NULL; if (is_poisoned_swp_entry(swap)) @@ -2358,46 +2360,35 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* - * Now swap device can only swap in order 0 folio, then we - * should split the large swap entry stored in the pagecache - * if necessary. - */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); - if (split_order < 0) { - error = split_order; - goto failed; - } - - /* - * If the large swap entry has already been split, it is + * Now swap device can only swap in order 0 folio, it is * necessary to recalculate the new swap entry based on - * the old order alignment. + * the offset, as the swapin index might be unalgined. */ - if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); - + if (order) { + offset = index - round_down(index, 1 << order); swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } - /* Here we actually start the io */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; } - } else if (order > folio_order(folio)) { + } +alloced: + if (order > folio_order(folio)) { /* - * Swap readahead may swap in order 0 folios into swapcache + * Swapin may get smaller folios due to various reasons: + * It may fallback to order 0 due to memory pressure or race, + * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); + split_order = shmem_split_large_entry(inode, index, index_entry, gfp); if (split_order < 0) { - folio_put(folio); - folio = NULL; error = split_order; - goto failed; + goto failed_nolock; } /* @@ -2406,16 +2397,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * the old order alignment. */ if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); - - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + offset = index - round_down(index, 1 << split_order); + swap = swp_entry(swp_type(swap), swp_offset(index_entry) + offset); } } else if (order < folio_order(folio)) { swap.val = round_down(swap.val, 1 << folio_order(folio)); index = round_down(index, 1 << folio_order(folio)); } -alloced: /* * We have to do this with the folio locked to prevent races. 
* The shmem_confirm_swap below only checks if the first swap @@ -2479,12 +2468,13 @@ failed: shmem_set_folio_swapin_error(inode, index, folio, swap, skip_swapcache); unlock: - if (skip_swapcache) - swapcache_clear(si, swap, folio_nr_pages(folio)); - if (folio) { + if (folio) folio_unlock(folio); +failed_nolock: + if (skip_swapcache) + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); + if (folio) folio_put(folio); - } put_swap_device(si); return error; From 69805ea79db6634d4e7d596f3f36667924dc6cbf Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:03 +0800 Subject: [PATCH 33/38] mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO For SWP_SYNCHRONOUS_IO devices, if a cache bypassing THP swapin failed due to reasons like memory pressure, partially conflicting swap cache or ZSWAP enabled, shmem will fallback to cached order 0 swapin. Right now the swap cache still has a non-trivial overhead, and readahead is not helpful for SWP_SYNCHRONOUS_IO devices, so we should always skip the readahead and swap cache even if the swapin falls back to order 0. So handle the fallback logic without falling back to the cached read. Link: https://lkml.kernel.org/r/20250728075306.12704-6-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/shmem.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 5e10e27e8b73..e6207ad6f2f1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2019,6 +2019,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); int nr_pages = 1 << order; struct folio *new; + gfp_t alloc_gfp; void *shadow; /* @@ -2026,6 +2027,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, * limit chance of success with further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; + alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); @@ -2040,19 +2042,22 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, if ((vma && unlikely(userfaultfd_armed(vma))) || !zswap_never_enabled() || non_swapcache_batch(entry, nr_pages) != nr_pages) - return ERR_PTR(-EINVAL); + goto fallback; - gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + } +retry: + new = shmem_alloc_folio(alloc_gfp, order, info, index); + if (!new) { + new = ERR_PTR(-ENOMEM); + goto fallback; } - new = shmem_alloc_folio(gfp, order, info, index); - if (!new) - return ERR_PTR(-ENOMEM); - if (mem_cgroup_swapin_charge_folio(new, vma ? 
vma->vm_mm : NULL, - gfp, entry)) { + alloc_gfp, entry)) { folio_put(new); - return ERR_PTR(-ENOMEM); + new = ERR_PTR(-ENOMEM); + goto fallback; } /* @@ -2067,7 +2072,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, */ if (swapcache_prepare(entry, nr_pages)) { folio_put(new); - return ERR_PTR(-EEXIST); + new = ERR_PTR(-EEXIST); + /* Try smaller folio to avoid cache conflict */ + goto fallback; } __folio_set_locked(new); @@ -2081,6 +2088,15 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, folio_add_lru(new); swap_read_folio(new, NULL); return new; +fallback: + /* Order 0 swapin failed, nothing to fallback to, abort */ + if (!order) + return new; + entry.val += index - round_down(index, nr_pages); + alloc_gfp = gfp; + nr_pages = 1; + order = 0; + goto retry; } /* @@ -2350,13 +2366,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* - * Fallback to swapin order-0 folio unless the swap entry - * already exists. + * Direct swapin handled order 0 fallback already, + * if it failed, abort. */ error = PTR_ERR(folio); folio = NULL; - if (error == -EEXIST) - goto failed; + goto failed; } /* From 1326359f22805b2b0e9567ec0099980b8956fc29 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:04 +0800 Subject: [PATCH 34/38] mm/shmem, swap: simplify swapin path and result handling Slightly tidy up the different handling of swap in and error handling for SWP_SYNCHRONOUS_IO and non-SWP_SYNCHRONOUS_IO devices. Now swapin will always use either shmem_swap_alloc_folio or shmem_swapin_cluster, then check the result. Simplify the control flow and avoid a redundant goto label. Link: https://lkml.kernel.org/r/20250728075306.12704-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/shmem.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e6207ad6f2f1..0de37d014524 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2357,40 +2357,33 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } - /* Skip swapcache for synchronous device. */ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); - if (!IS_ERR(folio)) { - skip_swapcache = true; - goto alloced; + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + folio = NULL; + goto failed; + } + skip_swapcache = true; + } else { + /* + * Cached swapin only supports order 0 folio, it is + * necessary to recalculate the new swap entry based on + * the offset, as the swapin index might be unalgined. + */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } - /* - * Direct swapin handled order 0 fallback already, - * if it failed, abort. - */ - error = PTR_ERR(folio); - folio = NULL; - goto failed; - } - - /* - * Now swap device can only swap in order 0 folio, it is - * necessary to recalculate the new swap entry based on - * the offset, as the swapin index might be unalgined. 
- */ - if (order) { - offset = index - round_down(index, 1 << order); - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } - - folio = shmem_swapin_cluster(swap, gfp, info, index); - if (!folio) { - error = -ENOMEM; - goto failed; + folio = shmem_swapin_cluster(swap, gfp, info, index); + if (!folio) { + error = -ENOMEM; + goto failed; + } } } -alloced: if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: From 93c0476e705768c7ca902cffea4efb500b9678b4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:05 +0800 Subject: [PATCH 35/38] mm/shmem, swap: rework swap entry and index calculation for large swapin Instead of calculating the swap entry differently in different swapin paths, calculate it early before the swap cache lookup and use that for the lookup and later swapin. And after swapin has brought in a folio, simply round the entry and index down to the size of the folio. This is simple and effective enough to verify the swap value. A folio's swap entry is always aligned to its size. Any kind of parallel split or race is acceptable because the final shmem_add_to_page_cache ensures that all entries covered by the folio are correct, and thus there will be no data corruption. This also prevents false positive cache lookups. If a shmem read request's index points to the middle of a large swap entry, previously, shmem would try the swap cache lookup using the large swap entry's starting value (which is the first sub swap entry of this large entry). This leads to false positive lookup results if only the first few swap entries are cached but the actual swap entry pointed to by the index is uncached. This is not a rare event, as swap readahead always tries to cache order 0 folios when possible. And this shouldn't cause any increased repeated faults. Instead, no matter how the shmem mapping is split in parallel, as long as the mapping still contains the right entries, the swapin will succeed. The final object size and stack usage are also reduced due to simplified code: ./scripts/bloat-o-meter mm/shmem.o.old mm/shmem.o add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-145 (-145) Function old new delta shmem_swapin_folio 4056 3911 -145 Total: Before=33242, After=33097, chg -0.44% Stack usage (Before vs After): mm/shmem.c:2314:12:shmem_swapin_folio 264 static mm/shmem.c:2314:12:shmem_swapin_folio 256 static And while at it, round down the index too if the swap entry is rounded down. The index is used either for folio reallocation or confirming the mapping content. In either case, it should be aligned with the swap folio. 
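As a rough sketch of the resulting alignment rule (illustrative only; the helper below is hypothetical and simply mirrors the rounding this patch adds to shmem_swapin_folio()):

	static void shmem_align_swapin(swp_entry_t *swap, pgoff_t *index,
				       struct folio *folio)
	{
		long nr_pages = folio_nr_pages(folio);

		/*
		 * A large folio's swap entry is always aligned to its size,
		 * so once the folio is known, both the swap value and the
		 * mapping index can simply be rounded down to the folio
		 * boundary before verification and insertion.
		 */
		if (nr_pages > 1) {
			swap->val = round_down(swap->val, nr_pages);
			*index = round_down(*index, nr_pages);
		}
	}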
Link: https://lkml.kernel.org/r/20250728075306.12704-8-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/shmem.c | 67 +++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0de37d014524..33d30ee5bc84 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2302,7 +2302,7 @@ unlock: if (xas_error(&xas)) return xas_error(&xas); - return entry_order; + return 0; } /* @@ -2323,7 +2323,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; - int error, nr_pages, order, split_order; + int error, nr_pages, order; pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); @@ -2331,11 +2331,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(swap)) + if (is_poisoned_swp_entry(index_entry)) return -EIO; - si = get_swap_device(swap); - order = shmem_confirm_swap(mapping, index, swap); + si = get_swap_device(index_entry); + order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; @@ -2347,6 +2347,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EEXIST; } + /* index may point to the middle of a large entry, get the sub entry */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { @@ -2359,7 +2365,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ - folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + folio = shmem_swap_alloc_folio(inode, vma, index, + index_entry, order, gfp); if (IS_ERR(folio)) { error = PTR_ERR(folio); folio = NULL; @@ -2367,16 +2374,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } skip_swapcache = true; } else { - /* - * Cached swapin only supports order 0 folio, it is - * necessary to recalculate the new swap entry based on - * the offset, as the swapin index might be unalgined. - */ - if (order) { - offset = index - round_down(index, 1 << order); - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } - + /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; @@ -2384,6 +2382,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } } } + if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: @@ -2393,24 +2392,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. 
*/ - split_order = shmem_split_large_entry(inode, index, index_entry, gfp); - if (split_order < 0) { - error = split_order; + error = shmem_split_large_entry(inode, index, index_entry, gfp); + if (error) goto failed_nolock; - } + } - /* - * If the large swap entry has already been split, it is - * necessary to recalculate the new swap entry based on - * the old order alignment. - */ - if (split_order > 0) { - offset = index - round_down(index, 1 << split_order); - swap = swp_entry(swp_type(swap), swp_offset(index_entry) + offset); - } - } else if (order < folio_order(folio)) { - swap.val = round_down(swap.val, 1 << folio_order(folio)); - index = round_down(index, 1 << folio_order(folio)); + /* + * If the folio is large, round down swap and index by folio size. + * No matter what race occurs, the swap layer ensures we either get + * a valid folio that has its swap entry aligned by size, or a + * temporarily invalid one which we'll abort very soon and retry. + * + * shmem_add_to_page_cache ensures the whole range contains expected + * entries and prevents any corruption, so any race split is fine + * too, it will succeed as long as the entries are still there. + */ + nr_pages = folio_nr_pages(folio); + if (nr_pages > 1) { + swap.val = round_down(swap.val, nr_pages); + index = round_down(index, nr_pages); } /* @@ -2446,8 +2446,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } - error = shmem_add_to_page_cache(folio, mapping, - round_down(index, nr_pages), + error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; From de55be42379cc0561aadfd9e1459239dea70be32 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:06 +0800 Subject: [PATCH 36/38] mm/shmem, swap: fix major fault counting If the swapin failed, don't update the major fault count. There is a long existing comment for doing it this way, now with previous cleanups, we can finally fix it. Link: https://lkml.kernel.org/r/20250728075306.12704-9-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/shmem.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 33d30ee5bc84..e1e5d5f7f58d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2356,13 +2356,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { - /* Or update major stats only when swapin succeeds?? 
*/ - if (fault_type) { - *fault_type |= VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(fault_mm, PGMAJFAULT); - } - if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, @@ -2381,6 +2374,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } } + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(fault_mm, PGMAJFAULT); + } } if (order > folio_order(folio)) { From f04fd85f15945f3ff189701050e3ce303c1a4d98 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 29 Jul 2025 12:49:06 +0100 Subject: [PATCH 37/38] mm: correct type for vmalloc vm_flags fields Several functions refer to the unfortunately named 'vm_flags' field when referencing vmalloc flags, which happens to be the precise same name used for VMA flags. As a result these were erroneously changed to use the vm_flags_t type (which currently is a typedef equivalent to unsigned long). Currently this has no impact, but in future when vm_flags_t changes this will result in issues, so change the type to unsigned long to account for this. [lorenzo.stoakes@oracle.com: fixup very disguised vmalloc flags parameter] Link: https://lkml.kernel.org/r/e74dd8de-7e60-47ab-8a45-2c851f3c5d26@lucifer.local Link: https://lkml.kernel.org/r/20250729114906.55347-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reported-by: Harry Yoo Closes: https://lore.kernel.org/all/aIgSpAnU8EaIcqd9@hyeyoo/ Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand Reviewed-by: Harry Yoo Acked-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 +- mm/execmem.c | 8 ++++---- mm/internal.h | 2 +- mm/nommu.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 20a89ab97dc5..34e5d78af076 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -721,7 +721,7 @@ void mark_rodata_ro(void) static void __init declare_vma(struct vm_struct *vma, void *va_start, void *va_end, - vm_flags_t vm_flags) + unsigned long vm_flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; diff --git a/mm/execmem.c b/mm/execmem.c index 1785d7f435e4..0822305413ec 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, vm_flags_t vm_flags) + pgprot_t pgprot, unsigned long vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; @@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size) } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, vm_flags_t vm_flags) + pgprot_t pgprot, unsigned long vm_flags) { return vmalloc(size); } @@ -283,7 +283,7 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { - vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; + unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -465,7 +465,7 @@ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; - 
vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; void *p = NULL; diff --git a/mm/internal.h b/mm/internal.h index 28d2d5b051df..142d9302c2ae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1391,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, - vm_flags_t vm_flags, unsigned long start, + unsigned long vm_flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); diff --git a/mm/nommu.c b/mm/nommu.c index 87e1acab0d64..07504d666d6a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, vm_flags_t vm_flags, int node, + pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { return __vmalloc_noprof(size, gfp_mask); From a2152fef29020e740ba0276930f3a24440012505 Mon Sep 17 00:00:00 2001 From: Yadan Fan Date: Fri, 1 Aug 2025 02:14:45 +0800 Subject: [PATCH 38/38] mm: mempool: fix crash in mempool_free() for zero-minimum pools The mempool wake-up fix introduced in commit a5867a218d7c ("mm: mempool: fix wake-up edge case bug for zero-minimum pools") inlined the add_element() logic in mempool_free() to return the element to the zero-minimum pool: pool->elements[pool->curr_nr++] = element; This causes a crash, because mempool_init_node() does not do a real allocation for a zero-minimum pool: it only assigns ZERO_SIZE_PTR to the elements array, which cannot be dereferenced, and the pre-allocation of this array never happens, since the while test: while (pool->curr_nr < pool->min_nr) can never be satisfied when min_nr is zero. So the pool does not actually reserve any buffer; the only way to get one is to call alloc_fn(), but if memory is under high pressure alloc_fn() may never get any buffer, and the waiting thread ends up in an indefinite wake-sleep loop until there is free memory to get. This patch changes mempool_init_node() to allocate 1 element for the elements array of a zero-minimum pool, so that the pool has a reserved buffer to use. This fixes the crash and lets the waiting thread get the reserved element when alloc_fn() fails to get a buffer under high memory pressure. Also modify add_element() to support zero-minimum pools, simplifying the zero-minimum handling in mempool_free(). 
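For reference, a minimal usage sketch of the case being fixed (illustrative only; "my_cache" is a placeholder slab cache, not part of this patch). A pool created with min_nr == 0 previously had no backing storage at all, while with this change it always keeps one reserved element:

	/*
	 * Zero-minimum pool: mempool_init_node() now sizes the elements
	 * array with max(1, min_nr) and pre-allocates one element, so
	 * mempool_free() can always park an element in pool->elements[]
	 * and wake any waiter.
	 */
	mempool_t *pool = mempool_create_node(0, mempool_alloc_slab,
					      mempool_free_slab, my_cache,
					      GFP_KERNEL, NUMA_NO_NODE);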
Link: https://lkml.kernel.org/r/e01f00f3-58d9-4ca7-af54-bfa42fec9527@suse.com Fixes: a5867a218d7c ("mm: mempool: fix wake-up edge case bug for zero-minimum pools") Signed-off-by: Yadan Fan Signed-off-by: Andrew Morton --- mm/mempool.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 204a216b6418..1c38e873e546 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) static __always_inline void add_element(mempool_t *pool, void *element) { - BUG_ON(pool->curr_nr >= pool->min_nr); + BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); if (kasan_poison_element(pool, element)) pool->elements[pool->curr_nr++] = element; @@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, pool->alloc = alloc_fn; pool->free = free_fn; init_waitqueue_head(&pool->wait); - - pool->elements = kmalloc_array_node(min_nr, sizeof(void *), + /* + * max() used here to ensure storage for at least 1 element to support + * zero minimum pool + */ + pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *), gfp_mask, node_id); if (!pool->elements) return -ENOMEM; /* - * First pre-allocate the guaranteed number of buffers. + * First pre-allocate the guaranteed number of buffers, + * also pre-allocate 1 element for zero minimum pool. */ - while (pool->curr_nr < pool->min_nr) { + while (pool->curr_nr < max(1, pool->min_nr)) { void *element; element = pool->alloc(gfp_mask, pool->pool_data); @@ -555,20 +559,12 @@ void mempool_free(void *element, mempool_t *pool) * wake-up path of previous test. This explicit check ensures the * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. - * - * Inline the same logic as previous test, add_element() cannot be - * directly used here since it has BUG_ON to deny if min_nr equals - * curr_nr, so here picked rest of add_element() to use without - * BUG_ON check. */ if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { - /* Inline the logic of add_element() */ - poison_element(pool, element); - if (kasan_poison_element(pool, element)) - pool->elements[pool->curr_nr++] = element; + add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); if (wq_has_sleeper(&pool->wait)) wake_up(&pool->wait);