mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-21 23:16:50 +08:00
A dirty folio is one which has been written to. A clean folio is its
opposite. Since a clean folio has no user data, it can be freed under
memory pressure.
memfd preservation with LUO saves the dirty flag at preserve(). This is
problematic. The folio might get dirtied later. Saving it at freeze()
also doesn't work, since the dirty bit from PTE is normally synced at
unmap and there might still be mappings of the file at freeze().
To see why this is a problem, say a folio is clean at preserve, but gets
dirtied later. The serialized state of the folio will mark it as clean.
After retrieve, the next kernel will see the folio as clean and might try
to reclaim it under memory pressure. This will result in losing user
data.
Mark all folios of the file as dirty, and always set the
MEMFD_LUO_FOLIO_DIRTY flag. This comes with the side effect of making all
clean folios un-reclaimable. This is a cost that has to be paid for
participants of live update. It is not expected to be a common use case
to preserve a lot of clean folios anyway.
Since the value of pfolio->flags is a constant now, drop the flags
variable and set it directly.
Link: https://lkml.kernel.org/r/20260223173931.2221759-3-pratyush@kernel.org
Fixes: b3749f174d ("mm: memfd_luo: allow preserving memfd")
Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
561 lines
14 KiB
C
561 lines
14 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
* Copyright (c) 2025, Google LLC.
|
|
* Pasha Tatashin <pasha.tatashin@soleen.com>
|
|
*
|
|
* Copyright (C) 2025 Amazon.com Inc. or its affiliates.
|
|
* Pratyush Yadav <ptyadav@amazon.de>
|
|
*/
|
|
|
|
/**
|
|
* DOC: Memfd Preservation via LUO
|
|
*
|
|
* Overview
|
|
* ========
|
|
*
|
|
* Memory file descriptors (memfd) can be preserved over a kexec using the Live
|
|
* Update Orchestrator (LUO) file preservation. This allows userspace to
|
|
* transfer its memory contents to the next kernel after a kexec.
|
|
*
|
|
* The preservation is not intended to be transparent. Only select properties of
|
|
* the file are preserved. All others are reset to default. The preserved
|
|
* properties are described below.
|
|
*
|
|
* .. note::
|
|
* The LUO API is not stabilized yet, so the preserved properties of a memfd
|
|
* are also not stable and are subject to backwards incompatible changes.
|
|
*
|
|
* .. note::
|
|
* Currently a memfd backed by Hugetlb is not supported. Memfds created
|
|
* with ``MFD_HUGETLB`` will be rejected.
|
|
*
|
|
* Preserved Properties
|
|
* ====================
|
|
*
|
|
* The following properties of the memfd are preserved across kexec:
|
|
*
|
|
* File Contents
|
|
* All data stored in the file is preserved.
|
|
*
|
|
* File Size
|
|
* The size of the file is preserved. Holes in the file are filled by
|
|
* allocating pages for them during preservation.
|
|
*
|
|
* File Position
|
|
* The current file position is preserved, allowing applications to continue
|
|
* reading/writing from their last position.
|
|
*
|
|
* File Status Flags
|
|
* memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
|
|
* is maintained.
|
|
*
|
|
* Non-Preserved Properties
|
|
* ========================
|
|
*
|
|
* All properties which are not preserved must be assumed to be reset to
|
|
* default. This section describes some of those properties which may be more of
|
|
* note.
|
|
*
|
|
* ``FD_CLOEXEC`` flag
|
|
* A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
|
|
* ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
|
|
* again after restore via ``fcntl()``.
|
|
*
|
|
* Seals
|
|
* File seals are not preserved. The file is unsealed on restore and if
|
|
* needed, must be sealed again via ``fcntl()``.
|
|
*/
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
#include <linux/bits.h>
|
|
#include <linux/err.h>
|
|
#include <linux/file.h>
|
|
#include <linux/io.h>
|
|
#include <linux/kexec_handover.h>
|
|
#include <linux/kho/abi/memfd.h>
|
|
#include <linux/liveupdate.h>
|
|
#include <linux/shmem_fs.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/memfd.h>
|
|
#include "internal.h"
|
|
|
|
static int memfd_luo_preserve_folios(struct file *file,
|
|
struct kho_vmalloc *kho_vmalloc,
|
|
struct memfd_luo_folio_ser **out_folios_ser,
|
|
u64 *nr_foliosp)
|
|
{
|
|
struct inode *inode = file_inode(file);
|
|
struct memfd_luo_folio_ser *folios_ser;
|
|
unsigned int max_folios;
|
|
long i, size, nr_pinned;
|
|
struct folio **folios;
|
|
int err = -EINVAL;
|
|
pgoff_t offset;
|
|
u64 nr_folios;
|
|
|
|
size = i_size_read(inode);
|
|
/*
|
|
* If the file has zero size, then the folios and nr_folios properties
|
|
* are not set.
|
|
*/
|
|
if (!size) {
|
|
*nr_foliosp = 0;
|
|
*out_folios_ser = NULL;
|
|
memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Guess the number of folios based on inode size. Real number might end
|
|
* up being smaller if there are higher order folios.
|
|
*/
|
|
max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
|
|
folios = kvmalloc_objs(*folios, max_folios);
|
|
if (!folios)
|
|
return -ENOMEM;
|
|
|
|
/*
|
|
* Pin the folios so they don't move around behind our back. This also
|
|
* ensures none of the folios are in CMA -- which ensures they don't
|
|
* fall in KHO scratch memory. It also moves swapped out folios back to
|
|
* memory.
|
|
*
|
|
* A side effect of doing this is that it allocates a folio for all
|
|
* indices in the file. This might waste memory on sparse memfds. If
|
|
* that is really a problem in the future, we can have a
|
|
* memfd_pin_folios() variant that does not allocate a page on empty
|
|
* slots.
|
|
*/
|
|
nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
|
|
&offset);
|
|
if (nr_pinned < 0) {
|
|
err = nr_pinned;
|
|
pr_err("failed to pin folios: %d\n", err);
|
|
goto err_free_folios;
|
|
}
|
|
nr_folios = nr_pinned;
|
|
|
|
folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
|
|
if (!folios_ser) {
|
|
err = -ENOMEM;
|
|
goto err_unpin;
|
|
}
|
|
|
|
for (i = 0; i < nr_folios; i++) {
|
|
struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
|
|
struct folio *folio = folios[i];
|
|
|
|
err = kho_preserve_folio(folio);
|
|
if (err)
|
|
goto err_unpreserve;
|
|
|
|
folio_lock(folio);
|
|
|
|
/*
|
|
* A dirty folio is one which has been written to. A clean folio
|
|
* is its opposite. Since a clean folio does not carry user
|
|
* data, it can be freed by page reclaim under memory pressure.
|
|
*
|
|
* Saving the dirty flag at prepare() time doesn't work since it
|
|
* can change later. Saving it at freeze() also won't work
|
|
* because the dirty bit is normally synced at unmap and there
|
|
* might still be a mapping of the file at freeze().
|
|
*
|
|
* To see why this is a problem, say a folio is clean at
|
|
* preserve, but gets dirtied later. The pfolio flags will mark
|
|
* it as clean. After retrieve, the next kernel might try to
|
|
* reclaim this folio under memory pressure, losing user data.
|
|
*
|
|
* Unconditionally mark it dirty to avoid this problem. This
|
|
* comes at the cost of making clean folios un-reclaimable after
|
|
* live update.
|
|
*/
|
|
folio_mark_dirty(folio);
|
|
|
|
/*
|
|
* If the folio is not uptodate, it was fallocated but never
|
|
* used. Saving this flag at prepare() doesn't work since it
|
|
* might change later when someone uses the folio.
|
|
*
|
|
* Since we have taken the performance penalty of allocating,
|
|
* zeroing, and pinning all the folios in the holes, take a bit
|
|
* more and zero all non-uptodate folios too.
|
|
*
|
|
* NOTE: For someone looking to improve preserve performance,
|
|
* this is a good place to look.
|
|
*/
|
|
if (!folio_test_uptodate(folio)) {
|
|
folio_zero_range(folio, 0, folio_size(folio));
|
|
flush_dcache_folio(folio);
|
|
folio_mark_uptodate(folio);
|
|
}
|
|
|
|
folio_unlock(folio);
|
|
|
|
pfolio->pfn = folio_pfn(folio);
|
|
pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
|
|
pfolio->index = folio->index;
|
|
}
|
|
|
|
err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
|
|
if (err)
|
|
goto err_unpreserve;
|
|
|
|
kvfree(folios);
|
|
*nr_foliosp = nr_folios;
|
|
*out_folios_ser = folios_ser;
|
|
|
|
/*
|
|
* Note: folios_ser is purposely not freed here. It is preserved
|
|
* memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
|
|
* that is passed via private_data.
|
|
*/
|
|
return 0;
|
|
|
|
err_unpreserve:
|
|
for (i = i - 1; i >= 0; i--)
|
|
kho_unpreserve_folio(folios[i]);
|
|
vfree(folios_ser);
|
|
err_unpin:
|
|
unpin_folios(folios, nr_folios);
|
|
err_free_folios:
|
|
kvfree(folios);
|
|
|
|
return err;
|
|
}
|
|
|
|
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
|
|
struct memfd_luo_folio_ser *folios_ser,
|
|
u64 nr_folios)
|
|
{
|
|
long i;
|
|
|
|
if (!nr_folios)
|
|
return;
|
|
|
|
kho_unpreserve_vmalloc(kho_vmalloc);
|
|
|
|
for (i = 0; i < nr_folios; i++) {
|
|
const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
|
|
struct folio *folio;
|
|
|
|
if (!pfolio->pfn)
|
|
continue;
|
|
|
|
folio = pfn_folio(pfolio->pfn);
|
|
|
|
kho_unpreserve_folio(folio);
|
|
unpin_folio(folio);
|
|
}
|
|
|
|
vfree(folios_ser);
|
|
}
|
|
|
|
static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
|
|
{
|
|
struct inode *inode = file_inode(args->file);
|
|
struct memfd_luo_folio_ser *folios_ser;
|
|
struct memfd_luo_ser *ser;
|
|
u64 nr_folios;
|
|
int err = 0;
|
|
|
|
inode_lock(inode);
|
|
shmem_freeze(inode, true);
|
|
|
|
/* Allocate the main serialization structure in preserved memory */
|
|
ser = kho_alloc_preserve(sizeof(*ser));
|
|
if (IS_ERR(ser)) {
|
|
err = PTR_ERR(ser);
|
|
goto err_unlock;
|
|
}
|
|
|
|
ser->pos = args->file->f_pos;
|
|
ser->size = i_size_read(inode);
|
|
|
|
err = memfd_luo_preserve_folios(args->file, &ser->folios,
|
|
&folios_ser, &nr_folios);
|
|
if (err)
|
|
goto err_free_ser;
|
|
|
|
ser->nr_folios = nr_folios;
|
|
inode_unlock(inode);
|
|
|
|
args->private_data = folios_ser;
|
|
args->serialized_data = virt_to_phys(ser);
|
|
|
|
return 0;
|
|
|
|
err_free_ser:
|
|
kho_unpreserve_free(ser);
|
|
err_unlock:
|
|
shmem_freeze(inode, false);
|
|
inode_unlock(inode);
|
|
return err;
|
|
}
|
|
|
|
static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
|
|
{
|
|
struct memfd_luo_ser *ser;
|
|
|
|
if (WARN_ON_ONCE(!args->serialized_data))
|
|
return -EINVAL;
|
|
|
|
ser = phys_to_virt(args->serialized_data);
|
|
|
|
/*
|
|
* The pos might have changed since prepare. Everything else stays the
|
|
* same.
|
|
*/
|
|
ser->pos = args->file->f_pos;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
|
|
{
|
|
struct inode *inode = file_inode(args->file);
|
|
struct memfd_luo_ser *ser;
|
|
|
|
if (WARN_ON_ONCE(!args->serialized_data))
|
|
return;
|
|
|
|
inode_lock(inode);
|
|
shmem_freeze(inode, false);
|
|
|
|
ser = phys_to_virt(args->serialized_data);
|
|
|
|
memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
|
|
ser->nr_folios);
|
|
|
|
kho_unpreserve_free(ser);
|
|
inode_unlock(inode);
|
|
}
|
|
|
|
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
|
|
u64 nr_folios)
|
|
{
|
|
u64 i;
|
|
|
|
for (i = 0; i < nr_folios; i++) {
|
|
const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
|
|
struct folio *folio;
|
|
phys_addr_t phys;
|
|
|
|
if (!pfolio->pfn)
|
|
continue;
|
|
|
|
phys = PFN_PHYS(pfolio->pfn);
|
|
folio = kho_restore_folio(phys);
|
|
if (!folio) {
|
|
pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
|
|
phys);
|
|
continue;
|
|
}
|
|
|
|
folio_put(folio);
|
|
}
|
|
}
|
|
|
|
static void memfd_luo_finish(struct liveupdate_file_op_args *args)
|
|
{
|
|
struct memfd_luo_folio_ser *folios_ser;
|
|
struct memfd_luo_ser *ser;
|
|
|
|
/*
|
|
* If retrieve was successful, nothing to do. If it failed, retrieve()
|
|
* already cleaned up everything it could. So nothing to do there
|
|
* either. Only need to clean up when retrieve was not called.
|
|
*/
|
|
if (args->retrieve_status)
|
|
return;
|
|
|
|
ser = phys_to_virt(args->serialized_data);
|
|
if (!ser)
|
|
return;
|
|
|
|
if (ser->nr_folios) {
|
|
folios_ser = kho_restore_vmalloc(&ser->folios);
|
|
if (!folios_ser)
|
|
goto out;
|
|
|
|
memfd_luo_discard_folios(folios_ser, ser->nr_folios);
|
|
vfree(folios_ser);
|
|
}
|
|
|
|
out:
|
|
kho_restore_free(ser);
|
|
}
|
|
|
|
static int memfd_luo_retrieve_folios(struct file *file,
|
|
struct memfd_luo_folio_ser *folios_ser,
|
|
u64 nr_folios)
|
|
{
|
|
struct inode *inode = file_inode(file);
|
|
struct address_space *mapping = inode->i_mapping;
|
|
struct folio *folio;
|
|
int err = -EIO;
|
|
long i;
|
|
|
|
for (i = 0; i < nr_folios; i++) {
|
|
const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
|
|
phys_addr_t phys;
|
|
u64 index;
|
|
int flags;
|
|
|
|
if (!pfolio->pfn)
|
|
continue;
|
|
|
|
phys = PFN_PHYS(pfolio->pfn);
|
|
folio = kho_restore_folio(phys);
|
|
if (!folio) {
|
|
pr_err("Unable to restore folio at physical address: %llx\n",
|
|
phys);
|
|
goto put_folios;
|
|
}
|
|
index = pfolio->index;
|
|
flags = pfolio->flags;
|
|
|
|
/* Set up the folio for insertion. */
|
|
__folio_set_locked(folio);
|
|
__folio_set_swapbacked(folio);
|
|
|
|
err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
|
|
if (err) {
|
|
pr_err("shmem: failed to charge folio index %ld: %d\n",
|
|
i, err);
|
|
goto unlock_folio;
|
|
}
|
|
|
|
err = shmem_add_to_page_cache(folio, mapping, index, NULL,
|
|
mapping_gfp_mask(mapping));
|
|
if (err) {
|
|
pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
|
|
i, err);
|
|
goto unlock_folio;
|
|
}
|
|
|
|
if (flags & MEMFD_LUO_FOLIO_UPTODATE)
|
|
folio_mark_uptodate(folio);
|
|
if (flags & MEMFD_LUO_FOLIO_DIRTY)
|
|
folio_mark_dirty(folio);
|
|
|
|
err = shmem_inode_acct_blocks(inode, 1);
|
|
if (err) {
|
|
pr_err("shmem: failed to account folio index %ld: %d\n",
|
|
i, err);
|
|
goto unlock_folio;
|
|
}
|
|
|
|
shmem_recalc_inode(inode, 1, 0);
|
|
folio_add_lru(folio);
|
|
folio_unlock(folio);
|
|
folio_put(folio);
|
|
}
|
|
|
|
return 0;
|
|
|
|
unlock_folio:
|
|
folio_unlock(folio);
|
|
folio_put(folio);
|
|
put_folios:
|
|
/*
|
|
* Note: don't free the folios already added to the file. They will be
|
|
* freed when the file is freed. Free the ones not added yet here.
|
|
*/
|
|
for (long j = i + 1; j < nr_folios; j++) {
|
|
const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
|
|
|
|
folio = kho_restore_folio(pfolio->pfn);
|
|
if (folio)
|
|
folio_put(folio);
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
|
|
{
|
|
struct memfd_luo_folio_ser *folios_ser;
|
|
struct memfd_luo_ser *ser;
|
|
struct file *file;
|
|
int err;
|
|
|
|
ser = phys_to_virt(args->serialized_data);
|
|
if (!ser)
|
|
return -EINVAL;
|
|
|
|
file = memfd_alloc_file("", 0);
|
|
if (IS_ERR(file)) {
|
|
pr_err("failed to setup file: %pe\n", file);
|
|
err = PTR_ERR(file);
|
|
goto free_ser;
|
|
}
|
|
|
|
vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
|
|
file->f_inode->i_size = ser->size;
|
|
|
|
if (ser->nr_folios) {
|
|
folios_ser = kho_restore_vmalloc(&ser->folios);
|
|
if (!folios_ser) {
|
|
err = -EINVAL;
|
|
goto put_file;
|
|
}
|
|
|
|
err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
|
|
vfree(folios_ser);
|
|
if (err)
|
|
goto put_file;
|
|
}
|
|
|
|
args->file = file;
|
|
kho_restore_free(ser);
|
|
|
|
return 0;
|
|
|
|
put_file:
|
|
fput(file);
|
|
free_ser:
|
|
kho_restore_free(ser);
|
|
return err;
|
|
}
|
|
|
|
static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
|
|
struct file *file)
|
|
{
|
|
struct inode *inode = file_inode(file);
|
|
|
|
return shmem_file(file) && !inode->i_nlink;
|
|
}
|
|
|
|
static const struct liveupdate_file_ops memfd_luo_file_ops = {
|
|
.freeze = memfd_luo_freeze,
|
|
.finish = memfd_luo_finish,
|
|
.retrieve = memfd_luo_retrieve,
|
|
.preserve = memfd_luo_preserve,
|
|
.unpreserve = memfd_luo_unpreserve,
|
|
.can_preserve = memfd_luo_can_preserve,
|
|
.owner = THIS_MODULE,
|
|
};
|
|
|
|
static struct liveupdate_file_handler memfd_luo_handler = {
|
|
.ops = &memfd_luo_file_ops,
|
|
.compatible = MEMFD_LUO_FH_COMPATIBLE,
|
|
};
|
|
|
|
/*
 * Register the memfd file handler with LUO at late initcall time.
 *
 * -EOPNOTSUPP from the registration is not treated as a failure; any other
 * error is logged and returned.
 */
static int __init memfd_luo_init(void)
{
	int ret;

	ret = liveupdate_register_file_handler(&memfd_luo_handler);
	if (!ret || ret == -EOPNOTSUPP)
		return 0;

	pr_err("Could not register luo filesystem handler: %pe\n",
	       ERR_PTR(ret));

	return ret;
}
late_initcall(memfd_luo_init);
|