mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-23 16:06:50 +08:00
If hmm_range_fault() fails a folio_trylock() in do_swap_page() while
trying to acquire the lock of a device-private folio for migration
to RAM, the function will spin until it succeeds in grabbing the lock.
However, if the process holding the lock is depending on a work
item to be completed, which is scheduled on the same CPU as the
spinning hmm_range_fault(), that work item might be starved and
we end up in a livelock / starvation situation which is never
resolved.
This can happen, for example, if the process holding the
device-private folio lock is stuck in
migrate_device_unmap()->lru_add_drain_all(),
since lru_add_drain_all() requires a short work item
to be run on all online CPUs to complete.
The prerequisites for this to happen are:
a) Both zone device and system memory folios are considered in
migrate_device_unmap(), so that there is a reason to call
lru_add_drain_all() for a system memory folio while a
folio lock is held on a zone device folio.
b) The zone device folio has an initial mapcount > 1 which causes
at least one migration PTE entry insertion to be deferred to
try_to_migrate(), which can happen after the call to
lru_add_drain_all().
c) No preemption, or voluntary-only preemption.
This all seems pretty unlikely to happen, but indeed is hit by
the "xe_exec_system_allocator" igt test.
Resolve this by waiting for the folio to be unlocked if the
folio_trylock() fails in do_swap_page().
Rename migration_entry_wait_on_locked() to
softleaf_entry_wait_on_locked() and update its documentation to
indicate the new use-case.
Future code improvements might consider moving
the lru_add_drain_all() call in migrate_device_unmap() to be
called *after* all pages have migration entries inserted.
That would also eliminate b) above.
v2:
- Instead of a cond_resched() in hmm_range_fault(),
eliminate the problem by waiting for the folio to be unlocked
in do_swap_page() (Alistair Popple, Andrew Morton)
v3:
- Add a stub migration_entry_wait_on_locked() for the
!CONFIG_MIGRATION case. (Kernel Test Robot)
v4:
- Rename migration_entry_wait_on_locked() to
softleaf_entry_wait_on_locked() and update docs (Alistair Popple)
v5:
- Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION
version of softleaf_entry_wait_on_locked().
- Modify wording around function names in the commit message
(Andrew Morton)
Suggested-by: Alistair Popple <apopple@nvidia.com>
Fixes: 1afaeb8293 ("mm/migrate: Trylock device page in do_swap_page")
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: linux-mm@kvack.org
Cc: <dri-devel@lists.freedesktop.org>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: <stable@vger.kernel.org> # v6.15+
Reviewed-by: John Hubbard <jhubbard@nvidia.com> #v3
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com
(cherry picked from commit a69d1ab971a624c6f112cea61536569d579c3215)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
207 lines
7.0 KiB
C
207 lines
7.0 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_MIGRATE_H
|
|
#define _LINUX_MIGRATE_H
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/migrate_mode.h>
|
|
#include <linux/hugetlb.h>
|
|
|
|
/*
 * Callback types accepted by migrate_pages(): a new_folio_t receives a folio
 * and an opaque @private value and returns a folio pointer; a free_folio_t
 * receives a folio and an opaque @private value and returns nothing.
 */
typedef struct folio *new_folio_t(struct folio *folio, unsigned long private);
typedef void free_folio_t(struct folio *folio, unsigned long private);

/* Only the name is needed here; the layout is declared elsewhere. */
struct migration_target_control;
|
|
|
|
/**
|
|
* struct movable_operations - Driver page migration
|
|
* @isolate_page:
|
|
* The VM calls this function to prepare the page to be moved. The page
|
|
* is locked and the driver should not unlock it. The driver should
|
|
* return ``true`` if the page is movable and ``false`` if it is not
|
|
* currently movable. After this function returns, the VM uses the
|
|
* page->lru field, so the driver must preserve any information which
|
|
* is usually stored here.
|
|
*
|
|
* @migrate_page:
|
|
* After isolation, the VM calls this function with the isolated
|
|
* @src page. The driver should copy the contents of the
|
|
* @src page to the @dst page and set up the fields of @dst page.
|
|
* Both pages are locked.
|
|
* If page migration is successful, the driver should return 0.
|
|
* If the driver cannot migrate the page at the moment, it can return
|
|
* -EAGAIN. The VM interprets this as a temporary migration failure and
|
|
* will retry it later. Any other error value is a permanent migration
|
|
* failure and migration will not be retried.
|
|
* The driver shouldn't touch the @src->lru field while in the
|
|
* migrate_page() function. It may write to @dst->lru.
|
|
*
|
|
* @putback_page:
|
|
* If migration fails on the isolated page, the VM informs the driver
|
|
* that the page is no longer a candidate for migration by calling
|
|
* this function. The driver should put the isolated page back into
|
|
* its own data structure.
|
|
*/
|
|
struct movable_operations {
|
|
bool (*isolate_page)(struct page *, isolate_mode_t);
|
|
int (*migrate_page)(struct page *dst, struct page *src,
|
|
enum migrate_mode);
|
|
void (*putback_page)(struct page *);
|
|
};
|
|
|
|
/* Defined in mm/debug.c: */
|
|
extern const char *migrate_reason_names[MR_TYPES];
|
|
|
|
#ifdef CONFIG_MIGRATION
|
|
|
|
void putback_movable_pages(struct list_head *l);
|
|
int migrate_folio(struct address_space *mapping, struct folio *dst,
|
|
struct folio *src, enum migrate_mode mode);
|
|
int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free,
|
|
unsigned long private, enum migrate_mode mode, int reason,
|
|
unsigned int *ret_succeeded);
|
|
struct folio *alloc_migration_target(struct folio *src, unsigned long private);
|
|
bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode);
|
|
bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
|
|
|
|
int migrate_huge_page_move_mapping(struct address_space *mapping,
|
|
struct folio *dst, struct folio *src);
|
|
void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
|
|
__releases(ptl);
|
|
void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
|
|
int folio_migrate_mapping(struct address_space *mapping,
|
|
struct folio *newfolio, struct folio *folio, int extra_count);
|
|
int set_movable_ops(const struct movable_operations *ops, enum pagetype type);
|
|
|
|
#else
|
|
|
|
/* !CONFIG_MIGRATION stub: intentionally does nothing with @l. */
static inline void putback_movable_pages(struct list_head *l)
{
}
|
|
static inline int migrate_pages(struct list_head *l, new_folio_t new,
|
|
free_folio_t free, unsigned long private,
|
|
enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
|
|
{ return -ENOSYS; }
|
|
/* !CONFIG_MIGRATION stub: never provides a target folio, returns NULL. */
static inline struct folio *alloc_migration_target(struct folio *src,
		unsigned long private)
{
	return NULL;
}
|
|
/* !CONFIG_MIGRATION stub: reports that @page was not isolated. */
static inline bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
{
	return false;
}
|
|
/* !CONFIG_MIGRATION stub: reports failure and does not touch @list. */
static inline bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
{
	return false;
}
|
|
|
|
/* !CONFIG_MIGRATION stub: always fails with -ENOSYS. */
static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
		struct folio *dst, struct folio *src)
{
	return -ENOSYS;
}
|
|
/* !CONFIG_MIGRATION stub: @ops is not recorded; always fails with -ENOSYS. */
static inline int set_movable_ops(const struct movable_operations *ops, enum pagetype type)
{
	return -ENOSYS;
}
|
|
|
|
/*
 * !CONFIG_MIGRATION stub.  Emits a one-time warning when reached, then
 * drops @ptl so the __releases(ptl) contract still holds for the caller.
 */
static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	WARN_ON_ONCE(1);

	spin_unlock(ptl);
}
|
|
|
|
#endif /* CONFIG_MIGRATION */
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
/* CONFIG_NUMA_BALANCING interfaces; inline stubs cover the disabled case. */
int migrate_misplaced_folio_prepare(struct folio *folio,
		struct vm_area_struct *vma, int node);
int migrate_misplaced_folio(struct folio *folio, int node);
|
|
#else
|
|
/* !CONFIG_NUMA_BALANCING stub: always returns -EAGAIN. */
static inline int migrate_misplaced_folio_prepare(struct folio *folio,
		struct vm_area_struct *vma, int node)
{
	return -EAGAIN; /* can't migrate now */
}
|
|
/* !CONFIG_NUMA_BALANCING stub: always returns -EAGAIN. */
static inline int migrate_misplaced_folio(struct folio *folio, int node)
{
	return -EAGAIN; /* can't migrate now */
}
|
|
#endif /* CONFIG_NUMA_BALANCING */
|
|
|
|
#ifdef CONFIG_MIGRATION
|
|
|
|
/*
 * Flag bits kept in the low MIGRATE_PFN_SHIFT bits of a migrate pfn entry;
 * the pfn itself is stored shifted left by MIGRATE_PFN_SHIFT.
 *
 * Watch out for PAE architectures, where an unsigned long might not have
 * enough bits to store every physical address plus the flags.  So far we
 * have enough room for all our flags.
 */
#define MIGRATE_PFN_VALID	(1UL << 0)
#define MIGRATE_PFN_MIGRATE	(1UL << 1)
#define MIGRATE_PFN_WRITE	(1UL << 3)
#define MIGRATE_PFN_COMPOUND	(1UL << 4)
#define MIGRATE_PFN_SHIFT	6
|
|
|
|
static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
|
|
{
|
|
if (!(mpfn & MIGRATE_PFN_VALID))
|
|
return NULL;
|
|
return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT);
|
|
}
|
|
|
|
static inline unsigned long migrate_pfn(unsigned long pfn)
|
|
{
|
|
return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
|
|
}
|
|
|
|
/*
 * Selection bits; each enumerator is a distinct single bit, so values can
 * be OR-ed together.  MIGRATE_VMA_SELECT_DEVICE_PRIVATE is the value the
 * struct migrate_vma documentation asks for in its flags field.
 */
enum migrate_vma_direction {
	MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
	MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
	MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
	MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
};
|
|
|
|
/*
 * Argument block passed to migrate_vma_setup(), migrate_vma_pages() and
 * migrate_vma_finalize().
 */
struct migrate_vma {
	struct vm_area_struct	*vma;
	/*
	 * Both the src and dst arrays must be big enough for
	 * (end - start) >> PAGE_SHIFT entries.
	 *
	 * The caller must not modify the src array after
	 * migrate_vma_setup(), and must not modify the dst array after
	 * migrate_vma_pages() returns.
	 */
	unsigned long		*dst;
	unsigned long		*src;
	unsigned long		cpages;
	unsigned long		npages;
	unsigned long		start;
	unsigned long		end;

	/*
	 * Set to the owner value also stored in page_pgmap(page)->owner
	 * for migrating out of device private memory.  The flags also need to
	 * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE.
	 * The caller should always set this field when using mmu notifier
	 * callbacks to avoid device MMU invalidations for device private
	 * pages that are not being migrated.
	 */
	void			*pgmap_owner;
	unsigned long		flags;

	/*
	 * Set to vmf->page if this is being called to migrate a page as part of
	 * a migrate_to_ram() callback.
	 */
	struct page		*fault_page;
};
|
|
|
|
/* Interfaces operating on a struct migrate_vma argument block. */
int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);

/* Interfaces operating directly on caller-supplied pfn arrays. */
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
		unsigned long npages);
int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages);
void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
		unsigned long npages);
void migrate_device_finalize(unsigned long *src_pfns,
		unsigned long *dst_pfns, unsigned long npages);
|
|
|
|
#endif /* CONFIG_MIGRATION */
|
|
|
|
#endif /* _LINUX_MIGRATE_H */
|