diff --git a/include/linux/swap.h b/include/linux/swap.h index bf72b548a96d..74df3004c850 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -458,7 +458,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern int swap_duplicate_nr(swp_entry_t entry, int nr); -extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -517,11 +516,6 @@ static inline int swap_duplicate_nr(swp_entry_t swp, int nr_pages) return 0; } -static inline int swapcache_prepare(swp_entry_t swp, int nr) -{ - return 0; -} - static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } diff --git a/mm/swap.h b/mm/swap.h index 2f79458b37f3..e427240073e9 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -234,6 +234,14 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, return folio_entry.val == round_down(entry.val, nr_pages); } +/* Temporary internal helpers */ +void __swapcache_set_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry); +void __swapcache_clear_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr); + /* * All swap cache helpers below require the caller to ensure the swap entries * used are valid and stabilize the device by any of the following ways: @@ -247,7 +255,8 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, */ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); +int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadow, bool alloc); void swap_cache_del_folio(struct folio *folio); struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, @@ -413,8 +422,10 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) +static inline int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadow, bool alloc) { + return -ENOENT; } static inline void swap_cache_del_folio(struct folio *folio) diff --git a/mm/swap_state.c b/mm/swap_state.c index d58bce532d95..22990c5259cc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -127,34 +127,64 @@ void *swap_cache_get_shadow(swp_entry_t entry) * @entry: The swap entry corresponding to the folio. * @gfp: gfp_mask for XArray node allocation. * @shadowp: If a shadow is found, return the shadow. + * @alloc: If it's the allocator that is trying to insert a folio. Allocator + * sets SWAP_HAS_CACHE to pin slots before insert so skip map update. * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. - * The caller also needs to update the corresponding swap_map slots with - * SWAP_HAS_CACHE bit to avoid race or conflict. */ -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) +int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadowp, bool alloc) { + int err; void *shadow = NULL; + struct swap_info_struct *si; unsigned long old_tb, new_tb; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_start, ci_off, ci_end, offset; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + si = __swap_entry_to_info(entry); new_tb = folio_to_swp_tb(folio); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + offset = swp_offset(entry); + ci = swap_cluster_lock(si, swp_offset(entry)); + if (unlikely(!ci->table)) { + err = -ENOENT; + goto failed; + } do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); - WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + old_tb = __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb))) { + err = -EEXIST; + goto failed; + } + if (!alloc && unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + err = -ENOENT; + goto failed; + } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); + offset++; + } while (++ci_off < ci_end); + + ci_off = ci_start; + offset = swp_offset(entry); + do { + /* + * Still need to pin the slots with SWAP_HAS_CACHE since + * swap allocator depends on that. + */ + if (!alloc) + __swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset)); + __swap_table_set(ci, ci_off, new_tb); + offset++; } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -167,6 +197,11 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp if (shadowp) *shadowp = shadow; + return 0; + +failed: + swap_cluster_unlock(ci); + return err; } /** @@ -185,6 +220,7 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + struct swap_info_struct *si; unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); @@ -194,6 +230,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + si = __swap_entry_to_info(entry); new_tb = shadow_swp_to_tb(shadow); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; @@ -209,6 +246,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, folio_clear_swapcache(folio); node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); + __swapcache_clear_cached(si, ci, entry, nr_pages); } /** @@ -230,7 +268,6 @@ void swap_cache_del_folio(struct folio *folio) __swap_cache_del_folio(ci, folio, entry, NULL); swap_cluster_unlock(ci); - put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ -422,67 +459,37 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, gfp_t gfp, bool charged, bool skip_if_exists) { - struct folio *swapcache; + struct folio *swapcache = NULL; void *shadow; int ret; - /* - * Check and pin the swap map with SWAP_HAS_CACHE, then add the folio - * into the swap cache. Loop with a schedule delay if raced with - * another process setting SWAP_HAS_CACHE. This hackish loop will - * be fixed very soon. - */ + __folio_set_locked(folio); + __folio_set_swapbacked(folio); for (;;) { - ret = swapcache_prepare(entry, folio_nr_pages(folio)); + ret = swap_cache_add_folio(folio, entry, &shadow, false); if (!ret) break; /* - * The skip_if_exists is for protecting against a recursive - * call to this helper on the same entry waiting forever - * here because SWAP_HAS_CACHE is set but the folio is not - * in the swap cache yet. This can happen today if - * mem_cgroup_swapin_charge_folio() below triggers reclaim - * through zswap, which may call this helper again in the - * writeback path. - * - * Large order allocation also needs special handling on + * Large order allocation needs special handling on * race: if a smaller folio exists in cache, swapin needs * to fallback to order 0, and doing a swap cache lookup * might return a folio that is irrelevant to the faulting * entry because @entry is aligned down. Just return NULL. */ if (ret != -EEXIST || skip_if_exists || folio_test_large(folio)) - return NULL; + goto failed; - /* - * Check the swap cache again, we can only arrive - * here because swapcache_prepare returns -EEXIST. - */ swapcache = swap_cache_get_folio(entry); if (swapcache) - return swapcache; - - /* - * We might race against __swap_cache_del_folio(), and - * stumble across a swap_map entry whose SWAP_HAS_CACHE - * has not yet been cleared. Or race against another - * swap_cache_alloc_folio(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its folio to swap cache. - */ - schedule_timeout_uninterruptible(1); + goto failed; } - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { - put_swap_folio(folio, entry); - folio_unlock(folio); - return NULL; + swap_cache_del_folio(folio); + goto failed; } - swap_cache_add_folio(folio, entry, &shadow); memcg1_swapin(entry, folio_nr_pages(folio)); if (shadow) workingset_refault(folio, shadow); @@ -490,6 +497,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, /* Caller will initiate read into locked folio */ folio_add_lru(folio); return folio; + +failed: + folio_unlock(folio); + return swapcache; } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index ced53aba3f4c..64970ee11fcf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1476,7 +1476,11 @@ again: if (!entry.val) return -ENOMEM; - swap_cache_add_folio(folio, entry, NULL); + /* + * Allocator has pinned the slots with SWAP_HAS_CACHE + * so it should never fail + */ + WARN_ON_ONCE(swap_cache_add_folio(folio, entry, NULL, true)); return 0; @@ -1582,9 +1586,8 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, * do_swap_page() * ... swapoff+swapon * swap_cache_alloc_folio() - * swapcache_prepare() - * __swap_duplicate() - * // check swap_map + * swap_cache_add_folio() + * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before @@ -3769,17 +3772,25 @@ int swap_duplicate_nr(swp_entry_t entry, int nr) return err; } -/* - * @entry: first swap entry from which we allocate nr swap cache. - * - * Called when allocating swap cache for existing swap entries, - * This can return error codes. Returns 0 at success. - * -EEXIST means there is a swap cache. - * Note: return code is different from swap_duplicate(). - */ -int swapcache_prepare(swp_entry_t entry, int nr) +/* Mark the swap map as HAS_CACHE, caller need to hold the cluster lock */ +void __swapcache_set_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry) { - return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); + WARN_ON(swap_dup_entries(si, ci, swp_offset(entry), SWAP_HAS_CACHE, 1)); +} + +/* Clear the swap map as !HAS_CACHE, caller need to hold the cluster lock */ +void __swapcache_clear_cached(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr) +{ + if (swap_only_has_cache(si, swp_offset(entry), nr)) { + swap_entries_free(si, ci, entry, nr); + } else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 1d281174164e..973ffb9813ea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -757,10 +757,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); + __swap_cache_del_folio(ci, folio, swap, shadow); swap_cluster_unlock_irq(ci); - put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *);