diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index 1f883abf3f00fa7311b0d16081039c74dfcf2919..0440ae709c7df6a98750581bbc90b7b1e92516c6 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -38,7 +38,6 @@ the Linux memory management. pagemap shrinker_debugfs soft-dirty - swap_numa transhuge userfaultfd zswap diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst deleted file mode 100644 index 2e630627bceec00ddab4be1955f39963ba28cbcb..0000000000000000000000000000000000000000 --- a/Documentation/admin-guide/mm/swap_numa.rst +++ /dev/null @@ -1,78 +0,0 @@ -=========================================== -Automatically bind swap device to numa node -=========================================== - -If the system has more than one swap device and swap device has the node -information, we can make use of this information to decide which swap -device to use in get_swap_pages() to get better performance. - - -How to use this feature -======================= - -Swap device has priority and that decides the order of it to be used. To make -use of automatically binding, there is no need to manipulate priority settings -for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and -swapB, with swapA attached to node 0 and swapB attached to node 1, are going -to be swapped on. Simply swapping them on by doing:: - - # swapon /dev/swapA - # swapon /dev/swapB - -Then node 0 will use the two swap devices in the order of swapA then swapB and -node 1 will use the two swap devices in the order of swapB then swapA. Note -that the order of them being swapped on doesn't matter. - -A more complex example on a 4 node machine. Assume 6 swap devices are going to -be swapped on: swapA and swapB are attached to node 0, swapC is attached to -node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. -The way to swap them on is the same as above:: - - # swapon /dev/swapA - # swapon /dev/swapB - # swapon /dev/swapC - # swapon /dev/swapD - # swapon /dev/swapE - # swapon /dev/swapF - -Then node 0 will use them in the order of:: - - swapA/swapB -> swapC -> swapD -> swapE -> swapF - -swapA and swapB will be used in a round robin mode before any other swap device. - -node 1 will use them in the order of:: - - swapC -> swapA -> swapB -> swapD -> swapE -> swapF - -node 2 will use them in the order of:: - - swapD/swapE -> swapA -> swapB -> swapC -> swapF - -Similaly, swapD and swapE will be used in a round robin mode before any -other swap devices. - -node 3 will use them in the order of:: - - swapF -> swapA -> swapB -> swapC -> swapD -> swapE - - -Implementation details -====================== - -The current code uses a priority based list, swap_avail_list, to decide -which swap device to use and if multiple swap devices share the same -priority, they are used round robin. This change here replaces the single -global swap_avail_list with a per-numa-node list, i.e. for each numa node, -it sees its own priority based list of available swap devices. Swap -device's priority can be promoted on its matching node's swap_avail_list. - -The current swap device's priority is set as: user can set a >=0 value, -or the system will pick one starting from -1 then downwards. The priority -value in the swap_avail_list is the negated value of the swap device's -due to plist being sorted from low to high. 
The new policy doesn't change -the semantics for priority >=0 cases, the previous starting from -1 then -downwards now becomes starting from -2 then downwards and -1 is reserved -as the promoted value. So if multiple swap devices are attached to the same -node, they will all be promoted to priority -1 on that node's plist and will -be used round robin before any other swap devices. diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9ccb803eacd47e4eb485c0459fdea71b213ce6b2..faab8a4d9f275046dac6614e7348661bef44ebf6 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -734,7 +734,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) dec_mm_counter(mm, mm_counter(folio)); } - free_swap_and_cache(entry); + swap_put_entries_direct(entry, 1); } void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/swap.h b/include/linux/swap.h index d3ea40efa8ffa6ad27c04404f47fdbd9f1596a63..7a19194edb589b793ab22725f53b3f44b2bc308a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -224,13 +224,11 @@ enum { #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ -#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ @@ -304,21 +302,11 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ + struct plist_node avail_list; /* entry in swap_avail_head */ KABI_RESERVE(1); KABI_RESERVE(2); KABI_RESERVE(3); - - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. 
- */ }; static inline swp_entry_t page_swap_entry(struct page *page) @@ -484,27 +472,42 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); -bool folio_free_swap(struct folio *folio); -void put_swap_folio(struct folio *folio, swp_entry_t entry); -extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t, int); -extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t entry, int nr); -extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, + unsigned long offset); extern int swp_swapcount(swp_entry_t entry); struct backing_dev_info; extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); +/* + * If there is an existing swap slot reference (swap entry) and the caller + * guarantees that there is no race modification of it (e.g., PTL + * protecting the swap entry in page table; shmem's cmpxchg protects t + * he swap entry in shmem mapping), these two helpers below can be used + * to put/dup the entries directly. + * + * All entries must be allocated by folio_alloc_swap(). And they must have + * a swap count > 1. See comments of folio_*_swap helpers for more info. + */ +int swap_dup_entry_direct(swp_entry_t entry); +void swap_put_entries_direct(swp_entry_t entry, int nr); + +/* + * folio_free_swap tries to free the swap entries pinned by a swap cache + * folio, it has to be here to be called by other components. 
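+ *
+ * A rough usage sketch (illustration only, not lifted verbatim from any
+ * caller): a swapin path that holds the folio lock and no longer needs
+ * the on-disk copy can do
+ *
+ *	if (folio_test_swapcache(folio))
+ *		folio_free_swap(folio);
+ *
+ * while callers that only hold a bare swap entry under the PTL use
+ * swap_dup_entry_direct() / swap_put_entries_direct() above instead.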
+ */ +bool folio_free_swap(struct folio *folio); + +/* Allocate / free (hibernation) exclusive entries */ +swp_entry_t swap_alloc_hibernation_slot(int type); +void swap_free_hibernation_slot(swp_entry_t entry); + static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); @@ -534,10 +537,6 @@ static inline void put_swap_device(struct swap_info_struct *si) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) -{ -} - static inline void free_swap_cache(struct folio *folio) { } @@ -547,25 +546,12 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp, int nr) -{ -} - -static inline int swap_duplicate(swp_entry_t swp) +static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; } -static inline int swapcache_prepare(swp_entry_t swp, int nr) -{ - return 0; -} - -static inline void swap_free_nr(swp_entry_t entry, int nr_pages) -{ -} - -static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) +static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } @@ -574,7 +560,8 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, + unsigned long offset) { return false; } @@ -584,11 +571,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) -{ - return -EINVAL; -} - static inline bool folio_free_swap(struct folio *folio) { return false; @@ -601,17 +583,6 @@ static inline int add_swap_extent(struct swap_info_struct *sis, return -EINVAL; } #endif /* CONFIG_SWAP */ - -static inline void free_swap_and_cache(swp_entry_t entry) -{ - free_swap_and_cache_nr(entry, 1); -} - -static inline void swap_free(swp_entry_t entry) -{ - swap_free_nr(entry, 1); -} - #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b1896346fde159168da56bab18af5409ea6f0ee5..e532da418e5ffe3c1042c7ed3e4f6f6f8fbdba11 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -179,10 +179,10 @@ sector_t alloc_swapdev_block(int swap) { unsigned long offset; - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(swap_alloc_hibernation_slot(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_hibernation_slot(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -197,6 +197,7 @@ sector_t alloc_swapdev_block(int swap) void free_all_swap_pages(int swap) { + unsigned long offset; struct rb_node *node; while ((node = swsusp_extents.rb_node)) { @@ -204,8 +205,9 @@ void free_all_swap_pages(int swap) ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - swap_free_nr(swp_entry(swap, ext->start), - ext->end - ext->start + 1); + + for (offset = ext->start; offset < ext->end; offset++) + swap_free_hibernation_slot(swp_entry(swap, offset)); kfree(ext); } diff --git a/mm/madvise.c b/mm/madvise.c index 1fb7aa3a1d9a26a94e2f603b5013d308ef41d424..5420921c8816447d589bbd536a165b0284ba5795 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -661,7 +661,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, max_nr = (end - addr) / 
PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { diff --git a/mm/memory.c b/mm/memory.c index a7d12751a862b90950a3b69657444b50b818577d..e7b67d1cc480edbfab1b8563249bfa2679283616 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -790,7 +790,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { - if (swap_duplicate(entry) < 0) + if (swap_dup_entry_direct(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ @@ -1635,7 +1635,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (!should_zap_cows(details)) continue; rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) @@ -3849,12 +3849,22 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } -static inline bool should_try_to_free_swap(struct folio *folio, +static inline bool should_try_to_free_swap(struct swap_info_struct *si, + struct folio *folio, struct vm_area_struct *vma, + unsigned int extra_refs, unsigned int fault_flags) { if (!folio_test_swapcache(folio)) return false; + /* + * Try to free swap cache for SWP_SYNCHRONOUS_IO devices. + * Redundant IO is unlikely to be an issue for them, but a + * slot being pinned by swap cache may cause more fragmentation + * and delayed freeing of swap metadata. + */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) + return true; if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; @@ -3865,7 +3875,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, * reference only in case it's likely that we'll be the exlusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && - folio_ref_count(folio) == (1 + folio_nr_pages(folio)); + folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) @@ -4097,6 +4107,16 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +/* Sanity check that a folio is fully exclusive */ +static void check_swap_exclusive(struct folio *folio, swp_entry_t entry, + unsigned int nr_pages) +{ + do { + VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio); + entry.val++; + } while (--nr_pages); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. 
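do_swap_page() below and shmem_swap_alloc_folio() later in this patch follow the same swapin_folio() calling pattern for SWP_SYNCHRONOUS_IO devices. Condensed into a sketch (illustrative only; error handling and the readahead fallback are omitted):

	if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
		folio = alloc_swap_folio(vmf);		/* charged, may be large */
		if (folio) {
			swapcache = swapin_folio(entry, folio);
			if (swapcache != folio)		/* lost the race: NULL, or */
				folio_put(folio);	/* the folio already cached */
			folio = swapcache;
		}
	}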
@@ -4108,16 +4128,14 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct folio *swapcache, *folio = NULL; + struct folio *swapcache = NULL, *folio; struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; - bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; vm_fault_t ret = 0; - void *shadow = NULL; int nr_pages; unsigned long page_idx; unsigned long address; @@ -4180,55 +4198,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio = swap_cache_get_folio(entry); if (folio) swap_update_readahead(folio, vma, vmf->address); - swapcache = folio; - if (!folio) { - if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && - __swap_count(entry) == 1) { - /* skip swapcache */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { folio = alloc_swap_folio(vmf); if (folio) { - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - nr_pages = folio_nr_pages(folio); - if (folio_test_large(folio)) - entry.val = ALIGN_DOWN(entry.val, nr_pages); /* - * Prevent parallel swapin from proceeding with - * the cache flag. Otherwise, another thread - * may finish swapin first, free the entry, and - * swapout reusing the same entry. It's - * undetectable as pte_same() returns true due - * to entry reuse. + * folio is charged, so swapin can only fail due + * to raced swapin and return NULL. */ - if (swapcache_prepare(entry, nr_pages)) { - /* - * Relax a bit to prevent rapid - * repeated page faults. - */ - schedule_timeout_uninterruptible(1); - goto out_page; - } - need_clear_cache = true; - - mem_cgroup_swapin_uncharge_swap(entry, nr_pages); - - shadow = swap_cache_get_shadow(entry); - if (shadow) - workingset_refault(folio, shadow); - - folio_add_lru(folio); - - /* To provide entry to swap_read_folio() */ - folio->swap = entry; - swap_read_folio(folio, NULL); - folio->private = NULL; + swapcache = swapin_folio(entry, folio); + if (swapcache != folio) + folio_put(folio); + folio = swapcache; } } else { - folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, - vmf); - swapcache = folio; + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); } if (!folio) { @@ -4250,60 +4234,58 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } + swapcache = folio; ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; page = folio_file_page(folio, swp_offset(entry)); - if (swapcache) { - /* - * Make sure folio_free_swap() or swapoff did not release the - * swapcache from under us. The page pin, and pte_same test - * below, are not enough to exclude that. Even if it is still - * swapcache, we need to check that the page's swap has not - * changed. - */ - if (unlikely(!folio_matches_swap_entry(folio, entry))) - goto out_page; - - if (unlikely(PageHWPoison(page))) { - /* - * hwpoisoned dirty swapcache pages are kept for killing - * owner processes (which may be unknown at hwpoison time) - */ - ret = VM_FAULT_HWPOISON; - goto out_page; - } - - /* - * KSM sometimes has to copy on read faults, for example, if - * page->index of !PageKSM() pages would be nonlinear inside the - * anon VMA -- PageKSM() is lost on actual swapout. 
- */ - folio = ksm_might_need_to_copy(folio, vma, vmf->address); - if (unlikely(!folio)) { - ret = VM_FAULT_OOM; - folio = swapcache; - goto out_page; - } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { - ret = VM_FAULT_HWPOISON; - folio = swapcache; - goto out_page; - } - if (folio != swapcache) - page = folio_page(folio, 0); + /* + * Make sure folio_free_swap() or swapoff did not release the + * swapcache from under us. The page pin, and pte_same test + * below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not + * changed. + */ + if (unlikely(!folio_matches_swap_entry(folio, entry))) + goto out_page; + if (unlikely(PageHWPoison(page))) { /* - * If we want to map a page that's in the swapcache writable, we - * have to detect via the refcount if we're really the exclusive - * owner. Try removing the extra reference from the local LRU - * caches if required. + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) */ - if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && - !folio_test_ksm(folio) && !folio_test_lru(folio)) - lru_add_drain(); + ret = VM_FAULT_HWPOISON; + goto out_page; } + /* + * KSM sometimes has to copy on read faults, for example, if + * page->index of !PageKSM() pages would be nonlinear inside the + * anon VMA -- PageKSM() is lost on actual swapout. + */ + folio = ksm_might_need_to_copy(folio, vma, vmf->address); + if (unlikely(!folio)) { + ret = VM_FAULT_OOM; + folio = swapcache; + goto out_page; + } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { + ret = VM_FAULT_HWPOISON; + folio = swapcache; + goto out_page; + } else if (folio != swapcache) + page = folio_page(folio, 0); + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * caches if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !folio_test_ksm(folio) && !folio_test_lru(folio)) + lru_add_drain(); + folio_throttle_swaprate(folio, GFP_KERNEL); /* @@ -4319,24 +4301,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } - /* allocated large folios for SWP_SYNCHRONOUS_IO */ - if (folio_test_large(folio) && !folio_test_swapcache(folio)) { - unsigned long nr = folio_nr_pages(folio); - unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); - unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; - pte_t *folio_ptep = vmf->pte - idx; - pte_t folio_pte = ptep_get(folio_ptep); - - if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || - swap_pte_batch(folio_ptep, nr, folio_pte) != nr) - goto out_nomap; - - page_idx = idx; - address = folio_start; - ptep = folio_ptep; - goto check_folio; - } - nr_pages = 1; page_idx = 0; address = vmf->address; @@ -4380,12 +4344,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); + /* + * If a large folio already belongs to anon mapping, then we + * can just go on and map it partially. + * If not, with the large swapin check above failing, the page table + * have changed, so sub pages might got charged to the wrong cgroup, + * or even should be shmem. So we have to free it and fallback. + * Nothing should have touched it, both anon and shmem checks if a + * large folio is fully appliable before use. 
+ * + * This will be removed once we unify folio allocation in the swap cache + * layer, where allocation of a folio stabilizes the swap entries. + */ + if (!folio_test_anon(folio) && folio_test_large(folio) && + nr_pages != folio_nr_pages(folio)) { + if (!WARN_ON_ONCE(folio_test_dirty(folio))) + swap_cache_del_folio(folio); + goto out_nomap; + } + /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { + /* + * The can_swapin_thp check above ensures all PTE have + * same exclusivenss, only check one PTE is fine. + */ exclusive = pte_swp_exclusive(vmf->orig_pte); + if (exclusive) + check_swap_exclusive(folio, entry, nr_pages); if (folio != swapcache) { /* * We have a fresh page that is not exposed to the @@ -4419,19 +4408,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). */ arch_swap_restore(folio_swap(entry, folio), folio); - /* - * Remove the swap entry and conditionally try to free up the swapcache. - * We're already holding a reference on the page but haven't mapped it - * yet. - */ - swap_free_nr(entry, nr_pages); - if (should_try_to_free_swap(folio, vma, vmf->flags)) - folio_free_swap(folio); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); @@ -4459,22 +4439,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ - if (unlikely(folio != swapcache && swapcache)) { + if (unlikely(folio != swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); + folio_put_swap(swapcache, NULL); } else if (!folio_test_anon(folio)) { /* - * We currently only expect small !anon folios which are either - * fully exclusive or fully shared, or new allocated large - * folios which are fully exclusive. If we ever get large - * folios within swapcache here, we have to be careful. + * We currently only expect !anon folios that are fully + * mappable. See the comment after can_swapin_thp above. */ - VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); - VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio); + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); + folio_put_swap(folio, NULL); } else { + VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio)); folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, - rmap_flags); + rmap_flags); + folio_put_swap(folio, nr_pages == 1 ? page : NULL); } VM_BUG_ON(!folio_test_anon(folio) || @@ -4483,13 +4465,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages); + /* + * Remove the swap entry and conditionally try to free up the + * swapcache. Do it after mapping so any raced page fault will + * see the folio in swap cache and wait for us. 
+ */ + if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags)) + folio_free_swap(folio); + folio_unlock(folio); - if (folio != swapcache && swapcache) { + if (unlikely(folio != swapcache)) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For - * further safety release the lock after the swap_free + * further safety release the lock after the folio_put_swap * so that the swap count won't change under a * parallel locked swapcache. */ @@ -4510,9 +4500,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: - /* Clear the swap cache pin for direct swapin after PTL unlock */ - if (need_clear_cache) - swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; @@ -4520,15 +4507,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: + if (folio_test_swapcache(folio)) + folio_free_swap(folio); folio_unlock(folio); out_release: folio_put(folio); - if (folio != swapcache && swapcache) { + if (folio != swapcache) { folio_unlock(swapcache); folio_put(swapcache); } - if (need_clear_cache) - swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; diff --git a/mm/migrate.c b/mm/migrate.c index b5b9174af5ce683a045ed9400b812726b589ad8a..57771f23a368dbe4dba2a06ed84d4562b6a2dbe5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -401,7 +401,7 @@ static int folio_expected_refs(struct address_space *mapping, int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count) { - XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; @@ -548,7 +548,7 @@ EXPORT_SYMBOL(folio_migrate_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { - XA_STATE(xas, &mapping->i_pages, folio_index(src)); + XA_STATE(xas, &mapping->i_pages, src->index); int expected_count; xas_lock_irq(&xas); diff --git a/mm/rmap.c b/mm/rmap.c index 43f570549d3a8fc78b9fb22128a1075c1bd655bc..8b2e9133a8fafd316266d3771229d4dc228c568f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -83,6 +83,7 @@ #include #include "internal.h" +#include "swap.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; @@ -1833,14 +1834,14 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } - if (swap_duplicate(entry) < 0) { + if (folio_dup_swap(folio, subpage) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (arch_unmap_one(mm, vma, address, pteval) < 0) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); @@ -1850,7 +1851,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* See folio_try_share_anon_rmap(): clear PTE first. 
*/ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); diff --git a/mm/shmem.c b/mm/shmem.c index 82c9943037bd553b1e68215decc8c0df6a54e870..6cc1bdb5e1bfbbc13598e8d93fb61da81c52f58e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -283,7 +283,7 @@ bool vma_is_shmem(struct vm_area_struct *vma) } static LIST_HEAD(shmem_swaplist); -static DEFINE_MUTEX(shmem_swaplist_mutex); +static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA @@ -423,10 +423,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) +static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); + bool first_swapped = false; long freed; spin_lock(&info->lock); @@ -441,8 +444,11 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ - if (swapped > 0) + if (swapped > 0) { + if (info->swapped == swapped) + first_swapped = true; freed += swapped; + } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); @@ -450,6 +456,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); + return first_swapped; } bool shmem_charge(struct inode *inode, long pages) @@ -893,7 +900,7 @@ static long shmem_free_swap(struct address_space *mapping, old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) return 0; - free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + swap_put_entries_direct(radix_to_swp_entry(radswap), 1 << order); return 1 << order; } @@ -1310,11 +1317,11 @@ static void shmem_evict_inode(struct inode *inode) /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); } } @@ -1448,7 +1455,7 @@ int shmem_unuse(unsigned int type) if (pagecache_limit_should_shrink()) shrink_page_cache(GFP_KERNEL, NULL); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { @@ -1462,12 +1469,12 @@ int shmem_unuse(unsigned int type) * (igrab() would protect from unlink, but not from unmount). 
*/ atomic_inc(&info->stop_eviction); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) @@ -1478,7 +1485,7 @@ int shmem_unuse(unsigned int type) if (!info->swapped) list_del_init(&info->swaplist); } - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); if (!error && pagecache_limit_should_shrink()) shrink_page_cache(GFP_KERNEL, NULL); @@ -1577,30 +1584,59 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) folio_mark_uptodate(folio); } - /* - * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the folio is - * moved to swap cache, when its pagelock no longer protects - * the inode from eviction. But don't unlock the mutex until - * we've incremented swapped, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under this mutex. - */ - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add(&info->swaplist, &shmem_swaplist); + if (!folio_alloc_swap(folio)) { + bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); + int error; + + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * removed from page cache, when its pagelock no longer + * protects the inode from eviction. And do it now, after + * we've incremented swapped, because shmem_unuse() will + * prune a !swapped inode from the swaplist. + */ + if (first_swapped) { + spin_lock(&shmem_swaplist_lock); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - shmem_recalc_inode(inode, 0, nr_pages); - swap_shmem_alloc(folio->swap, nr_pages); + folio_dup_swap(folio, NULL); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); - mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - return swap_writepage(&folio->page, wbc); + error = swap_writepage(&folio->page, wbc); + if (error != AOP_WRITEPAGE_ACTIVATE) { + /* folio has been unlocked */ + return error; + } + + /* + * The intention here is to avoid holding on to the swap when + * zswap was unable to compress and unable to writeback; but + * it will be appropriate if other reactivate cases are added. + */ + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(folio->swap), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + /* Swap entry might be erased by racing shmem_free_swap() */ + if (!error) { + shmem_recalc_inode(inode, 0, -nr_pages); + folio_put_swap(folio, NULL); + } + + /* + * The delete_from_swap_cache() below could be left for + * shrink_folio_list()'s folio_free_swap() to dispose of; + * but I'm a little nervous about letting this folio out of + * shmem_writeout() in a hybrid half-tmpfs-half-swap state + * e.g. folio_mapping(folio) might give an unexpected answer. 
+ */ + swap_cache_del_folio(folio); + goto redirty; } - if (!info->swapped) - list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); if (nr_pages > 1) goto try_split; redirty: @@ -1820,6 +1856,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; + pgoff_t aligned_index; long pages; int error, order; @@ -1833,10 +1870,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; - index = round_down(index, pages); - folio = shmem_alloc_folio(gfp, order, info, index); - if (folio) + aligned_index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, aligned_index); + if (folio) { + index = aligned_index; goto allocated; + } if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); @@ -1918,10 +1957,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new, *swapcache; int nr_pages = 1 << order; - struct folio *new; gfp_t alloc_gfp; - void *shadow; /* * We have arrived here because our zones are constrained, so don't @@ -1961,34 +1999,19 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, goto fallback; } - /* - * Prevent parallel swapin from proceeding with the swap cache flag. - * - * Of course there is another possible concurrent scenario as well, - * that is to say, the swap cache flag of a large folio has already - * been set by swapcache_prepare(), while another thread may have - * already split the large swap entry stored in the shmem mapping. - * In this case, shmem_add_to_page_cache() will help identify the - * concurrent swapin and return -EEXIST. - */ - if (swapcache_prepare(entry, nr_pages)) { + swapcache = swapin_folio(entry, new); + if (swapcache != new) { folio_put(new); - new = ERR_PTR(-EEXIST); - /* Try smaller folio to avoid cache conflict */ - goto fallback; + if (!swapcache) { + /* + * The new folio is charged already, swapin can + * only fail due to another raced swapin. + */ + new = ERR_PTR(-EEXIST); + goto fallback; + } } - - __folio_set_locked(new); - __folio_set_swapbacked(new); - new->swap = entry; - - mem_cgroup_swapin_uncharge_swap(entry, nr_pages); - shadow = swap_cache_get_shadow(entry); - if (shadow) - workingset_refault(new, shadow); - folio_add_lru(new); - swap_read_folio(new, NULL); - return new; + return swapcache; fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) @@ -2078,8 +2101,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap, - bool skip_swapcache) + struct folio *folio, swp_entry_t swap) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; @@ -2095,15 +2117,14 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - if (!skip_swapcache) - swap_cache_del_folio(folio); + folio_put_swap(folio, NULL); + swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). 
*/ shmem_recalc_inode(inode, -nr_pages, -nr_pages); - swap_free_nr(swap, nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, @@ -2201,7 +2222,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swp_entry_t swap, index_entry; struct swap_info_struct *si; struct folio *folio = NULL; - bool skip_swapcache = false; int error, nr_pages, order; pgoff_t offset; #ifdef CONFIG_CGROUP_SLI @@ -2247,7 +2267,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = NULL; goto failed; } - skip_swapcache = true; } else { #ifdef CONFIG_CGROUP_SLI sli_memlat_stat_start(&start); @@ -2309,9 +2328,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * and swap cache folios are never partially freed. */ folio_lock(folio); - if ((!skip_swapcache && !folio_test_swapcache(folio)) || - shmem_confirm_swap(mapping, index, swap) < 0 || - folio->swap.val != swap.val) { + if (!folio_matches_swap_entry(folio, swap) || + shmem_confirm_swap(mapping, index, swap) < 0) { error = -EEXIST; goto unlock; } @@ -2344,14 +2362,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - if (skip_swapcache) { - folio->swap.val = 0; - swapcache_clear(si, swap, nr_pages); - } else { - swap_cache_del_folio(folio); - } + folio_put_swap(folio, NULL); + swap_cache_del_folio(folio); folio_mark_dirty(folio); - swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; @@ -2360,14 +2373,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap, - skip_swapcache); + shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) folio_unlock(folio); failed_nolock: - if (skip_swapcache) - swapcache_clear(si, folio->swap, folio_nr_pages(folio)); if (folio) folio_put(folio); put_swap_device(si); diff --git a/mm/swap.h b/mm/swap.h index d6127148ff334e47b05a5eb37195758b0fa026e9..d0a74656cc8c2f1017e226de3b2ba9fbd237ff16 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -180,6 +180,33 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +/* + * Below are the core routines for doing swap for a folio. + * All helpers requires the folio to be locked, and a locked folio + * in the swap cache pins the swap entries / slots allocated to the + * folio, swap relies heavily on the swap cache and folio lock for + * synchronization. + * + * folio_alloc_swap(): the entry point for a folio to be swapped + * out. It allocates swap slots and pins the slots with swap cache. + * The slots start with a swap count of zero. + * + * folio_dup_swap(): increases the swap count of a folio, usually + * during it gets unmapped and a swap entry is installed to replace + * it (e.g., swap entry in page table). A swap slot with swap + * count == 0 should only be increasd by this helper. + * + * folio_put_swap(): does the opposite thing of folio_dup_swap(). 
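+ *
+ * A typical life cycle, as a rough sketch (each step assumes the folio
+ * is locked; error handling omitted):
+ *
+ *   folio_alloc_swap(folio)      - allocate slots and pin them with the
+ *                                  swap cache, swap count starts at 0
+ *   folio_dup_swap(folio, page)  - one count per swap entry installed
+ *                                  (page table PTE, shmem mapping, ...)
+ *   folio_put_swap(folio, page)  - drop that count again at swapin
+ *   swap_cache_del_folio(folio)  - slots are freed once neither a count
+ *                                  nor the cache reference remains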
+ */ +int folio_alloc_swap(struct folio *folio); +int folio_dup_swap(struct folio *folio, struct page *subpage); +void folio_put_swap(struct folio *folio, struct page *subpage); + +/* For internal use */ +extern void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, unsigned int nr_pages); + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -242,11 +269,16 @@ static inline bool folio_matches_swap_entry(struct folio *folio, * swap entries in the page table, similar to locking swap cache folio. * - See the comment of get_swap_device() for more complex usage. */ +bool swap_cache_check_folio(swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); void swap_cache_del_folio(struct folio *folio); +struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, + struct mempolicy *mpol, pgoff_t ilx, + bool *alloced); /* Below helpers require the caller to lock and pass in the swap cluster. */ +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry); void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow); void __swap_cache_replace_folio(struct swap_cluster_info *ci, @@ -258,13 +290,11 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); -struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, - struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, - bool skip_if_exists); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); +struct folio *swapin_folio(swp_entry_t entry, struct folio *folio); void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); @@ -300,8 +330,6 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - pgoff_t offset = swp_offset(entry); int i; /* @@ -310,8 +338,9 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) * be in conflict with the folio in swap cache. 
*/ for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + if (swap_cache_check_folio(entry)) return i; + entry.val++; } return i; @@ -350,9 +379,24 @@ static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) return NULL; } +static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp) +{ + return -EINVAL; +} + +static inline int folio_dup_swap(struct folio *folio, struct page *page) +{ + return -EINVAL; +} + +static inline void folio_put_swap(struct folio *folio, struct page *page) +{ +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } + static inline void swap_write_unplug(struct swap_iocb *sio) { } @@ -383,6 +427,11 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +{ + return NULL; +} + static inline int swap_writepage(struct page *p, struct writeback_control *wbc) { return 0; @@ -393,8 +442,9 @@ static inline void swap_update_readahead(struct folio *folio, { } -static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) +static inline bool swap_cache_check_folio(swp_entry_t entry) { + return false; } static inline struct folio *swap_cache_get_folio(swp_entry_t entry) @@ -407,7 +457,8 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) +static inline void *__swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) { } @@ -441,25 +492,4 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) return 0; } #endif /* CONFIG_SWAP */ - -/** - * folio_index - File index of a folio. - * @folio: The folio. - * - * For a folio which is either in the page cache or the swap cache, - * return its index within the address_space it belongs to. If you know - * the folio is definitely in the page cache, you can look at the folio's - * index directly. - * - * Return: The index (offset in units of pages) of a folio in its file. - */ -static inline pgoff_t folio_index(struct folio *folio) -{ -#ifdef CONFIG_SWAP - if (unlikely(folio_test_swapcache(folio))) - return swp_offset(folio->swap); -#endif - return folio->index; -} - #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 836a2df2c1d62bf30068b07929d12db2ac1e559d..36297452396150ef9de3ccfa77fc4ab4d684954d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -104,6 +104,22 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) return NULL; } +/** + * swap_cache_check_folio - Check if a swap slot has cache. + * @entry: swap entry indicating the slot. + * + * Context: Caller must ensure @entry is valid and protect the swap + * device with reference count or locks. + */ +bool swap_cache_check_folio(swp_entry_t entry) +{ + unsigned long swp_tb; + + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + return swp_tb_is_folio(swp_tb); +} + /** * swap_cache_get_shadow - Looks up a shadow in the swap cache. * @entry: swap entry used for the lookup. 
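The result of these lookups is only stable once the folio is locked; both do_swap_page() and shmem_swapin_folio() in this patch therefore re-verify after locking, along the lines of this sketch (not lifted verbatim from either caller):

	folio = swap_cache_get_folio(entry);
	if (folio) {
		folio_lock(folio);
		if (!folio_matches_swap_entry(folio, entry)) {
			/* the folio was reused for another entry, bail out */
			folio_unlock(folio);
			folio_put(folio);
			folio = NULL;
		}
	}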
@@ -123,52 +139,92 @@ void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) +{ + unsigned long new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_off = ci_start; + ci_end = ci_start + nr_pages; + do { + VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); + __swap_table_set(ci, ci_off, new_tb); + } while (++ci_off < ci_end); + + folio_ref_add(folio, nr_pages); + folio_set_swapcache(folio); + folio->swap = entry; + + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); +} + /** * swap_cache_add_folio - Add a folio into the swap cache. * @folio: The folio to be added. * @entry: The swap entry corresponding to the folio. * @gfp: gfp_mask for XArray node allocation. * @shadowp: If a shadow is found, return the shadow. + * @alloc: If it's the allocator that is trying to insert a folio. Allocator + * sets SWAP_HAS_CACHE to pin slots before insert so skip map update. * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * The caller also needs to update the corresponding swap_map slots with * SWAP_HAS_CACHE bit to avoid race or conflict. */ -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) +static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadowp) { + int err; void *shadow = NULL; - unsigned long old_tb, new_tb; + unsigned long old_tb; + struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_start, ci_off, ci_end, offset; unsigned long nr_pages = folio_nr_pages(folio); - VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); - VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); - VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - - new_tb = folio_to_swp_tb(folio); + si = __swap_entry_to_info(entry); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + offset = swp_offset(entry); + ci = swap_cluster_lock(si, swp_offset(entry)); + if (unlikely(!ci->table)) { + err = -ENOENT; + goto failed; + } do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); - WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + old_tb = __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb))) { + err = -EEXIST; + goto failed; + } + if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + err = -ENOENT; + goto failed; + } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); + offset++; } while (++ci_off < ci_end); - - folio_ref_add(folio, nr_pages); - folio_set_swapcache(folio); - folio->swap = entry; + __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); - - node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); - lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - if (shadowp) *shadowp = shadow; + return 0; + +failed: + swap_cluster_unlock(ci); + return err; } /** @@ -187,8 +243,10 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp void 
__swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + struct swap_info_struct *si; unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; + bool folio_swapped = false, need_free = false; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); @@ -196,6 +254,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + si = __swap_entry_to_info(entry); new_tb = shadow_swp_to_tb(shadow); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; @@ -205,12 +264,27 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, old_tb = __swap_table_xchg(ci, ci_off, new_tb); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); + if (__swap_count(swp_entry(si->type, + swp_offset(entry) + ci_off - ci_start))) + folio_swapped = true; + else + need_free = true; } while (++ci_off < ci_end); folio->swap.val = 0; folio_clear_swapcache(folio); node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); + + if (!folio_swapped) { + swap_entries_free(si, ci, swp_offset(entry), nr_pages); + } else if (need_free) { + do { + if (!__swap_count(entry)) + swap_entries_free(si, ci, swp_offset(entry), 1); + entry.val++; + } while (--nr_pages); + } } /** @@ -232,7 +306,6 @@ void swap_cache_del_folio(struct folio *folio) __swap_cache_del_folio(ci, folio, entry, NULL); swap_cluster_unlock(ci); - put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ -412,118 +485,147 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, } } -struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, - bool skip_if_exists) +/** + * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache. + * @entry: swap entry to be bound to the folio. + * @folio: folio to be added. + * @gfp: memory allocation flags for charge, can be 0 if @charged if true. + * @charged: if the folio is already charged. + * + * Update the swap_map and add folio as swap cache, typically before swapin. + * All swap slots covered by the folio must have a non-zero swap count. + * + * Context: Caller must protect the swap device with reference count or locks. + * Return: Returns the folio being added on success. Returns the existing + * folio if @entry is cached. Returns NULL if raced with swapin or swapoff. + */ +static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, + struct folio *folio, + gfp_t gfp, bool charged) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - struct folio *folio; - struct folio *new_folio = NULL; - struct folio *result = NULL; - void *shadow = NULL; - - *new_page_allocated = false; + struct folio *swapcache = NULL; + void *shadow; + int ret; + __folio_set_locked(folio); + __folio_set_swapbacked(folio); for (;;) { - int err; + ret = swap_cache_add_folio(folio, entry, &shadow); + if (!ret) + break; /* - * Check the swap cache first, if a cached folio is found, - * return it unlocked. The caller will lock and check it. 
+ * Large order allocation needs special handling on + * race: if a smaller folio exists in cache, swapin needs + * to fallback to order 0, and doing a swap cache lookup + * might return a folio that is irrelevant to the faulting + * entry because @entry is aligned down. Just return NULL. */ - folio = swap_cache_get_folio(entry); - if (folio) - goto got_folio; + if (ret != -EEXIST || folio_test_large(folio)) + goto failed; - /* - * Just skip read ahead for unused swap slot. - */ - if (!swap_entry_swapped(si, entry)) - goto put_and_return; + swapcache = swap_cache_get_folio(entry); + if (swapcache) + goto failed; + } - /* - * Get a new folio to read into from swap. Allocate it now if - * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, - * when -EEXIST will cause any racers to loop around until we - * add it to cache. - */ - if (!new_folio) { - new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!new_folio) - goto put_and_return; - } + if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { + swap_cache_del_folio(folio); + goto failed; + } - /* - * Swap entry may have been freed since our caller observed it. - */ - err = swapcache_prepare(entry, 1); - if (!err) - break; - else if (err != -EEXIST) - goto put_and_return; + mem_cgroup_swapin_uncharge_swap(entry, folio_nr_pages(folio)); + if (shadow) + workingset_refault(folio, shadow); - /* - * Protect against a recursive call to __read_swap_cache_async() - * on the same entry waiting forever here because SWAP_HAS_CACHE - * is set but the folio is not the swap cache yet. This can - * happen today if mem_cgroup_swapin_charge_folio() below - * triggers reclaim through zswap, which may call - * __read_swap_cache_async() in the writeback path. - */ - if (skip_if_exists) - goto put_and_return; + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); + return folio; - /* - * We might race against __swap_cache_del_folio(), and - * stumble across a swap_map entry whose SWAP_HAS_CACHE - * has not yet been cleared. Or race against another - * __read_swap_cache_async(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its folio to swap cache. - */ - schedule_timeout_uninterruptible(1); - } +failed: + folio_unlock(folio); + return swapcache; +} - /* - * The swap entry is ours to swap in. Prepare the new folio. - */ - __folio_set_locked(new_folio); - __folio_set_swapbacked(new_folio); +/** + * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. + * @entry: the swapped out swap entry to be binded to the folio. + * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE + * @new_page_allocated: sets true if allocation happened, false otherwise + * + * Allocate a folio in the swap cache for one swap slot, typically before + * doing IO (swap in or swap out). The swap slot indicated by @entry must + * have a non-zero swap count (swapped out). Currently only supports order 0. + * + * Context: Caller must protect the swap device with reference count or locks. + * Return: Returns the existing folio if @entry is cached already. Returns + * NULL if failed due to -ENOMEM or @entry have a swap count < 1. 
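+ *
+ * Typical readahead usage, as a sketch (the caller only starts the read
+ * if it was the one that inserted the folio):
+ *
+ *	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
+ *				       &page_allocated);
+ *	if (folio && page_allocated)
+ *		swap_read_folio(folio, NULL);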
+ */ +struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) +{ + struct swap_info_struct *si = __swap_entry_to_info(entry); + struct folio *folio; + struct folio *result = NULL; - if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) - goto fail_unlock; + *new_page_allocated = false; + /* Check the swap cache again for readahead path. */ + folio = swap_cache_get_folio(entry); + if (folio) + return folio; - swap_cache_add_folio(new_folio, entry, &shadow); - mem_cgroup_swapin_uncharge_swap(entry, 1); + /* Skip allocation for unused and bad swap slot for readahead. */ + if (!swap_entry_swapped(si, swp_offset(entry))) + return NULL; - if (shadow) - workingset_refault(new_folio, shadow); - - /* Caller will initiate read into locked new_folio */ - folio_add_lru(new_folio); - *new_page_allocated = true; - folio = new_folio; -got_folio: - result = folio; - goto put_and_return; - -fail_unlock: - put_swap_folio(new_folio, entry); - folio_unlock(new_folio); -put_and_return: - if (!(*new_page_allocated) && new_folio) - folio_put(new_folio); + /* Allocate a new folio to be added into the swap cache. */ + folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!folio) + return NULL; + /* Try add the new folio, returns existing folio or NULL on failure. */ + result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); + if (result == folio) + *new_page_allocated = true; + else + folio_put(folio); return result; } +/** + * swapin_folio - swap-in one or multiple entries skipping readahead. + * @entry: starting swap entry to swap in + * @folio: a new allocated and charged folio + * + * Reads @entry into @folio, @folio will be added to the swap cache. + * If @folio is a large folio, the @entry will be rounded down to align + * with the folio size. + * + * Return: returns pointer to @folio on success. If folio is a large folio + * and this raced with another swapin, NULL will be returned. Else, if + * another folio was already added to the swap cache, return that swap + * cache folio instead. + */ +struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +{ + struct folio *swapcache; + pgoff_t offset = swp_offset(entry); + unsigned long nr_pages = folio_nr_pages(folio); + + entry = swp_entry(swp_type(entry), round_down(offset, nr_pages)); + swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true); + if (swapcache == folio) + swap_read_folio(folio, NULL); + return swapcache; +} + /* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. - * - * get/put_swap_device() aren't needed to call this function, because - * __read_swap_cache_async() call them and swap_read_folio() holds the - * swap cache folio lock. 
*/ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, @@ -540,8 +642,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); - folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, + &page_allocated); mpol_cond_put(mpol); if (page_allocated) @@ -659,9 +761,9 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - folio = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, mpol, ilx, &page_allocated, false); + folio = swap_cache_alloc_folio( + swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, + &page_allocated); if (!folio) continue; if (page_allocated) { @@ -678,8 +780,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, + &page_allocated); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; @@ -773,8 +875,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, + &page_allocated); if (!folio) continue; if (page_allocated) { @@ -793,8 +895,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ - folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; diff --git a/mm/swapfile.c b/mm/swapfile.c index 5b2a3004bcbe098d0d88b145a429a2447031953f..f933e976984e20716805abdcb4d11cf9552699f3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,16 +47,18 @@ #include #include "swap_table.h" #include "internal.h" +#include "swap_table.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); +static void swap_put_entry_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -74,7 +76,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. 
reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority = -1; +#define DEF_SWAP_PRIO -1 unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; @@ -104,7 +106,7 @@ EXPORT_SYMBOL(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static struct plist_head *swap_avail_heads; +static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -145,11 +147,6 @@ static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) return swap_type_to_info(swp_type(entry)); } -static inline unsigned char swap_count(unsigned char ent) -{ - return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ -} - /* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can @@ -181,39 +178,25 @@ static long swap_usage_in_pages(struct swap_info_struct *si) #define TTRS_FULL 0x4 static bool swap_only_has_cache(struct swap_info_struct *si, - unsigned long offset, int nr_pages) + struct swap_cluster_info *ci, + unsigned long offset, int nr_pages) { + unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; + unsigned long swp_tb; do { - VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); - if (*map != SWAP_HAS_CACHE) + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); + if (*map) return false; + ++ci_off; } while (++map < map_end); return true; } -static bool swap_is_last_map(struct swap_info_struct *si, - unsigned long offset, int nr_pages, bool *has_cache) -{ - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; - unsigned char count = *map; - - if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) - return false; - - while (++map < map_end) { - if (*map != count) - return false; - } - - *has_cache = !!(count & SWAP_HAS_CACHE); - return true; -} - /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no @@ -269,7 +252,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, offset, nr_pages); + need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -595,7 +578,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info * this returns NULL for an non-empty list. */ static struct swap_cluster_info *isolate_lock_cluster( - struct swap_info_struct *si, struct list_head *list, int order) + struct swap_info_struct *si, struct list_head *list) { struct swap_cluster_info *ci, *found = NULL; @@ -752,14 +735,14 @@ static void relocate_cluster(struct swap_info_struct *si, } /* - * The cluster corresponding to page_nr will be used. The cluster will not be - * added to free cluster list and its usage counter will be increased by 1. - * Only used for initialization. + * The cluster corresponding to @offset will be accounted as having one bad + * slot. The cluster will not be added to the free cluster list, and its + * usage counter will be increased by 1. Only used for initialization. 
*/ -static int inc_cluster_info_page(struct swap_info_struct *si, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info, + unsigned long offset) { - unsigned long idx = page_nr / SWAPFILE_CLUSTER; + unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_table *table; struct swap_cluster_info *ci; @@ -773,74 +756,80 @@ static int inc_cluster_info_page(struct swap_info_struct *si, ci->count++; - VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); - VM_BUG_ON(ci->flags); + WARN_ON(ci->count > SWAPFILE_CLUSTER); + WARN_ON(ci->flags); return 0; } -static bool cluster_reclaim_range(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long start, unsigned long end) +static unsigned int cluster_reclaim_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned int order) { + unsigned int nr_pages = 1 << order; + unsigned long offset = start, end = start + nr_pages; unsigned char *map = si->swap_map; - unsigned long offset = start; - int nr_reclaim; + unsigned long swp_tb; spin_unlock(&ci->lock); do { - switch (READ_ONCE(map[offset])) { - case 0: - offset++; - break; - case SWAP_HAS_CACHE: - nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); - if (nr_reclaim > 0) - offset += nr_reclaim; - else - goto out; + if (READ_ONCE(map[offset])) break; - default: - goto out; + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb)) { + if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) + break; } - } while (offset < end); -out: + } while (++offset < end); spin_lock(&ci->lock); + + /* + * We just dropped ci->lock so cluster could be used by another + * order or got freed, check if it's still usable or empty. + */ + if (!cluster_is_usable(ci, order)) + return SWAP_ENTRY_INVALID; + if (cluster_is_empty(ci)) + return cluster_offset(si, ci); + /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. 
*/ - for (offset = start; offset < end; offset++) - if (READ_ONCE(map[offset])) - return false; + for (offset = start; offset < end; offset++) { + swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (map[offset] || !swp_tb_is_null(swp_tb)) + return SWAP_ENTRY_INVALID; + } - return true; + return start; } static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long start, unsigned int nr_pages, + unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { - unsigned long offset, end = start + nr_pages; + unsigned long end = offset + nr_pages; unsigned char *map = si->swap_map; + unsigned long swp_tb; if (cluster_is_empty(ci)) return true; - for (offset = start; offset < end; offset++) { - switch (READ_ONCE(map[offset])) { - case 0: - continue; - case SWAP_HAS_CACHE: + do { + if (map[offset]) + return false; + swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; - continue; - default: - return false; + } else { + /* A entry with no count and no cache must be null */ + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); } - } + } while (++offset < end); return true; } @@ -865,28 +854,48 @@ static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, } } -static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned int start, unsigned char usage, - unsigned int order) +static bool cluster_alloc_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int offset) { - unsigned int nr_pages = 1 << order; + unsigned long nr_pages; + unsigned int order; lockdep_assert_held(&ci->lock); if (!(si->flags & SWP_WRITEOK)) return false; + /* + * All mm swap allocation starts with a folio (folio_alloc_swap), + * it's also the only allocation path for large orders allocation. + * Such swap slots starts with count == 0 and will be increased + * upon folio unmap. + * + * Else, it's a exclusive order 0 allocation for hibernation. + * The slot starts with count == 1 and never increases. + */ + if (likely(folio)) { + order = folio_order(folio); + nr_pages = 1 << order; + __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + } else { + order = 0; + nr_pages = 1; + WARN_ON_ONCE(si->swap_map[offset]); + si->swap_map[offset] = 1; + swap_cluster_assert_table_empty(ci, offset, 1); + } + /* * The first allocation in a cluster makes the * cluster exclusive to this order */ if (cluster_is_empty(ci)) ci->order = order; - - memset(si->swap_map + start, usage, nr_pages); - swap_cluster_assert_table_empty(ci, start, nr_pages); - swap_range_alloc(si, nr_pages); ci->count += nr_pages; + swap_range_alloc(si, nr_pages); return true; } @@ -894,15 +903,14 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster /* Try use a new cluster for current CPU and allocate from it. */ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long offset, - unsigned int order, - unsigned char usage) + struct folio *folio, unsigned long offset) { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); + unsigned int order = likely(folio) ? 
folio_order(folio) : 0; unsigned int nr_pages = 1 << order; - bool need_reclaim, ret; + bool need_reclaim; lockdep_assert_held(&ci->lock); @@ -914,22 +922,13 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) continue; if (need_reclaim) { - ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages); - /* - * Reclaim drops ci->lock and cluster could be used - * by another order. Not checking flag as off-list - * cluster has no flag set, and change of list - * won't cause fragmentation. - */ - if (!cluster_is_usable(ci, order)) - goto out; - if (cluster_is_empty(ci)) - offset = start; + found = cluster_reclaim_range(si, ci, offset, order); /* Reclaim failed but cluster is usable, try next */ - if (!ret) + if (!found) continue; + offset = found; } - if (!cluster_alloc_range(si, ci, offset, usage, order)) + if (!cluster_alloc_range(si, ci, folio, offset)) break; found = offset; offset += nr_pages; @@ -951,20 +950,19 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, struct list_head *list, - unsigned int order, - unsigned char usage, + struct folio *folio, bool scan_all) { unsigned int found = SWAP_ENTRY_INVALID; do { - struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); + struct swap_cluster_info *ci = isolate_lock_cluster(si, list); unsigned long offset; if (!ci) break; offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + found = alloc_swap_scan_cluster(si, ci, folio, offset); if (found) break; } while (scan_all); @@ -983,13 +981,14 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { + while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; while (offset < end) { - if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + if (!READ_ONCE(map[offset]) && + swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1025,10 +1024,11 @@ static void swap_reclaim_work(struct work_struct *work) * Try to allocate swap entries with specified order and try set a new * cluster for current CPU too. */ -static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, - unsigned char usage) +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, + struct folio *folio) { struct swap_cluster_info *ci; + unsigned int order = likely(folio) ? folio_order(folio) : 0; unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; /* @@ -1050,8 +1050,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, - order, usage); + found = alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } @@ -1065,22 +1064,19 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o * to spread out the writes. 
*/ if (si->flags & SWP_PAGE_DISCARD) { - found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, - false); + found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } if (order < PMD_ORDER) { - found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], - order, usage, true); + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true); if (found) goto done; } if (!(si->flags & SWP_PAGE_DISCARD)) { - found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, - false); + found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } @@ -1096,8 +1092,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o * failure is not critical. Scanning one cluster still * keeps the list rotated and reclaimed (for HAS_CACHE). */ - found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, - usage, false); + found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false); if (found) goto done; } @@ -1111,13 +1106,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ - found = alloc_swap_scan_list(si, &si->frag_clusters[o], - 0, usage, true); + found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true); if (found) goto done; - found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], - 0, usage, true); + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true); if (found) goto done; } @@ -1131,7 +1124,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { - int nid; unsigned long pages; spin_lock(&swap_avail_lock); @@ -1160,8 +1152,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) goto skip; } - for_each_node(nid) - plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1170,7 +1161,6 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { - int nid; long val; unsigned long pages; @@ -1203,8 +1193,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) goto skip; } - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1312,12 +1301,12 @@ static bool get_swap_device_info(struct swap_info_struct *si) * Fast path try to get swap entries with specified order from current * CPU's swap entry pool (a cluster). 
*/ -static bool swap_alloc_fast(swp_entry_t *entry, - int order) +static bool swap_alloc_fast(struct folio *folio) { + unsigned int order = folio_order(folio); struct swap_cluster_info *ci; struct swap_info_struct *si; - unsigned int offset, found = SWAP_ENTRY_INVALID; + unsigned int offset; /* * Once allocated, swap_info_struct will never be completely freed, @@ -1332,41 +1321,33 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE); - if (found) - *entry = swp_entry(si->type, found); + alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } put_swap_device(si); - return !!found; + return folio_test_swapcache(folio); } /* Rotate the device and switch to a new cluster */ -static bool swap_alloc_slow(swp_entry_t *entry, - int order) +static void swap_alloc_slow(struct folio *folio) { - int node; - unsigned long offset; struct swap_info_struct *si, *next; - node = numa_node_id(); spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { /* Rotate the device and switch to a new cluster */ - plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); + plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { - offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); + cluster_alloc_swap_entry(si, folio); put_swap_device(si); - if (offset) { - *entry = swp_entry(si->type, offset); - return true; - } - if (order) - return false; + if (folio_test_swapcache(folio)) + return; + if (folio_test_large(folio)) + return; } spin_lock(&swap_avail_lock); @@ -1381,26 +1362,23 @@ static bool swap_alloc_slow(swp_entry_t *entry, * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ - if (plist_node_empty(&next->avail_lists[node])) + if (plist_node_empty(&si->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); - return false; } /* - * Discard pending clusters in a synchronized way when under high - * pressure to avoid OOM. + * Discard pending clusters in a synchronized way when under high pressure. * Return: true if any cluster is discarded. */ -bool swap_sync_discard(void) +static bool swap_sync_discard(void) { bool ret = false; - int nid = numa_node_id(); struct swap_info_struct *si, *next; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { if (si->flags & SWP_PAGE_DISCARD) @@ -1408,18 +1386,86 @@ bool swap_sync_discard(void) put_swap_device(si); } if (ret) - break; + return true; spin_lock(&swap_avail_lock); } spin_unlock(&swap_avail_lock); - return ret; + return false; +} + +/** + * swap_put_entries_cluster - Decrease the swap count of a set of slots. + * @si: The swap device. + * @start: start offset of slots. + * @nr: number of slots. + * @reclaim_cache: if true, also reclaim the swap cache. + * + * This helper decreases the swap count of a set of slots and tries to + * batch free them. Also reclaims the swap cache if @reclaim_cache is true. + * Context: The caller must ensure that all slots belong to the same + * cluster and their swap count doesn't go underflow. 
+ */ +static void swap_put_entries_cluster(struct swap_info_struct *si, + unsigned long start, int nr, + bool reclaim_cache) +{ + unsigned long offset = start, end = start + nr; + unsigned long batch_start = SWAP_ENTRY_INVALID; + struct swap_cluster_info *ci; + bool need_reclaim = false; + unsigned int nr_reclaimed; + unsigned long swp_tb; + unsigned int count; + + ci = swap_cluster_lock(si, offset); + do { + swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); + count = si->swap_map[offset]; + VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); + if (count == 1) { + /* count == 1 and non-cached slots will be batch freed. */ + if (!swp_tb_is_folio(swp_tb)) { + if (!batch_start) + batch_start = offset; + continue; + } + /* count will be 0 after put, slot can be reclaimed */ + need_reclaim = true; + } + /* + * A count != 1 or cached slot can't be freed. Put its swap + * count and then free the interrupted pending batch. Cached + * slots will be freed when folio is removed from swap cache + * (__swap_cache_del_folio). + */ + swap_put_entry_locked(si, ci, offset); + if (batch_start) { + swap_entries_free(si, ci, batch_start, offset - batch_start); + batch_start = SWAP_ENTRY_INVALID; + } + } while (++offset < end); + + if (batch_start) + swap_entries_free(si, ci, batch_start, offset - batch_start); + swap_cluster_unlock(ci); + + if (!need_reclaim || !reclaim_cache) + return; + + offset = start; + do { + nr_reclaimed = __try_to_reclaim_swap(si, offset, + TTRS_UNMAPPED | TTRS_FULL); + offset++; + if (nr_reclaimed) + offset = round_up(offset, abs(nr_reclaimed)); + } while (offset < end); } /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap - * @gfp: gfp mask for shadow nodes * * Allocate swap space for the folio and add the folio to the * swap cache. @@ -1427,11 +1473,10 @@ bool swap_sync_discard(void) * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. */ -int folio_alloc_swap(struct folio *folio, gfp_t gfp) +int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; - swp_entry_t entry = {}; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); @@ -1456,89 +1501,90 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) again: local_lock(&percpu_swap_cluster.lock); - if (!swap_alloc_fast(&entry, order)) - swap_alloc_slow(&entry, order); + if (!swap_alloc_fast(folio)) + swap_alloc_slow(folio); local_unlock(&percpu_swap_cluster.lock); - if (unlikely(!order && !entry.val)) { + if (!order && unlikely(!folio_test_swapcache(folio))) { if (swap_sync_discard()) goto again; } /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ - if (mem_cgroup_try_charge_swap(folio, entry)) - goto out_free; + if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) + swap_cache_del_folio(folio); - if (!entry.val) + if (unlikely(!folio_test_swapcache(folio))) return -ENOMEM; - swap_cache_add_folio(folio, entry, NULL); - return 0; +} + +/** + * folio_dup_swap() - Increase swap count of swap entries of a folio. + * @folio: folio with swap entries bounded. + * @subpage: if not NULL, only increase the swap count of this subpage. + * + * Context: Caller must ensure the folio is locked and in the swap cache. 
+ * The caller also has to ensure there is no raced call to + * swap_put_entries_direct before this helper returns, or the swap + * map may underflow (TODO: maybe we should allow or avoid underflow to + * make swap refcount lockless). + */ +int folio_dup_swap(struct folio *folio, struct page *subpage) +{ + int err = 0; + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); -out_free: - put_swap_folio(folio, entry); - return -ENOMEM; + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); + + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + + return err; } -static struct swap_info_struct *_swap_info_get(swp_entry_t entry) +/** + * folio_put_swap() - Decrease swap count of swap entries of a folio. + * @folio: folio with swap entries bounded, must be in swap cache and locked. + * @subpage: if not NULL, only decrease the swap count of this subpage. + * + * This won't free the swap slots even if swap count drops to zero, they are + * still pinned by the swap cache. User may call folio_free_swap to free them. + * Context: Caller must ensure the folio is locked and in the swap cache. + */ +void folio_put_swap(struct folio *folio, struct page *subpage) { - struct swap_info_struct *si; - unsigned long offset; + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); + struct swap_info_struct *si = __swap_entry_to_info(entry); - if (!entry.val) - goto out; - si = swap_entry_to_info(entry); - if (!si) - goto bad_nofile; - if (data_race(!(si->flags & SWP_USED))) - goto bad_device; - offset = swp_offset(entry); - if (offset >= si->max) - goto bad_offset; - if (data_race(!si->swap_map[swp_offset(entry)])) - goto bad_free; - return si; + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); -bad_free: - pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); - goto out; -bad_offset: - pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); - goto out; -bad_device: - pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); - goto out; -bad_nofile: - pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); -out: - return NULL; + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } -static unsigned char swap_entry_put_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, - unsigned char usage) +static void swap_put_entry_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset) { - unsigned long offset = swp_offset(entry); unsigned char count; - unsigned char has_cache; count = si->swap_map[offset]; - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (usage == SWAP_HAS_CACHE) { - VM_BUG_ON(!has_cache); - has_cache = 0; - } else if (count == SWAP_MAP_SHMEM) { - /* - * Or we could insist on shmem.c using a special - * swap_shmem_free() and free_shmem_swap_and_cache()... 
- */ - count = 0; - } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; @@ -1548,13 +1594,9 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, count--; } - usage = count | has_cache; - if (usage) - WRITE_ONCE(si->swap_map[offset], usage); - else - swap_entries_free(si, ci, entry, 1); - - return usage; + WRITE_ONCE(si->swap_map[offset], count); + if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) + swap_entries_free(si, ci, offset, 1); } /* @@ -1583,10 +1625,9 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon - * __read_swap_cache_async() - * swapcache_prepare() - * __swap_duplicate() - * // check swap_map + * swap_cache_alloc_folio() + * swap_cache_add_folio() + * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before @@ -1623,104 +1664,15 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) return NULL; } -static void swap_entries_put_cache(struct swap_info_struct *si, - swp_entry_t entry, int nr) -{ - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; - - ci = swap_cluster_lock(si, offset); - if (swap_only_has_cache(si, offset, nr)) { - swap_entries_free(si, ci, entry, nr); - } else { - for (int i = 0; i < nr; i++, entry.val++) - swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); - } - swap_cluster_unlock(ci); -} - -static bool swap_entries_put_map(struct swap_info_struct *si, - swp_entry_t entry, int nr) -{ - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; - bool has_cache = false; - unsigned char count; - int i; - - if (nr <= 1) - goto fallback; - count = swap_count(data_race(si->swap_map[offset])); - if (count != 1 && count != SWAP_MAP_SHMEM) - goto fallback; - - ci = swap_cluster_lock(si, offset); - if (!swap_is_last_map(si, offset, nr, &has_cache)) { - goto locked_fallback; - } - if (!has_cache) - swap_entries_free(si, ci, entry, nr); - else - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - swap_cluster_unlock(ci); - - return has_cache; -fallback: - ci = swap_cluster_lock(si, offset); -locked_fallback: - for (i = 0; i < nr; i++, entry.val++) { - count = swap_entry_put_locked(si, ci, entry, 1); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } - swap_cluster_unlock(ci); - return has_cache; -} - -/* - * Only functions with "_nr" suffix are able to free entries spanning - * cross multi clusters, so ensure the range is within a single cluster - * when freeing entries with functions without "_nr" suffix. - */ -static bool swap_entries_put_map_nr(struct swap_info_struct *si, - swp_entry_t entry, int nr) -{ - int cluster_nr, cluster_rest; - unsigned long offset = swp_offset(entry); - bool has_cache = false; - - cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; - while (nr) { - cluster_nr = min(nr, cluster_rest); - has_cache |= swap_entries_put_map(si, entry, cluster_nr); - cluster_rest = SWAPFILE_CLUSTER; - nr -= cluster_nr; - entry.val += cluster_nr; - } - - return has_cache; -} - -/* - * Check if it's the last ref of swap entry in the freeing path. - * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. 
- */ -static inline bool __maybe_unused swap_is_last_ref(unsigned char count) -{ - return (count == SWAP_HAS_CACHE) || (count == 1) || - (count == SWAP_MAP_SHMEM); -} - /* * Drop the last ref of swap entries, caller have to ensure all entries * belong to the same cgroup and cluster. */ -static void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, unsigned int nr_pages) { - unsigned long offset = swp_offset(entry); + swp_entry_t entry = swp_entry(si->type, offset); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; @@ -1731,7 +1683,7 @@ static void swap_entries_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(!swap_is_last_ref(*map)); + VM_WARN_ON(*map > 1); *map = 0; } while (++map < map_end); @@ -1745,66 +1697,29 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -/* - * Caller has made sure that the swap device corresponding to entry - * is still around or has not been recycled. - */ -void swap_free_nr(swp_entry_t entry, int nr_pages) -{ - int nr; - struct swap_info_struct *sis; - unsigned long offset = swp_offset(entry); - - sis = _swap_info_get(entry); - if (!sis) - return; - - while (nr_pages) { - nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); - offset += nr; - nr_pages -= nr; - } -} - -/* - * Called after dropping swapcache to decrease refcnt to swap entries. - */ -void put_swap_folio(struct folio *folio, swp_entry_t entry) -{ - struct swap_info_struct *si; - int size = 1 << swap_entry_order(folio_order(folio)); - - si = _swap_info_get(entry); - if (!si) - return; - - swap_entries_put_cache(si, entry, size); -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); - return swap_count(si->swap_map[offset]); + return si->swap_map[offset]; } -/* - * How many references to @entry are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. +/** + * swap_entry_swapped - Check if the swap entry at @offset is swapped. + * @si: the swap device. + * @offset: offset of the swap entry. 
*/ -bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) +bool swap_entry_swapped(struct swap_info_struct *si, unsigned long offset) { - pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count; ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; swap_cluster_unlock(ci); - return !!count; + + return count && count != SWAP_MAP_BAD; } /* @@ -1820,7 +1735,7 @@ int swp_swapcount(swp_entry_t entry) pgoff_t offset; unsigned char *map; - si = _swap_info_get(entry); + si = get_swap_device(entry); if (!si) return 0; @@ -1828,7 +1743,7 @@ int swp_swapcount(swp_entry_t entry) ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; if (!(count & COUNT_CONTINUED)) goto out; @@ -1850,6 +1765,7 @@ int swp_swapcount(swp_entry_t entry) } while (tmp_count & COUNT_CONTINUED); out: swap_cluster_unlock(ci); + put_swap_device(si); return count; } EXPORT_SYMBOL(swp_swapcount); @@ -1867,12 +1783,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { - if (swap_count(map[roffset])) + if (map[roffset]) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { - if (swap_count(map[offset + i])) { + if (map[offset + i]) { ret = true; break; } @@ -1885,13 +1801,14 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, static bool folio_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; - struct swap_info_struct *si = _swap_info_get(entry); + struct swap_info_struct *si; - if (!si) - return false; + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + si = __swap_entry_to_info(entry); if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return swap_entry_swapped(si, entry); + return swap_entry_swapped(si, swp_offset(entry)); return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } @@ -1948,73 +1865,45 @@ bool folio_free_swap(struct folio *folio) } /** - * free_swap_and_cache_nr() - Release reference on range of swap entries and - * reclaim their cache if no more references remain. + * swap_put_entries_direct() - Release reference on range of swap entries and + * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr). + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being released. */ -void free_swap_and_cache_nr(swp_entry_t entry, int nr) +void swap_put_entries_direct(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; + unsigned long offset, cluster_end; struct swap_info_struct *si; - bool any_only_cache = false; - unsigned long offset; si = get_swap_device(entry); - if (!si) + if (WARN_ON_ONCE(!si)) return; - - if (WARN_ON(end_offset > si->max)) - goto out; - - /* - * First free all entries in the range. - */ - any_only_cache = swap_entries_put_map_nr(si, entry, nr); - - /* - * Short-circuit the below loop if none of the entries had their - * reference drop to zero. 
- */ - if (!any_only_cache) + if (WARN_ON_ONCE(end_offset > si->max)) goto out; - /* - * Now go back over the range trying to reclaim the swap cache. - */ - for (offset = start_offset; offset < end_offset; offset += nr) { - nr = 1; - if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - /* - * Folios are always naturally aligned in swap so - * advance forward to the next boundary. Zero means no - * folio was found for the swap entry, so advance by 1 - * in this case. Negative value means folio was found - * but could not be reclaimed. Here we can still advance - * to the next boundary. - */ - nr = __try_to_reclaim_swap(si, offset, - TTRS_UNMAPPED | TTRS_FULL); - if (nr == 0) - nr = 1; - else if (nr < 0) - nr = -nr; - nr = ALIGN(offset + 1, nr) - offset; - } - } - + /* Put entries and reclaim cache in each cluster */ + offset = start_offset; + do { + cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset); + swap_put_entries_cluster(si, offset, cluster_end - offset, true); + offset = cluster_end; + } while (offset < end_offset); out: put_swap_device(si); } #ifdef CONFIG_HIBERNATION - -swp_entry_t get_swap_page_of_type(int type) +/* Allocate a slot for hibernation */ +swp_entry_t swap_alloc_hibernation_slot(int type) { struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; @@ -2031,7 +1920,7 @@ swp_entry_t get_swap_page_of_type(int type) * with swap table allocation. */ local_lock(&percpu_swap_cluster.lock); - offset = cluster_alloc_swap_entry(si, 0, 1); + offset = cluster_alloc_swap_entry(si, NULL); local_unlock(&percpu_swap_cluster.lock); if (offset) { entry = swp_entry(si->type, offset); @@ -2044,6 +1933,27 @@ swp_entry_t get_swap_page_of_type(int type) return entry; } +/* Free a slot allocated by swap_alloc_hibernation_slot */ +void swap_free_hibernation_slot(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + pgoff_t offset = swp_offset(entry); + + si = get_swap_device(entry); + if (WARN_ON(!si)) + return; + + ci = swap_cluster_lock(si, offset); + swap_put_entry_locked(si, ci, offset); + WARN_ON(swap_entry_swapped(si, offset)); + swap_cluster_unlock(ci); + + /* In theory readahead might add it to the swap cache by accident */ + __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + put_swap_device(si); +} + /* * Find the swap type that corresponds to given device (if any). * @@ -2205,7 +2115,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). 
*/ arch_swap_restore(folio_swap(entry, folio), folio); @@ -2246,7 +2156,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); - swap_free(entry); + folio_put_swap(folio, page); out: if (pte) pte_unmap_unlock(pte, ptl); @@ -2439,6 +2349,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; + unsigned long swp_tb; unsigned char count; /* @@ -2449,7 +2360,11 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); - if (count && swap_count(count) != SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), + i % SWAPFILE_CLUSTER); + if (count == SWAP_MAP_BAD) + continue; + if (count || swp_tb_is_folio(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -2709,44 +2624,18 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *si) -{ - struct block_device *bdev; - - if (si->bdev) - bdev = si->bdev; - else - bdev = si->swap_file->f_inode->i_sb->s_bdev; - - return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; -} - static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { - int i; - - if (prio >= 0) - si->prio = prio; - else - si->prio = --least_priority; + si->prio = prio; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; - for_each_node(i) { - if (si->prio >= 0) - si->avail_lists[i].prio = -si->prio; - else { - if (swap_node(si) == i) - si->avail_lists[i].prio = 1; - else - si->avail_lists[i].prio = -si->prio; - } - } + si->avail_list.prio = -si->prio; si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; @@ -2758,16 +2647,7 @@ static void _enable_swap_info(struct swap_info_struct *si) total_swap_pages += si->pages; assert_spin_locked(&swap_lock); - /* - * both lists are plists, and thus priority ordered. - * swap_active_head needs to be priority ordered for swapoff(), - * which on removal of any swap_info_struct with an auto-assigned - * (i.e. negative) priority increments the auto-assigned priority - * of any lower-priority swap_info_structs. - * swap_avail_head needs to be priority ordered for folio_alloc_swap(), - * which allocates swap pages from the highest available priority - * swap_info_struct. 
- */ + plist_add(&si->list, &swap_active_head); /* Add back to available list */ @@ -2917,20 +2797,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } spin_lock(&p->lock); del_from_avail_list(p, true); - if (p->prio < 0) { - struct swap_info_struct *si = p; - int nid; - - plist_for_each_entry_continue(si, &swap_active_head, list) { - si->prio++; - si->list.prio--; - for_each_node(nid) { - if (si->avail_lists[nid].prio != 1) - si->avail_lists[nid].prio--; - } - } - least_priority++; - } plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; @@ -3168,9 +3034,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; - int i; - p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); + p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -3209,8 +3074,7 @@ static struct swap_info_struct *alloc_swap_info(void) } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); - for_each_node(i) - plist_node_init(&p->avail_lists[i], 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { @@ -3399,7 +3263,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. */ - err = inc_cluster_info_page(si, cluster_info, 0); + err = swap_cluster_setup_bad_slot(cluster_info, 0); if (err) goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { @@ -3407,12 +3271,12 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (page_nr >= maxpages) continue; - err = inc_cluster_info_page(si, cluster_info, page_nr); + err = swap_cluster_setup_bad_slot(cluster_info, page_nr); if (err) goto err; } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { - err = inc_cluster_info_page(si, cluster_info, i); + err = swap_cluster_setup_bad_slot(cluster_info, i); if (err) goto err; } @@ -3471,9 +3335,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!swap_avail_heads) - return -ENOMEM; - si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -3641,7 +3502,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } mutex_lock(&swapon_mutex); - prio = -1; + prio = DEF_SWAP_PRIO; if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); @@ -3721,63 +3582,32 @@ void si_swapinfo(struct sysinfo *val) * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) +static int swap_dup_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, + unsigned char usage, int nr) { - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset; + int i; unsigned char count; - unsigned char has_cache; - int err, i; - - si = swap_entry_to_info(entry); - if (WARN_ON_ONCE(!si)) { - pr_err("%s%08lx\n", Bad_file, entry.val); - return -EINVAL; - } - - offset = swp_offset(entry); - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - VM_WARN_ON(usage == 1 && nr > 1); - ci = swap_cluster_lock(si, offset); - err = 0; for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; - /* - * swapin_readahead() doesn't check if a swap entry is valid, so the - * swap entry could be SWAP_MAP_BAD. Check here with lock held. + * Allocator never allocates bad slots, and readahead is guarded + * by swap_entry_swapped. */ - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { - err = -ENOENT; - goto unlock_out; - } - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (!count && !has_cache) { - err = -ENOENT; - } else if (usage == SWAP_HAS_CACHE) { - if (has_cache) - err = -EEXIST; - } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { - err = -EINVAL; - } - - if (err) - goto unlock_out; + VM_WARN_ON(count == SWAP_MAP_BAD); + /* + * Swap count duplication is guranteed by either locked swap cache + * folio (folio_dup_swap) or external lock (swap_dup_entry_direct). + */ + VM_WARN_ON(!count && + !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))); } for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (usage == SWAP_HAS_CACHE) - has_cache = SWAP_HAS_CACHE; - else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; @@ -3786,65 +3616,54 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) * Don't need to rollback changes, because if * usage == 1, there must be nr == 1. */ - err = -ENOMEM; - goto unlock_out; + return -ENOMEM; } - WRITE_ONCE(si->swap_map[offset + i], count | has_cache); + WRITE_ONCE(si->swap_map[offset + i], count); } -unlock_out: - swap_cluster_unlock(ci); - return err; + return 0; } -/* - * Help swapoff by noting that swap entry belongs to shmem/tmpfs - * (in which case its reference count is never incremented). - */ -void swap_shmem_alloc(swp_entry_t entry, int nr) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { - __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); + int err; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = swap_entry_to_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; + } + + VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + ci = swap_cluster_lock(si, offset); + err = swap_dup_entries(si, ci, offset, usage, nr); + swap_cluster_unlock(ci); + return err; } /* - * Increase reference count of swap entry by 1. + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. + * @entry: first swap entry from which we want to increase the refcount. 
+ * * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being increased. */ -int swap_duplicate(swp_entry_t entry) +int swap_dup_entry_direct(swp_entry_t entry) { int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } -EXPORT_SYMBOL(swap_duplicate); - -/* - * @entry: first swap entry from which we allocate nr swap cache. - * - * Called when allocating swap cache for existing swap entries, - * This can return error codes. Returns 0 at success. - * -EEXIST means there is a swap cache. - * Note: return code is different from swap_duplicate(). - */ -int swapcache_prepare(swp_entry_t entry, int nr) -{ - return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); -} - -/* - * Caller should ensure entries belong to the same folio so - * the entries won't span cross cluster boundary. - */ -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) -{ - swap_entries_put_cache(si, entry, nr); -} /* * add_swap_count_continuation - called when a swap count is duplicated @@ -3891,7 +3710,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* @@ -3963,7 +3782,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of swap_entry_put_locked() + * Called while __swap_duplicate() or caller of swap_put_entry_locked() * holds cluster lock. 
*/ static bool swap_count_continued(struct swap_info_struct *si, @@ -4077,7 +3896,6 @@ static bool __has_usable_swap(void) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; - int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; @@ -4096,8 +3914,8 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], - avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, + avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; @@ -4109,18 +3927,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) static int __init swapfile_init(void) { - int nid; - - swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), - GFP_KERNEL); - if (!swap_avail_heads) { - pr_emerg("Not enough memory for swap heads, swap is disabled\n"); - return -ENOMEM; - } - - for_each_node(nid) - plist_head_init(&swap_avail_heads[nid]); - swapfile_maximum_size = arch_max_swapfile_size(); /* diff --git a/mm/vmscan.c b/mm/vmscan.c index b2240d621727ecf8d6de6d3b9bc9b98c520ef57b..6a55652f6ed3f25dd1499eef4363ff42ae05ce5e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -868,7 +868,6 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, __swap_cache_del_folio(ci, folio, swap, shadow); mem_cgroup_swapout(folio, swap); swap_cluster_unlock_irq(ci); - put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -1350,7 +1349,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { + if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) @@ -1366,7 +1365,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) + if (folio_alloc_swap(folio)) goto activate_locked_split; } /* diff --git a/mm/zswap.c b/mm/zswap.c index 11cc0c684f04e66f6a2920051341eb9724511e35..62b57980bd2d5fddc7a3d14e3d3bf2578477ce6a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1094,8 +1094,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -EEXIST; mpol = get_task_policy(current); - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated); put_swap_device(si); if (!folio) return -ENOMEM;
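
For readers following the refcount rework, the fragment below sketches how a call site is expected to combine the new helpers. It is an illustrative sketch only, not part of the patch: the function names, the addr/pte plumbing, and the omitted rmap/TLB/soft-dirty bookkeeping are assumptions, while folio_dup_swap() and swap_put_entries_direct() are used as documented in the kernel-doc above (slots start with swap count == 0 when folio_alloc_swap() adds the folio to the swap cache, each page table reference is taken via folio_dup_swap(), and page-table-only owners drop references with swap_put_entries_direct() under the PTE lock).

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include "swap.h"	/* assumed mm-internal header declaring the new helpers */

/*
 * Sketch of the unmap-to-swap step, mirroring what reclaim does after
 * folio_alloc_swap() succeeded: @folio is locked and in the swap cache.
 */
static void example_unmap_one_pte(struct vm_area_struct *vma, unsigned long addr,
				  pte_t *pte, struct folio *folio, struct page *page)
{
	swp_entry_t entry = page_swap_entry(page);

	if (folio_dup_swap(folio, page))
		return;	/* -ENOMEM; real code would keep the page mapped */

	/* The new swap count is now owned by this PTE. */
	set_pte_at(vma->vm_mm, addr, pte, swp_entry_to_pte(entry));
}

/*
 * Sketch of dropping that reference later without touching the folio,
 * e.g. when a mapping is torn down. Serialized by the PTE lock, as
 * swap_put_entries_direct()'s kernel-doc requires; no folio lock needed.
 */
static void example_zap_swap_pte(struct mm_struct *mm, unsigned long addr,
				 pte_t *pte, swp_entry_t entry)
{
	pte_clear(mm, addr, pte);
	swap_put_entries_direct(entry, 1);	/* may also reclaim the swap cache */
}

The swap-in side is symmetrical: once a locked swap cache folio has been installed back into the page table (as unuse_pte() does in the patch), the PTE's reference is dropped with folio_put_swap(folio, page), leaving the slot pinned only by the swap cache until folio_free_swap() or reclaim releases it.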