diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ac7eb9cbf24a67ac02925ec8a659219db1b97cf6..90aae84c850aa257bb940e72528e712ab396c90f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4669,6 +4669,14 @@
 	pm_debug_messages	[SUSPEND,KNL]
 			Enable suspend/resume debug messages during boot up.
 
+	pmd_mapping=	[ARM64,KNL]
+			Format: nn%
+			Allocate contiguous memory from a dedicated pfn range
+			whose linear mapping granule is never larger than PMD.
+			nn% is the percentage of each node's memory to reserve.
+			pmd_mapping=100% is intended for hugetlb scenarios: the
+			whole linear mapping is then no larger than PMD.
+
 	pnp.debug=1	[PNP]
 			Enable PNP debug messages (depends on the
 			CONFIG_PNP_DEBUG_MESSAGES option).  Change at run-time
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index dec41d9fa0548721ae915729a929018f38008adc..95974b69e20207840b0f4dad0ec927301b512330 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2753,3 +2753,4 @@
 source "drivers/acpi/Kconfig"
 
 source "arch/arm64/kvm/Kconfig"
+source "arch/arm64/mm/Kconfig"
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 63625b54e541755a849a552fb6cb7473ce94574e..51549b6fc1a39ee7280361838fc6eaeebbce1433 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -796,6 +796,7 @@ CONFIG_KVM=y
 CONFIG_KVM_ARM_MULTI_LPI_TRANSLATE_CACHE=y
 CONFIG_ARCH_VCPU_STAT=y
 CONFIG_VIRT_VTIMER_IRQ_BYPASS=y
+CONFIG_PFN_RANGE_ALLOC=y
 CONFIG_CPU_MITIGATIONS=y
 
 #
diff --git a/arch/arm64/mm/Kconfig b/arch/arm64/mm/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..e7880c691822bbc4eca09fb493de550787ae6cb9
--- /dev/null
+++ b/arch/arm64/mm/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# MEMORY configuration for arm64
+#
+
+config PFN_RANGE_ALLOC
+	bool "Enable contiguous pfn range allocator"
+	default n
+	select CONTIG_ALLOC
+	depends on MEMORY_HOTPLUG
+	help
+	  Allow contiguous memory to be allocated from a dedicated pfn
+	  range whose linear mapping granule is never larger than PMD.
+
+	  If unsure, say N.
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 2fc8c6dd04070b61078e2e8627fd3acc606adc7e..c02aeb729717d8888d6aad41e52ebb200fc18134 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -15,3 +15,4 @@ KASAN_SANITIZE_physaddr.o	+= n
 
 obj-$(CONFIG_KASAN)		+= kasan_init.o
 KASAN_SANITIZE_kasan_init.o	:= n
+obj-$(CONFIG_PFN_RANGE_ALLOC)	+= pfn_range_alloc.o
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 367f761de1ab536e6557cff1fd57765daa19ce14..da75dd9d964b9965a1fdff7c21e50b10e8d9fb38 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -632,6 +632,11 @@ void __init mem_init(void)
 	pswiotlb_init(1, PSWIOTLB_VERBOSE);
 #endif
 
+	/* Must be called before the buddy allocator is initialized,
+	 * so that the range can still be reserved through memblock.
+ */ + pmd_mapping_reserve_and_remap(); + /* this will put all unused low memory onto the freelists */ memblock_free_all(); diff --git a/arch/arm64/mm/internal.h b/arch/arm64/mm/internal.h index e75ad9dd656d5bfd6f8b80251f5d13a2a8f88ee4..6be8688aad4ecc7cc0a98844fbb4c8427bcd517b 100644 --- a/arch/arm64/mm/internal.h +++ b/arch/arm64/mm/internal.h @@ -9,4 +9,27 @@ extern struct memblock_region mbk_memmap_regions[MAX_RES_REGIONS]; extern int mbk_memmap_cnt; +#ifdef CONFIG_PFN_RANGE_ALLOC +#define PFN_RANGE_ALLOC_SIZE PMD_SIZE +#define PFN_RANGE_ALLOC_ORDER PMD_ORDER + +static inline bool should_pmd_linear_mapping(void) +{ + return contig_mem_pool_percent == 100; +} + +void __init pmd_mapping_reserve_and_remap(void); +void __init pmd_mapping_reserved_remap(phys_addr_t start, phys_addr_t end); +#else +static inline void pmd_mapping_reserve_and_remap(void) +{ +} +static inline void pmd_mapping_reserved_remap(phys_addr_t start, phys_addr_t end) +{ +} +static inline bool should_pmd_linear_mapping(void) +{ + return false; +} +#endif #endif /* ifndef _ARM64_MM_INTERNAL_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 48e42e4383d3c3fd9008edd0e7e66046a2e12f16..181b5ceb628c07f122d9215135e0f8bfb335ae85 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -25,6 +25,7 @@ #include #include #include +#include "internal.h" #include #include @@ -45,6 +46,11 @@ #define NO_BLOCK_MAPPINGS BIT(0) #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ +#ifdef CONFIG_PFN_RANGE_ALLOC +#define NO_PUD_BLOCK_MAPPINGS BIT(3) +#else +#define NO_PUD_BLOCK_MAPPINGS 0 +#endif int idmap_t0sz __ro_after_init; @@ -347,7 +353,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, */ if (pud_sect_supported() && ((addr | next | phys) & ~PUD_MASK) == 0 && - (flags & NO_BLOCK_MAPPINGS) == 0) { + (flags & (NO_BLOCK_MAPPINGS | NO_PUD_BLOCK_MAPPINGS)) == 0) { pud_set_huge(pudp, phys, prot); /* @@ -594,6 +600,8 @@ static void __init map_mem(pgd_t *pgdp) if (can_set_direct_map() || is_virtcca_cvm_world()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; + else if (should_pmd_linear_mapping()) + flags |= NO_PUD_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* * Take care not to create a writable alias for the @@ -1141,6 +1149,20 @@ static void free_empty_tables(unsigned long addr, unsigned long end, } #endif +#ifdef CONFIG_PFN_RANGE_ALLOC +void __init pmd_mapping_reserved_remap(phys_addr_t start, phys_addr_t end) +{ + unsigned long vstart, vend; + + vstart = __phys_to_virt(start); + vend = __phys_to_virt(end); + unmap_hotplug_range(vstart, vend, false, NULL); + __create_pgd_mapping(swapper_pg_dir, start, vstart, end - start, + pgprot_tagged(PAGE_KERNEL), early_pgtable_alloc, + NO_PUD_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); +} +#endif + void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, unsigned long addr, unsigned long next) { @@ -1385,6 +1407,8 @@ int arch_add_memory(int nid, u64 start, u64 size, if (can_set_direct_map() || is_virtcca_cvm_world()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; + else if (should_pmd_linear_mapping()) + flags |= NO_PUD_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), size, params->pgprot, __pgd_pgtable_alloc, diff --git a/arch/arm64/mm/pfn_range_alloc.c b/arch/arm64/mm/pfn_range_alloc.c new file mode 100644 index 0000000000000000000000000000000000000000..011b65bcbbaa8a93e05e3b6f931a568b33cdca32 --- /dev/null +++ b/arch/arm64/mm/pfn_range_alloc.c @@ -0,0 +1,624 
@@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Contiguous pfn range allocator + * + * Copyright (C) 2025 Huawei Limited. + */ + +#define pr_fmt(fmt) "pfn_range_alloc: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "../../../mm/internal.h" + +struct pmd_lm_range { + unsigned long start_pfn; + unsigned long end_pfn; + spinlock_t lock; + unsigned long *bitmap; + unsigned long bitmap_maxno; +}; + +unsigned long contig_mem_pool_percent __ro_after_init; +EXPORT_SYMBOL_GPL(contig_mem_pool_percent); +static unsigned long nr_reserved_pages[MAX_NUMNODES] __initdata; +static struct pmd_lm_range reserved_range[MAX_NUMNODES]; +DEFINE_STATIC_KEY_FALSE(pmd_mapping_initialized); +static atomic_long_t num_poisoned_pfn __read_mostly = ATOMIC_LONG_INIT(0); + +static inline bool pmd_linear_mapping_enabled(void) +{ + return static_branch_unlikely(&pmd_mapping_initialized); +} + +static __init int cmdline_parse_pmd_mapping(char *p) +{ + unsigned long percent; + char *endptr; + + if (!p) + return -EINVAL; + + percent = simple_strtoul(p, &endptr, 0); + if (*endptr != '%' || *(endptr + 1) != '\0') + return -EINVAL; + + if (percent > 100) + return -EINVAL; + + contig_mem_pool_percent = percent; + + return 0; +} +early_param("pmd_mapping", cmdline_parse_pmd_mapping); + +static __init void calculate_node_nr_reserved_pages(void) +{ + unsigned long start_pfn, end_pfn; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + nr_reserved_pages[nid] += end_pfn - start_pfn; + + for_each_online_node(nid) { + nr_reserved_pages[nid] = nr_reserved_pages[nid] * contig_mem_pool_percent / 100; + nr_reserved_pages[nid] = ALIGN_DOWN(nr_reserved_pages[nid], + PUD_SIZE / PAGE_SIZE); + } +} + +static __init unsigned long calculate_reserve_base(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + unsigned long base = 0; + +#ifdef CONFIG_ZONE_DMA + zone = &pgdat->node_zones[ZONE_DMA]; + if (managed_zone(zone)) + base = max(base, PFN_PHYS(zone_end_pfn(zone))); +#endif + +#ifdef CONFIG_ZONE_DMA32 + zone = &pgdat->node_zones[ZONE_DMA32]; + if (managed_zone(zone)) + base = max(base, PFN_PHYS(zone_end_pfn(zone))); +#endif + + return base; +} + +static __init int __get_suitable_reserved_range(int nid) +{ + unsigned long base, size, start; + + base = calculate_reserve_base(nid); +retry: + size = nr_reserved_pages[nid] * PAGE_SIZE; + start = memblock_alloc_range_nid(size, PUD_SIZE, base, 0, nid, true); + /* + * If reservation fails, try to fallback to reserve + * smaller size. Fallback is at PUD_SIZE granularity. 
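+	 * Each retry shrinks the requested reservation by one PUD worth
+	 * of pages, until the reservation succeeds or nothing is left.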
+ */ + if (!start) { + nr_reserved_pages[nid] -= PUD_SIZE / PAGE_SIZE; + if (!nr_reserved_pages[nid]) + return -ENOMEM; + goto retry; + } + + reserved_range[nid].start_pfn = PHYS_PFN(start); + reserved_range[nid].end_pfn = PHYS_PFN(start) + nr_reserved_pages[nid]; + + return 0; +} + +static __init int get_suitable_reserved_range(void) +{ + bool restore_bottom_up = false; + unsigned long start, end; + bool resved = false; + int nid, ret; + + if (memblock_bottom_up()) { + memblock_set_bottom_up(false); + restore_bottom_up = true; + } + + calculate_node_nr_reserved_pages(); + for_each_online_node(nid) { + if (!nr_reserved_pages[nid]) + continue; + + ret = __get_suitable_reserved_range(nid); + if (ret) { + pr_warn("reservation failed for node %d\n", nid); + continue; + } + + start = PFN_PHYS(reserved_range[nid].start_pfn); + end = PFN_PHYS(reserved_range[nid].end_pfn); + pmd_mapping_reserved_remap(start, end); + resved = true; + pr_info("reserved %lu MiB on node %d\n", (end - start) / SZ_1M, nid); + } + + if (restore_bottom_up) + memblock_set_bottom_up(true); + + return resved; +} + +static __init void put_suitable_reserved_range(void) +{ + unsigned long start, end; + int ret, nid; + + for_each_online_node(nid) { + start = PFN_PHYS(reserved_range[nid].start_pfn); + end = PFN_PHYS(reserved_range[nid].end_pfn); + + if (start == end) + continue; + + ret = memblock_phys_free(start, end - start); + if (ret) + pr_warn("put reserved memory [%lx, %lx) failed(%d) for node %d\n", + start, end, ret, nid); + } +} + +void __init pmd_mapping_reserve_and_remap(void) +{ + bool resved; + + if (!contig_mem_pool_percent) + return; + + if (should_pmd_linear_mapping()) + goto out; + + if (can_set_direct_map()) { + pr_info("linear mapping is mapped at PTE level, all memory can be borrowed\n"); + goto out; + } + + resved = get_suitable_reserved_range(); + if (!resved) + return; + + put_suitable_reserved_range(); +out: + static_branch_enable(&pmd_mapping_initialized); +} + +static int __init activate_reserved_range(void) +{ + int nid; + unsigned long pfn, end_pfn; + unsigned long bitmap_maxno; + + if (!pmd_linear_mapping_enabled()) + return 0; + + for_each_online_node(nid) { + pfn = reserved_range[nid].start_pfn; + end_pfn = reserved_range[nid].end_pfn; + + if (pfn == end_pfn) + continue; + + bitmap_maxno = (end_pfn - pfn) / (PFN_RANGE_ALLOC_SIZE / PAGE_SIZE); + reserved_range[nid].bitmap_maxno = bitmap_maxno; + reserved_range[nid].bitmap = bitmap_zalloc(bitmap_maxno, GFP_KERNEL); + if (!reserved_range[nid].bitmap) { + reserved_range[nid].start_pfn = 0; + reserved_range[nid].end_pfn = 0; + pr_warn("reserved_range %d fails to be initialized\n", nid); + continue; + } + spin_lock_init(&reserved_range[nid].lock); + } + + return 0; +} +core_initcall(activate_reserved_range); + +struct folio *pfn_range_alloc(unsigned int nr_pages, int nid) +{ + unsigned long min_align = PFN_RANGE_ALLOC_NR_PAGES; + gfp_t gfp_mask = (GFP_KERNEL | __GFP_COMP) & ~__GFP_RECLAIM; + unsigned long start, bitmap_no, bitmap_count, mask, offset; + struct pmd_lm_range *mem_range; + struct folio *folio = ERR_PTR(-EINVAL); + unsigned long pfn; + int ret; + + if (in_interrupt()) + goto out; + + if (nid < 0 || nid >= MAX_NUMNODES) + goto out; + + if (!IS_ALIGNED(nr_pages, min_align)) + goto out; + + if (can_set_direct_map() || should_pmd_linear_mapping()) { + int order = ilog2(nr_pages); + + folio = NULL; + gfp_mask |= __GFP_THISNODE; + if (nr_pages <= MAX_ORDER_NR_PAGES) + folio = __folio_alloc_node(gfp_mask | __GFP_NOWARN, order, nid); + if (!folio) + 
folio = folio_alloc_gigantic(order, gfp_mask, nid, NULL); + if (!folio) + folio = ERR_PTR(-ENOMEM); + + goto out; + } + + mem_range = &reserved_range[nid]; + if (!mem_range->bitmap) { + folio = ERR_PTR(-ENOMEM); + goto out; + } + + start = 0; + bitmap_count = nr_pages / min_align; + mask = bitmap_count - 1; + offset = (mem_range->start_pfn & (nr_pages - 1)) / min_align; + for (;;) { + spin_lock(&mem_range->lock); + bitmap_no = bitmap_find_next_zero_area_off(mem_range->bitmap, + mem_range->bitmap_maxno, start, bitmap_count, mask, offset); + if (bitmap_no >= mem_range->bitmap_maxno) { + spin_unlock(&mem_range->lock); + break; + } + bitmap_set(mem_range->bitmap, bitmap_no, bitmap_count); + spin_unlock(&mem_range->lock); + pfn = mem_range->start_pfn + bitmap_no * min_align; + ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, gfp_mask); + if (!ret) { + folio = pfn_folio(pfn); + goto out; + } + + spin_lock(&mem_range->lock); + bitmap_clear(mem_range->bitmap, bitmap_no, bitmap_count); + spin_unlock(&mem_range->lock); + start = bitmap_no + bitmap_count; + } + + folio = ERR_PTR(-ENOMEM); +out: + trace_pfn_range_alloc(folio, nr_pages, nid); + return folio; +} +EXPORT_SYMBOL_GPL(pfn_range_alloc); + +static void pfn_range_folio_dissolve(struct folio *folio) +{ + int nr_pages = folio_nr_pages(folio); + struct page *page; + int i; + + VM_WARN_ON_FOLIO(folio_ref_count(folio) != 1, folio); + + for (i = 1; i < nr_pages; i++) { + page = folio_page(folio, i); + page->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; + page->mapping = NULL; + clear_compound_head(page); + set_page_refcounted(page); + } + + __folio_clear_head(folio); + + page = &folio->page; + for (i = 0; i < nr_pages; i++, page++) { + if (PageHWPoison(page)) { + atomic_long_inc(&num_poisoned_pfn); + continue; + } + + __free_page(page); + } +} + +static inline bool pfn_range_free_prepare(struct folio *folio) +{ + int nr_pages = folio_nr_pages(folio); + struct page *page = &folio->page; + int i; + + for (i = 0; i < nr_pages; i++, page++) + if (PageHWPoison(page)) { + pfn_range_folio_dissolve(folio); + return false; + } + + return true; +} + +int pfn_range_free(struct folio *folio) +{ + struct pmd_lm_range *mem_range; + unsigned long start_pfn, end_pfn; + unsigned long bitmap_no, bitmap_count; + unsigned long nr_pages = folio_nr_pages(folio); + unsigned long min_align = PFN_RANGE_ALLOC_NR_PAGES; + int ret = 0; + + if (in_interrupt()) { + ret = -EINVAL; + goto out; + } + + if (!pfn_range_free_prepare(folio)) + goto out; + + if (can_set_direct_map() || should_pmd_linear_mapping()) { + folio_put(folio); + goto out; + } + + mem_range = &reserved_range[folio_nid(folio)]; + start_pfn = folio_pfn(folio); + end_pfn = start_pfn + nr_pages; + + if (start_pfn < mem_range->start_pfn || end_pfn > mem_range->end_pfn) { + ret = -EINVAL; + goto out; + } + + free_contig_range(start_pfn, nr_pages); + bitmap_no = (start_pfn - mem_range->start_pfn) / min_align; + bitmap_count = nr_pages / min_align; + spin_lock(&mem_range->lock); + bitmap_clear(mem_range->bitmap, bitmap_no, bitmap_count); + spin_unlock(&mem_range->lock); + +out: + trace_pfn_range_free(folio, ret); + return ret; +} +EXPORT_SYMBOL_GPL(pfn_range_free); + +static inline int check_update_lm_arg(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long start, end; + struct page *start_page; + int nid; + + start_page = pfn_to_page(start_pfn); + nid = page_to_nid(start_page); + start = (unsigned long)page_to_virt(start_page); + end = start + (end_pfn - start_pfn) * PAGE_SIZE; + if ((start_pfn >= 
reserved_range[nid].start_pfn && + end_pfn <= reserved_range[nid].end_pfn) + || should_pmd_linear_mapping() + || can_set_direct_map()) { + if (!IS_ALIGNED(start, PFN_RANGE_ALLOC_SIZE) || + !IS_ALIGNED(end, PFN_RANGE_ALLOC_SIZE)) { + return -EINVAL; + } + } else if (!IS_ALIGNED(start, PUD_SIZE) || !IS_ALIGNED(end, PUD_SIZE)) { + return -EINVAL; + } + + return 0; +} + +static int invalid_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + bool set_invalid = (bool)walk->private; + pud_t pud; + + pud = pudp_get(pudp); + if (pud_table(pud)) + return 0; + + if (set_invalid) + pud_val(pud) &= ~PTE_VALID; + else + pud_val(pud) |= PTE_VALID; + set_pud(pudp, pud); + + return 0; +} + +static int invalid_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + bool set_invalid = (bool)walk->private; + pmd_t pmd; + + pmd = pmdp_get(pmdp); + if (pmd_table(pmd)) + return 0; + + if (set_invalid) + pmd_val(pmd) &= ~PTE_VALID; + else + pmd_val(pmd) |= PTE_VALID; + set_pmd(pmdp, pmd); + + return 0; +} + +static int invalid_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + bool set_invalid = (bool)walk->private; + pte_t pte; + + pte = ptep_get(ptep); + + if (set_invalid) + pte_val(pte) &= ~PTE_VALID; + else + pte_val(pte) |= PTE_VALID; + set_pte(ptep, pte); + + return 0; +} + +static const struct mm_walk_ops invalid_ops = { + .pud_entry = invalid_pud_entry, + .pmd_entry = invalid_pmd_entry, + .pte_entry = invalid_pte_entry, +}; + +int set_linear_mapping_invalid(unsigned long start_pfn, unsigned long end_pfn, + bool set_invalid) +{ + unsigned long start, end; + int ret; + + ret = check_update_lm_arg(start_pfn, end_pfn); + if (ret) + return ret; + + start = (unsigned long)page_to_virt(pfn_to_page(start_pfn)); + end = start + (end_pfn - start_pfn) * PAGE_SIZE; + mmap_write_lock(&init_mm); + walk_page_range_novma(&init_mm, start, end, + &invalid_ops, NULL, (void *)set_invalid); + mmap_write_unlock(&init_mm); + if (set_invalid) + flush_tlb_kernel_range(start, end); + + return 0; +} +EXPORT_SYMBOL_GPL(set_linear_mapping_invalid); + +static inline void update_entry_nc(unsigned long long *val, bool set_nc) +{ + *val &= ~PTE_ATTRINDX_MASK; + if (set_nc) + *val |= PTE_ATTRINDX(MT_NORMAL_NC); + else + *val |= PTE_ATTRINDX(MT_NORMAL_TAGGED); + *val |= PTE_VALID; +} + +static int nc_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + bool set_nc = (bool)walk->private; + pud_t pud; + + if (pud_table(*pudp)) + return 0; + + pud = pudp_huge_get_and_clear(walk->mm, addr, pudp); + update_entry_nc(&pud_val(pud), set_nc); + set_pud(pudp, pud); + + return 0; +} + +static int nc_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + bool set_nc = (bool)walk->private; + pmd_t pmd; + + if (pmd_table(*pmdp)) + return 0; + + pmd = pmdp_huge_get_and_clear(walk->mm, addr, pmdp); + update_entry_nc(&pmd_val(pmd), set_nc); + set_pmd(pmdp, pmd); + + return 0; +} + +static int nc_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + bool set_nc = (bool)walk->private; + pte_t pte; + + pte = ptep_get_and_clear(walk->mm, addr, ptep); + update_entry_nc(&pte_val(pte), set_nc); + set_pte(ptep, pte); + + return 0; +} + +static const struct mm_walk_ops nc_ops = { + .pud_entry = nc_pud_entry, + .pmd_entry = nc_pmd_entry, + .pte_entry = nc_pte_entry, +}; + +int set_linear_mapping_nc(unsigned long start_pfn, unsigned long 
end_pfn, bool set_nc) +{ + unsigned long start, end; + int ret; + + start = (unsigned long)page_to_virt(pfn_to_page(start_pfn)); + end = start + (end_pfn - start_pfn) * PAGE_SIZE; + ret = check_update_lm_arg(start_pfn, end_pfn); + if (ret) + return ret; + + mmap_write_lock(&init_mm); + walk_page_range_novma(&init_mm, start, end, + &invalid_ops, NULL, (void *)true); + flush_tlb_kernel_range(start, end); + walk_page_range_novma(&init_mm, start, end, + &nc_ops, NULL, (void *)set_nc); + mmap_write_unlock(&init_mm); + + return 0; +} +EXPORT_SYMBOL_GPL(set_linear_mapping_nc); + +#ifdef CONFIG_DEBUG_FS +static int reserved_range_show(struct seq_file *m, void *v) +{ + int nid; + + for_each_online_node(nid) { + if (numa_is_remote_node(nid)) + continue; + + seq_printf(m, "%d, %llx-%llx\n", nid, + PFN_PHYS(reserved_range[nid].start_pfn), + PFN_PHYS(reserved_range[nid].end_pfn)); + } + + seq_printf(m, "\nHardwareCorrupted: %lu kB\n", + atomic_long_read(&num_poisoned_pfn) << (PAGE_SHIFT - 10)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(reserved_range); + +static int __init reserved_range_debug_init(void) +{ + if (!pmd_linear_mapping_enabled()) + return 0; + + if (can_set_direct_map() || should_pmd_linear_mapping()) + return 0; + + debugfs_create_file("pmd_mapping_reserved_range", 0400, NULL, + NULL, &reserved_range_fops); + return 0; +} +late_initcall(reserved_range_debug_init); +#endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 9f5c7c793c5a10762c559597420d57dd5f71515a..fbad47b3f5e87f18895de12508ad00766ea00cdf 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -399,4 +399,25 @@ extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); +#ifdef CONFIG_CONTIG_ALLOC +/* This should be paired with folio_put() rather than free_contig_range(). */ +static inline struct folio *folio_alloc_gigantic(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + struct page *page; + + if (WARN_ON(!order || !(gfp & __GFP_COMP))) + return NULL; + + page = alloc_contig_pages(1 << order, gfp, nid, node); + + return page ? 
page_folio(page) : NULL; +} +#else +static inline struct folio *folio_alloc_gigantic(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + return NULL; +} +#endif #endif /* __LINUX_GFP_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9213e4f939ff92e72bd03180dd9bab0873b207bf..93da3db9d7ce9e4f60f17ad12a04f46d70f85571 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -305,14 +305,7 @@ static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } #endif /* CONFIG_HUGETLB_INSERT_PAGE */ -#ifdef CONFIG_ASCEND_FEATURES struct folio *alloc_hugetlb_folio_size(int nid, unsigned long size); -#else -static inline struct folio *alloc_hugetlb_folio_size(int nid, unsigned long size) -{ - return NULL; -} -#endif #else /* !CONFIG_HUGETLB_PAGE */ @@ -1376,4 +1369,23 @@ hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) return huge_pte_offset(vma->vm_mm, addr, sz); } +#ifdef CONFIG_PFN_RANGE_ALLOC +struct folio *hugetlb_pool_alloc(int nid); +int hugetlb_pool_free(struct folio *folio); +struct folio *hugetlb_pool_alloc_size(int nid, unsigned long size); +#else +static inline struct folio *hugetlb_pool_alloc(int nid) +{ + return ERR_PTR(-EINVAL); +} +static inline int hugetlb_pool_free(struct folio *folio) +{ + return -EINVAL; +} +static inline struct folio *hugetlb_pool_alloc_size(int nid, unsigned long size) +{ + return ERR_PTR(-EINVAL); +} +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 1dfc9ecc195e51492db4d52d1d771283e2369238..6afe7df09ca6deb2600ae99a7ac05b93f1efbed2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3820,24 +3820,22 @@ static inline bool page_is_guard(struct page *page) return PageGuard(page); } -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return false; - return __set_page_guard(zone, page, order, migratetype); + return __set_page_guard(zone, page, order); } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return; - __clear_page_guard(zone, page, order, migratetype); + __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ @@ -3847,9 +3845,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) { return false; } + unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} + unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA @@ -4214,4 +4212,36 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma); /* added to mm.h to avoid every caller adding new header file */ #include +#ifdef CONFIG_PFN_RANGE_ALLOC +#define 
PFN_RANGE_ALLOC_SIZE PMD_SIZE +#define PFN_RANGE_ALLOC_ORDER PMD_ORDER +#define PFN_RANGE_ALLOC_NR_PAGES (1 << PFN_RANGE_ALLOC_ORDER) + +extern unsigned long contig_mem_pool_percent; +struct folio *pfn_range_alloc(unsigned int nr_pages, int nid); +int pfn_range_free(struct folio *folio); +int set_linear_mapping_nc(unsigned long start_pfn, unsigned long end_pfn, bool set_nc); +int set_linear_mapping_invalid(unsigned long start_pfn, unsigned long end_pfn, + bool set_invalid); +#else +static inline struct folio *pfn_range_alloc(unsigned int nr_pages, int nid) +{ + return ERR_PTR(-EINVAL); +} +static inline int pfn_range_free(struct folio *folio) +{ + return -EINVAL; +} +static inline +int set_linear_mapping_nc(unsigned long start_pfn, unsigned long end_pfn, bool set_nc) +{ + return -EINVAL; +} +static inline +int set_linear_mapping_invalid(unsigned long start_pfn, unsigned long end_pfn, + bool set_invalid) +{ + return -EINVAL; +} +#endif #endif /* _LINUX_MM_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b1cded2400498751ff73233d7c3022e116b7b751..afc35b8cb44e2495a21ba80d18bd38718ef25830 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1032,7 +1032,7 @@ struct zone { CACHELINE_PADDING(_pad3_); - KABI_RESERVE(1) + KABI_USE(1, unsigned long nr_free_highatomic) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 4ac34392823a9c74321e18096db07d8bba682aea..73dc2c1841ec13c51b1526279d57134294ca982b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -3,10 +3,6 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return zone->nr_isolate_pageblock; -} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype) return migratetype == MIGRATE_ISOLATE; } #else -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return false; -} static inline bool is_migrate_isolate_page(struct page *page) { return false; @@ -34,8 +26,9 @@ static inline bool is_migrate_isolate(int migratetype) #define REPORT_FAILURE 0x2 void set_pageblock_migratetype(struct page *page, int migratetype); -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable); + +bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype, int flags, gfp_t gfp_flags); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d783d5a27352d3227e49274a0f43157080a0077f..72ecd46fd0c494183967334a41fa8d4ac461ce8d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio, mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } -static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, - int migratetype) -{ - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); - if (is_migrate_cma(migratetype)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); -} - extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 
3e78e6bd6e18e9ca5d8562fda43a6a2a7089d2c6..28b9d6958724696193d6c9eb727b7aff89f398bd 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -544,6 +544,124 @@ TRACE_EVENT(mm_mem_sampling_damon_record, TP_printk("vaddr=%llx pid=%d", __entry->vaddr, __entry->pid) ); #endif /* CONFIG_DAMON_MEM_SAMPLING */ + +#ifdef CONFIG_PFN_RANGE_ALLOC +TRACE_EVENT(pfn_range_alloc, + + TP_PROTO(struct folio *folio, unsigned int nr_pages, + int nid), + + TP_ARGS(folio, nr_pages, nid), + + TP_STRUCT__entry( + __field(struct folio*, folio) + __field(unsigned int, nr_pages) + __field(int, nid) + ), + + TP_fast_assign( + __entry->folio = folio; + __entry->nr_pages = nr_pages; + __entry->nid = nid; + ), + + TP_printk("folio=%p pfn=0x%lx err=%ld nr_pages=%u nid=%d", + !IS_ERR(__entry->folio) ? __entry->folio : NULL, + !IS_ERR(__entry->folio) ? folio_pfn(__entry->folio) : 0, + !IS_ERR(__entry->folio) ? 0 : PTR_ERR(__entry->folio), + __entry->nr_pages, + __entry->nid) +); + +TRACE_EVENT(pfn_range_free, + + TP_PROTO(struct folio *folio, int ret), + + TP_ARGS(folio, ret), + + TP_STRUCT__entry( + __field(struct folio*, folio) + __field(int, ret) + ), + + TP_fast_assign( + __entry->folio = folio; + __entry->ret = ret; + ), + + TP_printk("folio=%p pfn=0x%lx ret=%d", + __entry->folio, folio_pfn(__entry->folio), __entry->ret) +); + +TRACE_EVENT(hugetlb_pool_alloc, + + TP_PROTO(struct folio *folio, int nid), + + TP_ARGS(folio, nid), + + TP_STRUCT__entry( + __field(struct folio*, folio) + __field(int, nid) + ), + + TP_fast_assign( + __entry->folio = folio; + __entry->nid = nid; + ), + + TP_printk("folio=%p pfn=0x%lx err=%ld nid=%d", + !IS_ERR(__entry->folio) ? __entry->folio : NULL, + !IS_ERR(__entry->folio) ? folio_pfn(__entry->folio) : 0, + !IS_ERR(__entry->folio) ? 0 : PTR_ERR(__entry->folio), + __entry->nid) +); + +TRACE_EVENT(hugetlb_pool_free, + + TP_PROTO(struct folio *folio, int ret), + + TP_ARGS(folio, ret), + + TP_STRUCT__entry( + __field(struct folio*, folio) + __field(int, ret) + ), + + TP_fast_assign( + __entry->folio = folio; + __entry->ret = ret; + ), + + TP_printk("folio=%p pfn=0x%lx ret=%d", + __entry->folio, folio_pfn(__entry->folio), __entry->ret) +); + +TRACE_EVENT(hugetlb_pool_alloc_size, + + TP_PROTO(struct folio *folio, int nid, unsigned long size), + + TP_ARGS(folio, nid, size), + + TP_STRUCT__entry( + __field(struct folio*, folio) + __field(int, nid) + __field(unsigned long, size) + ), + + TP_fast_assign( + __entry->folio = folio; + __entry->nid = nid; + __entry->size = size; + ), + + TP_printk("folio=%p pfn=0x%lx err=%ld nid=%d size=0x%lx", + !IS_ERR(__entry->folio) ? __entry->folio : NULL, + !IS_ERR(__entry->folio) ? folio_pfn(__entry->folio) : 0, + !IS_ERR(__entry->folio) ? 
0 : PTR_ERR(__entry->folio), + __entry->nid, __entry->size) +); + +#endif #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index ed0baf7404c8b8e54624a3ed70fe42532e5b0ed0..98be4f7c07dd6c731ac3bf16e06de6187ec866a2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -80,33 +80,6 @@ static inline bool is_via_compact_memory(int order) { return false; } #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) #endif -static void split_map_pages(struct list_head *freepages) -{ - unsigned int i, order; - struct page *page, *next; - LIST_HEAD(tmp_list); - - for (order = 0; order < NR_PAGE_ORDERS; order++) { - list_for_each_entry_safe(page, next, &freepages[order], lru) { - unsigned int nr_pages; - - list_del(&page->lru); - - nr_pages = 1 << order; - - post_alloc_hook(page, order, __GFP_MOVABLE); - if (order) - split_page(page, order); - - for (i = 0; i < nr_pages; i++) { - list_add(&page->lru, &tmp_list); - page++; - } - } - list_splice_init(&tmp_list, &freepages[0]); - } -} - static unsigned long release_free_list(struct list_head *freepages) { int order; @@ -737,11 +710,11 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * * Non-free pages, invalid PFNs, or zone boundaries within the * [start_pfn, end_pfn) range are considered errors, cause function to - * undo its actions and return zero. + * undo its actions and return zero. cc->freepages[] are empty. * * Otherwise, function returns one-past-the-last PFN of isolated page * (which may be greater then end_pfn if end fell in a middle of - * a free page). + * a free page). cc->freepages[] contain free pages isolated. */ unsigned long isolate_freepages_range(struct compact_control *cc, @@ -749,10 +722,9 @@ isolate_freepages_range(struct compact_control *cc, { unsigned long isolated, pfn, block_start_pfn, block_end_pfn; int order; - struct list_head tmp_freepages[NR_PAGE_ORDERS]; for (order = 0; order < NR_PAGE_ORDERS; order++) - INIT_LIST_HEAD(&tmp_freepages[order]); + INIT_LIST_HEAD(&cc->freepages[order]); pfn = start_pfn; block_start_pfn = pageblock_start_pfn(pfn); @@ -783,7 +755,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, tmp_freepages, 0, true); + block_end_pfn, cc->freepages, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -802,13 +774,10 @@ isolate_freepages_range(struct compact_control *cc, if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ - release_free_list(tmp_freepages); + release_free_list(cc->freepages); return 0; } - /* __isolate_free_page() does not map the pages */ - split_map_pages(tmp_freepages); - /* We don't use freelists for anything. */ return pfn; } @@ -2333,7 +2302,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool can_steal; + bool claim_block; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2350,7 +2319,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) + true, &claim_block) != -1) /* * Movable pages are OK in any pageblock. 
If we are * stealing for a non-movable allocation, make sure diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index f9d145730fd1693a0265de8a47178c6c320ff054..03a810927d0a73f08ac8fbd77b7fe7a6f5953b66 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) } early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (order >= debug_guardpage_minorder()) return false; @@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, __SetPageGuard(page); INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { __ClearPageGuard(page); - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9ba075d424f900043c12591c2ed274b56bbf2d36..1cd1196e0d66fba77811cf211c9e4f22c9d04b73 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -37,6 +37,9 @@ #include #include #include +#ifndef __GENKSYMS__ +#include +#endif #include #include @@ -7917,7 +7920,6 @@ int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, unsigned long addr, EXPORT_SYMBOL_GPL(hugetlb_insert_hugepage_pte_by_pa); #endif /* CONFIG_HUGETLB_INSERT_PAGE */ -#ifdef CONFIG_ASCEND_FEATURES struct folio *alloc_hugetlb_folio_size(int nid, unsigned long size) { gfp_t gfp_mask; @@ -7926,13 +7928,12 @@ struct folio *alloc_hugetlb_folio_size(int nid, unsigned long size) unsigned long flags; struct folio *folio = NULL; - nodes_clear(nodemask); - node_set(nid, nodemask); - h = size_to_hstate(size); if (!h) return NULL; + nodes_clear(nodemask); + node_set(nid, nodemask); gfp_mask = htlb_alloc_mask(h); spin_lock_irqsave(&hugetlb_lock, flags); if (h->free_huge_pages - h->resv_huge_pages > 0) @@ -7942,4 +7943,57 @@ struct folio *alloc_hugetlb_folio_size(int nid, unsigned long size) return folio; } EXPORT_SYMBOL(alloc_hugetlb_folio_size); + +#ifdef CONFIG_PFN_RANGE_ALLOC +struct folio *hugetlb_pool_alloc(int nid) +{ + struct folio *folio = ERR_PTR(-EINVAL); + + if (nid < 0 || nid >= MAX_NUMNODES) + goto out; + + folio = alloc_hugetlb_folio_size(nid, PFN_RANGE_ALLOC_SIZE); + if (!folio) + folio = ERR_PTR(-ENOMEM); + +out: + trace_hugetlb_pool_alloc(folio, nid); + return folio; +} +EXPORT_SYMBOL_GPL(hugetlb_pool_alloc); + +int hugetlb_pool_free(struct folio *folio) +{ + int ret = -EINVAL; + + if (!folio_test_hugetlb(folio)) + goto out; + + ret = 0; + folio_put(folio); +out: + trace_hugetlb_pool_free(folio, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hugetlb_pool_free); + +struct folio *hugetlb_pool_alloc_size(int nid, unsigned long size) +{ + struct folio *folio = ERR_PTR(-EINVAL); + + if (nid < 0 || nid >= MAX_NUMNODES) + goto out; + + if ((size != PMD_SIZE) && (size != PUD_SIZE)) + goto out; + + folio = alloc_hugetlb_folio_size(nid, size); + if (!folio) + folio = ERR_PTR(-ENOMEM); + +out: + trace_hugetlb_pool_alloc_size(folio, nid, size); + return folio; 
+} +EXPORT_SYMBOL_GPL(hugetlb_pool_alloc_size); #endif diff --git a/mm/internal.h b/mm/internal.h index 55b0698ad3b02300fbc1a89187765c20e5f60483..3a127c3e232513928dc547805f81d4a14d051c12 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -714,10 +714,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); - -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset); - #if defined CONFIG_COMPACTION || defined CONFIG_CMA #define MAX_PAGE_ORDER MAX_ORDER @@ -788,17 +784,13 @@ int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype); - /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal); + int migratetype, bool claim_only, bool *claim_block); static inline bool free_area_empty(struct free_area *area, int migratetype) { @@ -1196,11 +1188,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) return migratetype == MIGRATE_HIGHATOMIC; } -static inline bool is_migrate_highatomic_page(struct page *page) -{ - return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afbcbc8adeb299644b47e7cbf9dd0822034b3432..8f06d6f8124d553f1866659a7eb03da7a869aa68 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -208,24 +208,6 @@ EXPORT_SYMBOL(node_states); gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -/* - * A cached value of the page's pageblock's migratetype, used when the page is - * put on a pcplist. Used to avoid the pageblock migratetype lookup when - * freeing from pcplists in most cases, at the cost of possibly becoming stale. - * Also the migratetype set in the page does not necessarily match the pcplist - * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any - * other index - this ensures that it will be put on the correct CMA freelist. 
- */ -static inline int get_pcppage_migratetype(struct page *page) -{ - return page->index; -} - -static inline void set_pcppage_migratetype(struct page *page, int migratetype) -{ - page->index = migratetype; -} - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif @@ -655,23 +637,38 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ -/* Used for pages not on another list */ -static inline void add_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void account_freepages(struct zone *zone, int nr_pages, + int migratetype) { - struct free_area *area = &zone->free_area[order]; + lockdep_assert_held(&zone->lock); - list_add(&page->buddy_list, &area->free_list[migratetype]); - area->nr_free++; + if (is_migrate_isolate(migratetype)) + return; + + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); + else if (is_migrate_highatomic(migratetype)) + WRITE_ONCE(zone->nr_free_highatomic, + zone->nr_free_highatomic + nr_pages); } /* Used for pages not on another list */ -static inline void add_to_free_list_tail(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void __add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + + if (tail) + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + else + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -681,16 +678,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, * allocation again (e.g., optimization for memory onlining). 
*/ static inline void move_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) + unsigned int order, int old_mt, int new_mt) { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->buddy_list, &area->free_list[migratetype]); + /* Free page moving can fail, so it happens before the type update */ + VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), old_mt, 1 << order); + + list_move_tail(&page->buddy_list, &area->free_list[new_mt]); + + account_freepages(zone, -(1 << order), old_mt); + account_freepages(zone, 1 << order, new_mt); } -static inline void del_page_from_free_list(struct page *page, struct zone *zone, - unsigned int order) +static inline void __del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) { + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + /* clear reported state and update reported page count */ if (page_reported(page)) __ClearPageReported(page); @@ -701,6 +710,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, zone->free_area[order].nr_free--; } +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + __del_page_from_free_list(page, zone, order, migratetype); + account_freepages(zone, -(1 << order), migratetype); +} + static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { @@ -772,16 +788,17 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (likely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); + account_freepages(zone, 1 << order, migratetype); + while (order < MAX_ORDER) { + int buddy_mt = migratetype; + if (compaction_capture(capc, page, order, migratetype)) { - __mod_zone_freepage_state(zone, -(1 << order), - migratetype); + account_freepages(zone, -(1 << order), migratetype); return; } @@ -796,11 +813,11 @@ static inline void __free_one_page(struct page *page, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting. */ - int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); - if (migratetype != buddy_mt - && (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt))) + if (migratetype != buddy_mt && + (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) goto done_merging; } @@ -809,9 +826,19 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) - clear_page_guard(zone, buddy, order, migratetype); + clear_page_guard(zone, buddy, order); else - del_page_from_free_list(buddy, zone, order); + __del_page_from_free_list(buddy, zone, order, buddy_mt); + + if (unlikely(buddy_mt != migratetype)) { + /* + * Match buddy type. This ensures that an + * expand() down the line puts the sub-blocks + * on the right freelists. 
+ */ + set_pageblock_migratetype(buddy, migratetype); + } + combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -828,74 +855,13 @@ static inline void __free_one_page(struct page *page, else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); - if (to_tail) - add_to_free_list_tail(page, zone, order, migratetype); - else - add_to_free_list(page, zone, order, migratetype); + __add_to_free_list(page, zone, order, migratetype, to_tail); /* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) page_reporting_notify_free(order); } -/** - * split_free_page() -- split a free page at split_pfn_offset - * @free_page: the original free page - * @order: the order of the page - * @split_pfn_offset: split offset within the page - * - * Return -ENOENT if the free page is changed, otherwise 0 - * - * It is used when the free page crosses two pageblocks with different migratetypes - * at split_pfn_offset within the page. The split free page will be put into - * separate migratetype lists afterwards. Otherwise, the function achieves - * nothing. - */ -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset) -{ - struct zone *zone = page_zone(free_page); - unsigned long free_page_pfn = page_to_pfn(free_page); - unsigned long pfn; - unsigned long flags; - int free_page_order; - int mt; - int ret = 0; - - if (split_pfn_offset == 0) - return ret; - - spin_lock_irqsave(&zone->lock, flags); - - if (!PageBuddy(free_page) || buddy_order(free_page) != order) { - ret = -ENOENT; - goto out; - } - - mt = get_pfnblock_migratetype(free_page, free_page_pfn); - if (likely(!is_migrate_isolate(mt))) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - - del_page_from_free_list(free_page, zone, order); - for (pfn = free_page_pfn; - pfn < free_page_pfn + (1UL << order);) { - int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); - - free_page_order = min_t(unsigned int, - pfn ? __ffs(pfn) : order, - __fls(split_pfn_offset)); - __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, - mt, FPI_NONE); - pfn += 1UL << free_page_order; - split_pfn_offset -= (1UL << free_page_order); - /* we have done the first part, now switch to second part */ - if (split_pfn_offset == 0) - split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); - } -out: - spin_unlock_irqrestore(&zone->lock, flags); - return ret; -} /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. 
The caller must do a detailed @@ -1202,7 +1168,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, { unsigned long flags; unsigned int order; - bool isolated_pageblocks; struct page *page; /* @@ -1215,7 +1180,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, pindex = pindex - 1; spin_lock_irqsave(&zone->lock, flags); - isolated_pageblocks = has_isolate_pageblock(zone); while (count > 0) { struct list_head *list; @@ -1231,23 +1195,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; do { + unsigned long pfn; int mt; page = list_last_entry(list, struct page, pcp_list); - mt = get_pcppage_migratetype(page); + pfn = page_to_pfn(page); + mt = get_pfnblock_migratetype(page, pfn); /* must delete to avoid corrupting pcp list */ list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + __free_one_page(page, pfn, zone, order, mt, FPI_NONE); trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); } @@ -1255,47 +1215,51 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock_irqrestore(&zone->lock, flags); } -static void free_one_page(struct zone *zone, - struct page *page, unsigned long pfn, - unsigned int order, - int migratetype, fpi_t fpi_flags) +/* Split a multi-block free page into its individual pageblocks. */ +static void split_large_buddy(struct zone *zone, struct page *page, + unsigned long pfn, int order, fpi_t fpi) +{ + unsigned long end = pfn + (1 << order); + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order)); + /* Caller removed page from freelist, buddy info cleared! */ + VM_WARN_ON_ONCE(PageBuddy(page)); + + if (order > pageblock_order) + order = pageblock_order; + + do { + int mt = get_pfnblock_migratetype(page, pfn); + + __free_one_page(page, pfn, zone, order, mt, fpi); + pfn += 1 << order; + if (pfn == end) + break; + page = pfn_to_page(pfn); + } while (1); +} + +static void free_one_page(struct zone *zone, struct page *page, + unsigned long pfn, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; spin_lock_irqsave(&zone->lock, flags); - if (unlikely(has_isolate_pageblock(zone) || - is_migrate_isolate(migratetype))) { - migratetype = get_pfnblock_migratetype(page, pfn); - } - __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); } static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags) { - unsigned long flags; - int migratetype; unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); if (!free_pages_prepare(page, order)) return; - /* - * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here - * is used to avoid calling get_pfnblock_migratetype() under the lock. - * This will reduce the lock holding time. 
- */ - migratetype = get_pfnblock_migratetype(page, pfn); - - spin_lock_irqsave(&zone->lock, flags); - if (unlikely(has_isolate_pageblock(zone) || - is_migrate_isolate(migratetype))) { - migratetype = get_pfnblock_migratetype(page, pfn); - } - __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); - spin_unlock_irqrestore(&zone->lock, flags); + free_one_page(zone, page, pfn, order, fpi_flags); __count_vm_events(PGFREE, 1 << order); } @@ -1402,10 +1366,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, * * -- nyc */ -static inline void expand(struct zone *zone, struct page *page, - int low, int high, int migratetype) +static inline unsigned int expand(struct zone *zone, struct page *page, int low, + int high, int migratetype) { - unsigned long size = 1 << high; + unsigned int size = 1 << high; + unsigned int nr_added = 0; while (high > low) { high--; @@ -1418,12 +1383,26 @@ static inline void expand(struct zone *zone, struct page *page, * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - if (set_page_guard(zone, &page[size], high, migratetype)) + if (set_page_guard(zone, &page[size], high)) continue; - add_to_free_list(&page[size], zone, high, migratetype); + __add_to_free_list(&page[size], zone, high, migratetype, false); set_buddy_order(&page[size], high); + nr_added += size; } + + return nr_added; +} + +static __always_inline void page_del_and_expand(struct zone *zone, + struct page *page, int low, + int high, int migratetype) +{ + int nr_pages = 1 << high; + + __del_page_from_free_list(page, zone, high, migratetype); + nr_pages -= expand(zone, page, low, high, migratetype); + account_freepages(zone, -nr_pages, migratetype); } static void check_new_page_bad(struct page *page) @@ -1612,9 +1591,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order); - expand(zone, page, order, current_order, migratetype); - set_pcppage_migratetype(page, migratetype); + + page_del_and_expand(zone, page, order, current_order, + migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@ -1649,30 +1628,23 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Move the free pages in a range to the freelist tail of the requested type. - * Note that start_page and end_pages are not aligned on a pageblock - * boundary. If alignment is required, use move_freepages_block() + * Change the type of a block and move all its free pages to that + * type's freelist. */ -static int move_freepages(struct zone *zone, - unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int *num_movable) +static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, + int old_mt, int new_mt) { struct page *page; - unsigned long pfn; + unsigned long pfn, end_pfn; unsigned int order; int pages_moved = 0; - for (pfn = start_pfn; pfn <= end_pfn;) { + VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); + end_pfn = pageblock_end_pfn(start_pfn); + + for (pfn = start_pfn; pfn < end_pfn;) { page = pfn_to_page(pfn); if (!PageBuddy(page)) { - /* - * We assume that pages that could be isolated for - * migration are movable. But we don't actually try - * isolating, as that would be expensive. 
- */ - if (num_movable && - (PageLRU(page) || __PageMovable(page))) - (*num_movable)++; pfn++; continue; } @@ -1682,36 +1654,166 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = buddy_order(page); - move_to_free_list(page, zone, order, migratetype); + + move_to_free_list(page, zone, order, old_mt, new_mt); + pfn += 1 << order; pages_moved += 1 << order; } + set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); + return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable) +static bool prep_move_freepages_block(struct zone *zone, struct page *page, + unsigned long *start_pfn, + int *num_free, int *num_movable) { - unsigned long start_pfn, end_pfn, pfn; + unsigned long pfn, start, end; + + pfn = page_to_pfn(page); + start = pageblock_start_pfn(pfn); + end = pageblock_end_pfn(pfn); + + /* + * The caller only has the lock for @zone, don't touch ranges + * that straddle into other zones. While we could move part of + * the range that's inside the zone, this call is usually + * accompanied by other operations such as migratetype updates + * which also should be locked. + */ + if (!zone_spans_pfn(zone, start)) + return false; + if (!zone_spans_pfn(zone, end - 1)) + return false; - if (num_movable) + *start_pfn = start; + + if (num_free) { + *num_free = 0; *num_movable = 0; + for (pfn = start; pfn < end;) { + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + int nr = 1 << buddy_order(page); - pfn = page_to_pfn(page); - start_pfn = pageblock_start_pfn(pfn); - end_pfn = pageblock_end_pfn(pfn) - 1; + *num_free += nr; + pfn += nr; + continue; + } + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (PageLRU(page) || __PageMovable(page)) + (*num_movable)++; + pfn++; + } + } - /* Do not cross zone boundaries */ - if (!zone_spans_pfn(zone, start_pfn)) - start_pfn = pfn; - if (!zone_spans_pfn(zone, end_pfn)) - return 0; + return true; +} + +static int move_freepages_block(struct zone *zone, struct page *page, + int old_mt, int new_mt) +{ + unsigned long start_pfn; + + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return -1; - return move_freepages(zone, start_pfn, end_pfn, migratetype, - num_movable); + return __move_freepages_block(zone, start_pfn, old_mt, new_mt); } +#ifdef CONFIG_MEMORY_ISOLATION +/* Look for a buddy that straddles start_pfn */ +static unsigned long find_large_buddy(unsigned long start_pfn) +{ + int order = 0; + struct page *page; + unsigned long pfn = start_pfn; + + while (!PageBuddy(page = pfn_to_page(pfn))) { + /* Nothing found */ + if (++order > MAX_PAGE_ORDER) + return start_pfn; + pfn &= ~0UL << order; + } + + /* + * Found a preceding buddy, but does it straddle? + */ + if (pfn + (1 << buddy_order(page)) > start_pfn) + return pfn; + + /* Nothing found */ + return start_pfn; +} + +/** + * move_freepages_block_isolate - move free pages in block for page isolation + * @zone: the zone + * @page: the pageblock page + * @migratetype: migratetype to set on the pageblock + * + * This is similar to move_freepages_block(), but handles the special + * case encountered in page isolation, where the block of interest + * might be part of a larger buddy spanning multiple pageblocks. 
+ * + * Unlike the regular page allocator path, which moves pages while + * stealing buddies off the freelist, page isolation is interested in + * arbitrary pfn ranges that may have overlapping buddies on both ends. + * + * This function handles that. Straddling buddies are split into + * individual pageblocks. Only the block of interest is moved. + * + * Returns %true if pages could be moved, %false otherwise. + */ +bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) +{ + unsigned long start_pfn, pfn; + + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return false; + + /* No splits needed if buddies can't span multiple blocks */ + if (pageblock_order == MAX_PAGE_ORDER) + goto move; + + /* We're a tail block in a larger buddy */ + pfn = find_large_buddy(start_pfn); + if (pfn != start_pfn) { + struct page *buddy = pfn_to_page(pfn); + int order = buddy_order(buddy); + + del_page_from_free_list(buddy, zone, order, + get_pfnblock_migratetype(buddy, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, buddy, pfn, order, FPI_NONE); + return true; + } + + /* We're the starting block of a larger buddy */ + if (PageBuddy(page) && buddy_order(page) > pageblock_order) { + int order = buddy_order(page); + + del_page_from_free_list(page, zone, order, + get_pfnblock_migratetype(page, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, page, pfn, order, FPI_NONE); + return true; + } +move: + __move_freepages_block(zone, start_pfn, + get_pfnblock_migratetype(page, start_pfn), + migratetype); + return true; +} +#endif /* CONFIG_MEMORY_ISOLATION */ + static void change_pageblock_range(struct page *pageblock_page, int start_order, int migratetype) { @@ -1724,35 +1826,49 @@ static void change_pageblock_range(struct page *pageblock_page, } /* - * When we are falling back to another migratetype during allocation, try to - * steal extra free pages from the same pageblocks to satisfy further - * allocations, instead of polluting multiple pageblocks. - * - * If we are stealing a relatively large buddy page, it is likely there will - * be more free pages in the pageblock, so try to steal them all. For - * reclaimable and unmovable allocations, we steal regardless of page size, - * as fragmentation caused by those allocations polluting movable pageblocks - * is worse than movable allocations stealing from unmovable and reclaimable - * pageblocks. + * When we are falling back to another migratetype during allocation, should we + * try to claim an entire block to satisfy further allocations, instead of + * polluting multiple pageblocks? */ -static bool can_steal_fallback(unsigned int order, int start_mt) +static bool should_try_claim_block(unsigned int order, int start_mt) { /* * Leaving this order check is intended, although there is * relaxed order check in next check. The reason is that - * we can actually steal whole pageblock if this condition met, + * we can actually claim the whole pageblock if this condition met, * but, below check doesn't guarantee it and that is just heuristic * so could be changed anytime. */ if (order >= pageblock_order) return true; - if (order >= pageblock_order / 2 || - start_mt == MIGRATE_RECLAIMABLE || - start_mt == MIGRATE_UNMOVABLE || - page_group_by_mobility_disabled) + /* + * Above a certain threshold, always try to claim, as it's likely there + * will be more free pages in the pageblock. 
+ */ + if (order >= pageblock_order / 2) return true; + /* + * Unmovable/reclaimable allocations would cause permanent + * fragmentations if they fell back to allocating from a movable block + * (polluting it), so we try to claim the whole block regardless of the + * allocation size. Later movable allocations can always steal from this + * block, which is less problematic. + */ + if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) + return true; + + if (page_group_by_mobility_disabled) + return true; + + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, we just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. + */ return false; } @@ -1794,33 +1910,29 @@ static inline bool boost_watermark(struct zone *zone) } /* - * This function implements actual steal behaviour. If order is large enough, - * we can steal whole pageblock. If not, we first move freepages in this - * pageblock to our migratetype and determine how many already-allocated pages - * are there in the pageblock with a compatible migratetype. If at least half - * of pages are free or compatible, we can change migratetype of the pageblock - * itself, so pages freed in the future will be put on the correct free list. + * This function implements actual block claiming behaviour. If order is large + * enough, we can claim the whole pageblock for the requested migratetype. If + * not, we check the pageblock for constituent pages; if at least half of the + * pages are free or compatible, we can still claim the whole block, so pages + * freed in the future will be put on the correct free list. */ -static void steal_suitable_fallback(struct zone *zone, struct page *page, - unsigned int alloc_flags, int start_type, bool whole_block) +static struct page * +try_to_claim_block(struct zone *zone, struct page *page, + int current_order, int order, int start_type, + int block_type, unsigned int alloc_flags) { - unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; - int old_block_type; - - old_block_type = get_pageblock_migratetype(page); - - /* - * This can happen due to races and we want to prevent broken - * highatomic accounting. 
- */ - if (is_migrate_highatomic(old_block_type)) - goto single_page; + unsigned long start_pfn; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + unsigned int nr_added; + + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); - goto single_page; + nr_added = expand(zone, page, order, current_order, start_type); + account_freepages(zone, nr_added, start_type); + return page; } /* @@ -1831,15 +1943,10 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); - /* We are not allowed to try stealing from the whole block */ - if (!whole_block) - goto single_page; - - free_pages = move_freepages_block(zone, page, start_type, - &movable_pages); /* moving whole block can fail due to zone boundary conditions */ - if (!free_pages) - goto single_page; + if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, + &movable_pages)) + return NULL; /* * Determine how many pages are compatible with our allocation. @@ -1856,7 +1963,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * vice versa, be conservative since we can't distinguish the * exact migratetype of non-movable pages. */ - if (old_block_type == MIGRATE_MOVABLE) + if (block_type == MIGRATE_MOVABLE) alike_pages = pageblock_nr_pages - (free_pages + movable_pages); else @@ -1867,23 +1974,24 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * compatible migratability as our allocation, claim the whole block. */ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, start_type); - - return; + page_group_by_mobility_disabled) { + __move_freepages_block(zone, start_pfn, block_type, start_type); + return __rmqueue_smallest(zone, order, start_type); + } -single_page: - move_to_free_list(page, zone, current_order, start_type); + return NULL; } /* * Check whether there is a suitable fallback freepage with requested order. - * If only_stealable is true, this function returns fallback_mt only if - * we can steal other freepages all together. This would help to reduce + * Sets *claim_block to instruct the caller whether it should convert a whole + * pageblock to the returned migratetype. + * If only_claim is true, this function returns fallback_mt only if + * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. 
*/ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal) + int migratetype, bool only_claim, bool *claim_block) { int i; int fallback_mt; @@ -1891,19 +1999,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, if (area->nr_free == 0) return -1; - *can_steal = false; + *claim_block = false; for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; if (free_area_empty(area, fallback_mt)) continue; - if (can_steal_fallback(order, migratetype)) - *can_steal = true; - - if (!only_stealable) - return fallback_mt; + if (should_try_claim_block(order, migratetype)) + *claim_block = true; - if (*can_steal) + if (*claim_block || !only_claim) return fallback_mt; } @@ -1911,10 +2016,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, } /* - * Reserve a pageblock for exclusive use of high-order atomic allocations if - * there are no empty page blocks that contain a page with a suitable order + * Reserve the pageblock(s) surrounding an allocation request for + * exclusive use of high-order atomic allocations if there are no + * empty page blocks that contain a page with a suitable order */ -static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone) { int mt; unsigned long max_managed, flags; @@ -1940,10 +2047,16 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) /* Yoink! */ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (migratetype_is_mergeable(mt)) { + if (!migratetype_is_mergeable(mt)) + goto out_unlock; + + if (order < pageblock_order) { + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) + goto out_unlock; zone->nr_reserved_highatomic += pageblock_nr_pages; - set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); + } else { + change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += 1 << order; } out_unlock: @@ -1956,7 +2069,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. * - * If @force is true, try to unreserve a pageblock even though highatomic + * If @force is true, try to unreserve pageblocks even though highatomic * pageblock is exhausted. */ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, @@ -1968,7 +2081,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, struct zone *zone; struct page *page; int order; - bool ret; + int ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { @@ -1983,30 +2096,22 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); + unsigned long size; page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; /* - * In page freeing path, migratetype change is racy so - * we can counter several free pages in a pageblock - * in this loop although we changed the pageblock type - * from highatomic to ac->migratetype. So we should - * adjust the count once. 
+ * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. */ - if (is_migrate_highatomic_page(page)) { - /* - * It should never happen but changes to - * locking could inadvertently allow a per-cpu - * drain to add pages to MIGRATE_HIGHATOMIC - * while unreserving so be safe and watch for - * underflows. - */ - zone->nr_reserved_highatomic -= min( - pageblock_nr_pages, - zone->nr_reserved_highatomic); - } + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; /* * Convert to ac->migratetype and avoid the normal @@ -2017,10 +2122,24 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ - set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype, - NULL); - if (ret) { + if (order < pageblock_order) + ret = move_freepages_block(zone, page, + MIGRATE_HIGHATOMIC, + ac->migratetype); + else { + move_to_free_list(page, zone, order, + MIGRATE_HIGHATOMIC, + ac->migratetype); + change_pageblock_range(page, order, + ac->migratetype); + ret = 1; + } + /* + * Reserving the block(s) already succeeded, + * so this should not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { spin_unlock_irqrestore(&zone->lock, flags); return ret; } @@ -2032,17 +2151,15 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, } /* - * Try finding a free buddy page on the fallback list and put it on the free - * list of requested migratetype, possibly along with other pages from the same - * block, depending on fragmentation avoidance heuristics. Returns true if - * fallback was found so that __rmqueue_smallest() can grab it. + * Try to allocate from some fallback migratetype by claiming the entire block, + * i.e. converting it to the allocation's start migratetype. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. */ -static __always_inline bool -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, +static __always_inline struct page * +__rmqueue_claim(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { struct free_area *area; @@ -2050,7 +2167,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool can_steal; + bool claim_block; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2069,62 +2186,71 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); + start_migratetype, false, &claim_block); if (fallback_mt == -1) continue; - /* - * We cannot steal all free pages from the pageblock and the - * requested migratetype is movable. In that case it's better to - * steal and split the smallest available page instead of the - * largest available page, because even if the next movable - * allocation falls back into a different pageblock than this - * one, it won't cause permanent fragmentation. 
- */ - if (!can_steal && start_migratetype == MIGRATE_MOVABLE - && current_order > order) - goto find_smallest; + if (!claim_block) + break; - goto do_steal; + page = get_page_from_free_area(area, fallback_mt); + page = try_to_claim_block(zone, page, current_order, order, + start_migratetype, fallback_mt, + alloc_flags); + if (page) { + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } } - return false; + return NULL; +} + +/* + * Try to steal a single page from some fallback migratetype. Leave the rest of + * the block as its current migratetype, potentially causing fragmentation. + */ +static __always_inline struct page * +__rmqueue_steal(struct zone *zone, int order, int start_migratetype) +{ + struct free_area *area; + int current_order; + struct page *page; + int fallback_mt; + bool claim_block; -find_smallest: for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); - if (fallback_mt != -1) - break; - } - - /* - * This should not happen - we already found a suitable fallback - * when looking for the largest page. - */ - VM_BUG_ON(current_order > MAX_ORDER); - -do_steal: - page = get_page_from_free_area(area, fallback_mt); - - steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, - can_steal); - - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); + start_migratetype, false, &claim_block); + if (fallback_mt == -1) + continue; - return true; + page = get_page_from_free_area(area, fallback_mt); + page_del_and_expand(zone, page, order, current_order, fallback_mt); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } + return NULL; } +enum rmqueue_mode { + RMQUEUE_NORMAL, + RMQUEUE_CMA, + RMQUEUE_CLAIM, + RMQUEUE_STEAL, +}; + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ static __always_inline struct page * __rmqueue(struct zone *zone, unsigned int order, int migratetype, - unsigned int alloc_flags) + unsigned int alloc_flags, enum rmqueue_mode *mode) { struct page *page; @@ -2142,17 +2268,49 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, return page; } } -retry: - page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) { - if (alloc_flags & ALLOC_CMA) - page = __rmqueue_cma_fallback(zone, order); - if (!page && __rmqueue_fallback(zone, order, migratetype, - alloc_flags)) - goto retry; + /* + * First try the freelists of the requested migratetype, then try + * fallbacks modes with increasing levels of fragmentation risk. + * + * The fallback logic is expensive and rmqueue_bulk() calls in + * a loop with the zone->lock held, meaning the freelists are + * not subject to any outside changes. Remember in *mode where + * we found pay dirt, to save us the search on the next call. + */ + switch (*mode) { + case RMQUEUE_NORMAL: + page = __rmqueue_smallest(zone, order, migratetype); + if (page) + return page; + fallthrough; + case RMQUEUE_CMA: + if (alloc_flags & ALLOC_CMA) { + page = __rmqueue_cma_fallback(zone, order); + if (page) { + *mode = RMQUEUE_CMA; + return page; + } + } + fallthrough; + case RMQUEUE_CLAIM: + page = __rmqueue_claim(zone, order, migratetype, alloc_flags); + if (page) { + /* Replenished preferred freelist, back to normal mode. 
*/ + *mode = RMQUEUE_NORMAL; + return page; + } + fallthrough; + case RMQUEUE_STEAL: + if (!(alloc_flags & ALLOC_NOFRAGMENT)) { + page = __rmqueue_steal(zone, order, migratetype); + if (page) { + *mode = RMQUEUE_STEAL; + return page; + } + } } - return page; + return NULL; } /* @@ -2164,13 +2322,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; unsigned long flags; int i; spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, - alloc_flags); + alloc_flags, &rmqm); if (unlikely(page == NULL)) break; @@ -2185,12 +2344,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->pcp_list, list); - if (is_migrate_cma(get_pcppage_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); } - - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); return i; @@ -2385,19 +2539,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -static bool free_unref_page_prepare(struct page *page, unsigned long pfn, - unsigned int order) -{ - int migratetype; - - if (!free_pages_prepare(page, order)) - return false; - - migratetype = get_pfnblock_migratetype(page, pfn); - set_pcppage_migratetype(page, migratetype); - return true; -} - static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) { int min_nr_free, max_nr_free; @@ -2528,7 +2669,7 @@ void free_unref_page(struct page *page, unsigned int order) struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); - int migratetype, pcpmigratetype; + int migratetype; if (page_from_dynamic_pool(page)) { dynamic_pool_free_page(page); @@ -2540,7 +2681,7 @@ void free_unref_page(struct page *page, unsigned int order) return; } - if (!free_unref_page_prepare(page, pfn, order)) + if (!free_pages_prepare(page, order)) return; /* @@ -2550,23 +2691,23 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. 
Otherwise, we may have to free * excessively into the page allocator */ - migratetype = pcpmigratetype = get_pcppage_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, FPI_NONE); return; } - pcpmigratetype = MIGRATE_MOVABLE; + migratetype = MIGRATE_MOVABLE; } zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); + free_unref_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); } pcp_trylock_finish(UP_flags); } @@ -2579,7 +2720,7 @@ void free_unref_folios(struct folio_batch *folios) unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int i, j, migratetype; + int i, j; /* Prepare folios for freeing */ for (i = 0, j = 0; i < folios->nr; i++) { @@ -2592,18 +2733,16 @@ void free_unref_folios(struct folio_batch *folios) continue; } - if (!free_unref_page_prepare(&folio->page, pfn, order)) + if (!free_pages_prepare(&folio->page, order)) continue; /* - * Free isolated folios and orders not handled on the PCP - * directly to the allocator, see comment in free_unref_page. + * Free orders not handled on the PCP directly to the + * allocator. */ - migratetype = get_pcppage_migratetype(&folio->page); - if (!pcp_allowed_order(order) || - is_migrate_isolate(migratetype)) { - free_one_page(folio_zone(folio), &folio->page, pfn, - order, migratetype, FPI_NONE); + if (!pcp_allowed_order(order)) { + free_one_page(folio_zone(folio), &folio->page, + pfn, order, FPI_NONE); continue; } folio->private = (void *)(unsigned long)order; @@ -2616,16 +2755,31 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); + unsigned long pfn = folio_pfn(folio); unsigned int order = (unsigned long)folio->private; + int migratetype; folio->private = NULL; - migratetype = get_pcppage_migratetype(&folio->page); + migratetype = get_pfnblock_migratetype(&folio->page, pfn); /* Different zone requires a different pcp lock */ - if (zone != locked_zone) { + if (zone != locked_zone || + is_migrate_isolate(migratetype)) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); + locked_zone = NULL; + pcp = NULL; + } + + /* + * Free isolated pages directly to the + * allocator, see comment in free_unref_page. 
+ */ + if (is_migrate_isolate(migratetype)) { + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); + continue; } /* @@ -2636,10 +2790,8 @@ void free_unref_folios(struct folio_batch *folios) pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, &folio->page, - folio_pfn(folio), order, - migratetype, FPI_NONE); - locked_zone = NULL; + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); continue; } locked_zone = zone; @@ -2702,11 +2854,9 @@ int __isolate_free_page(struct page *page, unsigned int order) watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; - - __mod_zone_freepage_state(zone, -(1UL << order), mt); } - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, mt); /* * Set the pageblock if the isolated page is at least half of a @@ -2721,8 +2871,8 @@ int __isolate_free_page(struct page *page, unsigned int order) * with others) */ if (migratetype_is_mergeable(mt)) - set_pageblock_migratetype(page, - MIGRATE_MOVABLE); + move_freepages_block(zone, page, mt, + MIGRATE_MOVABLE); } } @@ -2790,7 +2940,9 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; + + page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm); /* * If the allocation fails, allow OOM handling and @@ -2806,8 +2958,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return NULL; } } - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); @@ -2989,11 +3139,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z, /* * If the caller does not have rights to reserves below the min - * watermark then subtract the high-atomic reserves. This will - * over-estimate the size of the atomic reserve but it avoids a search. + * watermark then subtract the free pages reserved for highatomic. */ if (likely(!(alloc_flags & ALLOC_RESERVES))) - unusable_free += z->nr_reserved_highatomic; + unusable_free += READ_ONCE(z->nr_free_highatomic); #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ @@ -3381,7 +3530,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * if the pageblock should be reserved for the future */ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, zone); + reserve_highatomic_pageblock(page, order, zone); return page; } else { @@ -6506,9 +6655,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list) * @migratetype: using migratetype to filter the type of migration in * trace_mm_alloc_contig_migrate_range_info. */ -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end, int migratetype) { /* This function is based on compact_zone() from compaction.c. 
*/ unsigned int nr_reclaimed; @@ -6517,7 +6665,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, int ret = 0; struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = cc->gfp_mask, }; struct page *page; unsigned long total_mapped = 0; @@ -6582,6 +6730,94 @@ int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } +static void split_free_pages(struct list_head *list, gfp_t gfp_mask) +{ + int order; + + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; + int nr_pages = 1 << order; + + list_for_each_entry_safe(page, next, &list[order], lru) { + int i; + + post_alloc_hook(page, order, gfp_mask); + if (!order) + continue; + + split_page(page, order); + + /* Add all subpages to the order-0 head, in sequence. */ + list_del(&page->lru); + for (i = 0; i < nr_pages; i++) + list_add_tail(&page[i].lru, &list[0]); + } + } +} + +static void split_pages_to_order0(struct list_head *list) +{ + int order; + + for (order = 1; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; + int nr_pages = 1 << order; + + list_for_each_entry_safe(page, next, &list[order], lru) { + int i; + + list_del(&page->lru); + for (i = 0; i < nr_pages; i++) + list_add_tail(&page[i].lru, &list[0]); + } + } +} + +static void free_pfn_range(unsigned long start, unsigned long end, gfp_t gfp_mask) +{ + struct page *page; + unsigned long i; + + page = pfn_to_page(start); + for (i = 0; i < end - start; ++i, ++page) + post_alloc_hook(page, 0, gfp_mask); + free_contig_range(start, end - start); +} + +static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) +{ + const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; + const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + + /* + * We are given the range to allocate; node, mobility and placement + * hints are irrelevant at this point. We'll simply ignore them. + */ + gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | + __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); + + /* + * We only support most reclaim flags (but not NOFAIL/NORETRY), and + * selected action flags. + */ + if (gfp_mask & ~(reclaim_mask | action_mask)) + return -EINVAL; + + /* + * Flags to control page compaction/migration/reclaim, to free up our + * page range. Migratable pages are movable, __GFP_MOVABLE is implied + * for them. + * + * Traditionally we always had __GFP_HARDWALL|__GFP_RETRY_MAYFAIL set, + * keep doing that to not degrade callers. + */ + *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | + __GFP_HARDWALL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + return 0; +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -6590,7 +6826,9 @@ int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. - * @gfp_mask: GFP mask to use during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * * The PFN range does not have to be pageblock aligned. The PFN range must * belong to a single zone. 
@@ -6606,8 +6844,8 @@ int __alloc_contig_migrate_range(struct compact_control *cc, int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { + int range_order = ilog2(end - start); unsigned long outer_start, outer_end; - int order; int ret = 0; struct compact_control cc = { @@ -6617,11 +6855,22 @@ int alloc_contig_range(unsigned long start, unsigned long end, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, - .gfp_mask = current_gfp_context(gfp_mask), .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + gfp_mask = current_gfp_context(gfp_mask); + if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) + return -EINVAL; + + /* __GFP_COMP may only be used for certain aligned+sized ranges. */ + if ((gfp_mask & __GFP_COMP) && + (!is_power_of_2(end - start) || !IS_ALIGNED(start, 1 << range_order))) { + WARN_ONCE(true, "PFN range: requested [%lu, %lu) is not suitable for __GFP_COMP\n", + start, end); + return -EINVAL; + } + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -6690,29 +6939,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * We don't have to hold zone->lock here because the pages are * isolated thus they won't get removed from buddy. */ - - order = 0; - outer_start = start; - while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order > MAX_ORDER) { - outer_start = start; - break; - } - outer_start &= ~0UL << order; - } - - if (outer_start != start) { - order = buddy_order(pfn_to_page(outer_start)); - - /* - * outer_start page could be small order buddy page and - * it doesn't include start page. Adjust outer_start - * in this case to report failed page properly - * on tracepoint in test_pages_isolated() - */ - if (outer_start + (1UL << order) <= start) - outer_start = start; - } + outer_start = find_large_buddy(start); /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, 0)) { @@ -6727,12 +6954,34 @@ int alloc_contig_range(unsigned long start, unsigned long end, goto done; } - /* Free head and tail (if any) */ - if (start != outer_start) - free_contig_range(outer_start, start - outer_start); - if (end != outer_end) - free_contig_range(end, outer_end - end); + /* + * With __GFP_COMP and the requested order < MAX_PAGE_ORDER, + * isolated free pages can have higher order than the requested + * one. Use split_free_pages() to free out of range pages. + */ + if (!(gfp_mask & __GFP_COMP)) { + split_free_pages(cc.freepages, gfp_mask); + + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + } else { + struct page *head = pfn_to_page(start); + + if ((outer_start != start) || (end != outer_end)) { + split_pages_to_order0(cc.freepages); + if (start != outer_start) + free_pfn_range(outer_start, start, gfp_mask); + if (end != outer_end) + free_pfn_range(end, outer_end, gfp_mask); + } + + check_new_pages(head, range_order); + prep_new_page(head, range_order, gfp_mask, 0); + } done: undo_isolate_page_range(start, end, migratetype); return ret; @@ -6782,7 +7031,9 @@ static bool zone_spans_last_pfn(const struct zone *zone, /** * alloc_contig_pages() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction + * @gfp_mask: GFP mask. 
Node/zone/placement hints limit the search; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * @nid: Target node * @nodemask: Mask for other possible nodes * @@ -6841,6 +7092,18 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, void free_contig_range(unsigned long pfn, unsigned long nr_pages) { unsigned long count = 0; + struct folio *folio = pfn_folio(pfn); + + if (folio_test_large(folio)) { + int expected = folio_nr_pages(folio); + + if (nr_pages == expected) + folio_put(folio); + else + WARN(true, "PFN %lu: nr_pages %lu != expected %d\n", + pfn, nr_pages, expected); + return; + } for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -6946,8 +7209,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); + VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); order = buddy_order(page); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); @@ -6975,6 +7239,14 @@ bool is_free_buddy_page(struct page *page) EXPORT_SYMBOL(is_free_buddy_page); #ifdef CONFIG_MEMORY_FAILURE +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) +{ + __add_to_free_list(page, zone, order, migratetype, tail); + account_freepages(zone, 1 << order, migratetype); +} + /* * Break down a higher-order page in sub-pages, and keep our target out of * buddy allocator. @@ -6984,28 +7256,24 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, int migratetype) { unsigned long size = 1 << high; - struct page *current_buddy, *next_page; + struct page *current_buddy; while (high > low) { high--; size >>= 1; if (target >= &page[size]) { - next_page = page + size; current_buddy = page; + page = page + size; } else { - next_page = page; current_buddy = page + size; } - page = next_page; - if (set_page_guard(zone, current_buddy, high, migratetype)) + if (set_page_guard(zone, current_buddy, high)) continue; - if (current_buddy != target) { - add_to_free_list(current_buddy, zone, high, migratetype); - set_buddy_order(current_buddy, high); - } + add_to_free_list(current_buddy, zone, high, migratetype, false); + set_buddy_order(current_buddy, high); } } @@ -7030,12 +7298,11 @@ bool take_page_off_buddy(struct page *page) int migratetype = get_pfnblock_migratetype(page_head, pfn_head); - del_page_from_free_list(page_head, zone, page_order); + del_page_from_free_list(page_head, zone, page_order, + migratetype); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } @@ -7052,13 +7319,14 @@ bool take_page_off_buddy(struct page *page) bool put_page_back_buddy(struct page *page) { struct zone *zone = page_zone(page); - unsigned long pfn = page_to_pfn(page); unsigned long flags; - int migratetype = get_pfnblock_migratetype(page, pfn); bool ret = false; spin_lock_irqsave(&zone->lock, flags); if (put_page_testzero(page)) { + unsigned long pfn = page_to_pfn(page); + int migratetype = get_pfnblock_migratetype(page, pfn); + ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); if (TestClearPageHWPoison(page)) { @@ 
-7139,7 +7407,7 @@ static bool try_to_accept_memory_one(struct zone *zone) list_del(&page->lru); last = list_empty(&zone->unaccepted_pages); - __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); @@ -7197,7 +7465,7 @@ static bool __free_unaccepted(struct page *page) spin_lock_irqsave(&zone->lock, flags); first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); - __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c634db34978366a4344cf449b883bd8ebedb4c5d..ddee0901c77f24dff395e37d1cd42b31918d445b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -179,15 +179,11 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, migratetype, isol_flags); if (!unmovable) { - unsigned long nr_pages; - int mt = get_pageblock_migratetype(page); - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); + if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, - NULL); - - __mod_zone_freepage_state(zone, -nr_pages, mt); spin_unlock_irqrestore(&zone->lock, flags); return 0; } @@ -207,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ static void unset_migratetype_isolate(struct page *page, int migratetype) { struct zone *zone; - unsigned long flags, nr_pages; + unsigned long flags; bool isolated_page = false; unsigned int order; struct page *buddy; @@ -253,12 +249,15 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * allocation. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype, NULL); - __mod_zone_freepage_state(zone, nr_pages, migratetype); - } - set_pageblock_migratetype(page, migratetype); - if (isolated_page) + /* + * Isolating this block already succeeded, so this + * should not fail on zone boundaries. + */ + WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); + } else { + set_pageblock_migratetype(page, migratetype); __putback_isolated_page(page, order, migratetype); + } zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); @@ -285,7 +284,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross * @flags: isolation flags - * @gfp_flags: GFP flags used for migrating pages * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() @@ -304,8 +302,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. 
*/ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before, bool skip_isolation, - int migratetype) + bool isolate_before, bool skip_isolation, int migratetype) { unsigned long start_pfn; unsigned long isolate_pageblock; @@ -370,108 +367,52 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, VM_BUG_ON(!page); pfn = page_to_pfn(page); - /* - * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any - * free pages in [start_pfn, boundary_pfn), its head page will - * always be in the range. - */ + if (PageBuddy(page)) { int order = buddy_order(page); - if (pfn + (1UL << order) > boundary_pfn) { - /* free page changed before split, check it again */ - if (split_free_page(page, order, boundary_pfn - pfn)) - continue; - } + /* move_freepages_block_isolate() handled this */ + VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); pfn += 1UL << order; continue; } + /* - * migrate compound pages then let the free page handling code - * above do the rest. If migration is not possible, just fail. + * If a compound page is straddling our block, attempt + * to migrate it out of the way. + * + * We don't have to worry about this creating a large + * free page that straddles into our block: gigantic + * pages are freed as order-0 chunks, and LRU pages + * (currently) do not exceed pageblock_order. + * + * The block of interest has already been marked + * MIGRATE_ISOLATE above, so when migration is done it + * will free its pages onto the correct freelists. */ if (PageCompound(page)) { struct page *head = compound_head(page); unsigned long head_pfn = page_to_pfn(head); unsigned long nr_pages = compound_nr(head); - if (head_pfn + nr_pages <= boundary_pfn) { + if (head_pfn + nr_pages <= boundary_pfn || + PageHuge(page)) { pfn = head_pfn + nr_pages; continue; } -#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* - * hugetlb, lru compound (THP), and movable compound pages - * can be migrated. Otherwise, fail the isolation. + * These pages are movable too, but they're + * not expected to exceed pageblock_order. + * + * Let us know when they do, so we can add + * proper free and split handling for them. */ - if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { - int order; - unsigned long outer_pfn; - int page_mt = get_pageblock_migratetype(page); - bool isolate_page = !is_migrate_isolate_page(page); - struct compact_control cc = { - .nr_migratepages = 0, - .order = -1, - .zone = page_zone(pfn_to_page(head_pfn)), - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - .no_set_skip_hint = true, - .gfp_mask = gfp_flags, - .alloc_contig = true, - }; - INIT_LIST_HEAD(&cc.migratepages); - - /* - * XXX: mark the page as MIGRATE_ISOLATE so that - * no one else can grab the freed page after migration. - * Ideally, the page should be freed as two separate - * pages to be added into separate migratetype free - * lists. - */ - if (isolate_page) { - ret = set_migratetype_isolate(page, page_mt, - flags, head_pfn, head_pfn + nr_pages); - if (ret) - goto failed; - } - - ret = __alloc_contig_migrate_range(&cc, head_pfn, - head_pfn + nr_pages, page_mt); + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); - /* - * restore the page's migratetype so that it can - * be split into separate migratetype free lists - * later. 
- */ - if (isolate_page) - unset_migratetype_isolate(page, page_mt); - - if (ret) - goto failed; - /* - * reset pfn to the head of the free page, so - * that the free page handling code above can split - * the free page to the right migratetype list. - * - * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_ORDER, but after it is - * freed, the free page order is not. Use pfn within - * the range to find the head of the free page. - */ - order = 0; - outer_pfn = pfn; - while (!PageBuddy(pfn_to_page(outer_pfn))) { - /* stop if we cannot find the free page */ - if (++order > MAX_ORDER) - goto failed; - outer_pfn &= ~0UL << order; - } - pfn = outer_pfn; - continue; - } else -#endif - goto failed; + goto failed; } pfn++; @@ -541,7 +482,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, + ret = isolate_single_pageblock(isolate_start, flags, false, skip_isolation, migratetype); if (ret) return ret; @@ -550,7 +491,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, + ret = isolate_single_pageblock(isolate_end, flags, true, skip_isolation, migratetype); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
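
For readers following the __rmqueue() changes above: the old retry loop is replaced by an explicit fallback order (preferred freelists, then CMA, then claiming a whole foreign block, then stealing a single foreign page), and rmqueue_bulk() keeps the last successful mode in a local rmqueue_mode so the fallback scan is not repeated for every page while zone->lock is held. The sketch below is a minimal userspace model of that ordering only, assuming toy counters instead of real freelists and ignoring alloc_flags, watermarks and highatomic reserves; the helper names and numbers are invented for illustration and are not kernel interfaces.

/*
 * Minimal userspace model of the RMQUEUE_NORMAL -> RMQUEUE_CMA ->
 * RMQUEUE_CLAIM -> RMQUEUE_STEAL ordering.  Plain counters stand in for
 * freelists; the block size, helper names and numbers are made up for
 * the example and are not kernel interfaces.
 */
#include <stdbool.h>
#include <stdio.h>

enum rmqueue_mode { RMQUEUE_NORMAL, RMQUEUE_CMA, RMQUEUE_CLAIM, RMQUEUE_STEAL };

struct toy_zone {
	long normal;   /* pages on the preferred migratetype's freelists */
	long cma;      /* pages reachable through the CMA fallback */
	long foreign;  /* free pages sitting on other migratetypes */
};

static bool take_one(long *count)
{
	if (!*count)
		return false;
	(*count)--;
	return true;
}

/* Claiming converts a whole block: one page is allocated, the rest
 * replenishes the preferred freelist, loosely like try_to_claim_block(). */
static bool try_claim(struct toy_zone *z)
{
	if (z->foreign < 8)            /* 8 pages stand in for a pageblock */
		return false;
	z->foreign -= 8;
	z->normal += 7;
	return true;
}

/* Same shape as the switch in __rmqueue(): resume from *mode, remember
 * where the page came from, and drop back to normal mode after a claim
 * because the preferred freelist has just been refilled. */
static bool toy_rmqueue(struct toy_zone *z, enum rmqueue_mode *mode)
{
	switch (*mode) {
	case RMQUEUE_NORMAL:
		if (take_one(&z->normal))
			return true;
		/* fall through */
	case RMQUEUE_CMA:
		if (take_one(&z->cma)) {
			*mode = RMQUEUE_CMA;
			return true;
		}
		/* fall through */
	case RMQUEUE_CLAIM:
		if (try_claim(z)) {
			*mode = RMQUEUE_NORMAL;
			return true;
		}
		/* fall through */
	case RMQUEUE_STEAL:
		if (take_one(&z->foreign)) {   /* steal a single foreign page */
			*mode = RMQUEUE_STEAL;
			return true;
		}
	}
	return false;
}

int main(void)
{
	struct toy_zone z = { .normal = 2, .cma = 1, .foreign = 20 };
	enum rmqueue_mode mode = RMQUEUE_NORMAL;   /* as rmqueue_bulk() starts out */
	int allocated = 0;

	while (toy_rmqueue(&z, &mode))
		allocated++;

	printf("allocated %d pages, final mode %d\n", allocated, (int)mode);
	return 0;
}

Running it shows the mode switching to CMA and claim only when the cheaper sources are empty, returning to normal right after a successful claim, and ending up in steal mode once no whole block is left, which is the behaviour the RMQUEUE_* switch is intended to make cheap to repeat from rmqueue_bulk().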