From cfd18ce7a2f5ee1f93f3fc02db26388ea2d4de45 Mon Sep 17 00:00:00 2001
From: Yuan Can
Date: Tue, 5 Sep 2023 20:45:03 +0800
Subject: [PATCH 01/10] mm: hugetlb_vmemmap: fix hugetlb page number decrease
 failed on movable nodes

commit 2eaa6c2abb9dd55041a05c20c451790c124d5cf0 upstream.

Decreasing the number of hugetlb pages failed, with the following message
given:

sh: page allocation failure: order:0, mode:0x204cc0(GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_THISNODE)
CPU: 1 PID: 112 Comm: sh Not tainted 6.5.0-rc7-... #45
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace.part.6+0x84/0xe4
 show_stack+0x18/0x24
 dump_stack_lvl+0x48/0x60
 dump_stack+0x18/0x24
 warn_alloc+0x100/0x1bc
 __alloc_pages_slowpath.constprop.107+0xa40/0xad8
 __alloc_pages+0x244/0x2d0
 hugetlb_vmemmap_restore+0x104/0x1e4
 __update_and_free_hugetlb_folio+0x44/0x1f4
 update_and_free_hugetlb_folio+0x20/0x68
 update_and_free_pages_bulk+0x4c/0xac
 set_max_huge_pages+0x198/0x334
 nr_hugepages_store_common+0x118/0x178
 nr_hugepages_store+0x18/0x24
 kobj_attr_store+0x18/0x2c
 sysfs_kf_write+0x40/0x54
 kernfs_fop_write_iter+0x164/0x1dc
 vfs_write+0x3a8/0x460
 ksys_write+0x6c/0x100
 __arm64_sys_write+0x1c/0x28
 invoke_syscall+0x44/0x100
 el0_svc_common.constprop.1+0x6c/0xe4
 do_el0_svc+0x38/0x94
 el0_svc+0x28/0x74
 el0t_64_sync_handler+0xa0/0xc4
 el0t_64_sync+0x174/0x178
Mem-Info:
...

The reason is that the hugetlb pages being released were allocated from
movable nodes, and with hugetlb_optimize_vmemmap enabled, vmemmap pages
must be allocated from the same node while the hugetlb pages are being
released. With GFP_KERNEL and __GFP_THISNODE set, allocating from a
movable node always fails. Fix this problem by removing __GFP_THISNODE.

Link: https://lkml.kernel.org/r/20230905124503.24899-1-yuancan@huawei.com
Fixes: ad2fa3717b74 ("mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page")
Signed-off-by: Yuan Can
Reviewed-by: Muchun Song
Cc: Kefeng Wang
Cc: Mike Kravetz
Signed-off-by: Andrew Morton
---
 mm/hugetlb_vmemmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4b9734777f69..446e9fc723ec 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -380,7 +380,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                    struct list_head *list)
 {
-       gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
+       gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;
--
Gitee

From 4ce3ca784b810d423f217addad9ba1447fc27088 Mon Sep 17 00:00:00 2001
From: hanliyang
Date: Fri, 25 Apr 2025 17:35:50 +0800
Subject: [PATCH 02/10] Revert "KVM: SEV: Pin SEV guest memory out of CMA
 area"

Upstream: no

Commit 68c126a9564a ("KVM: SEV: Pin SEV guest memory out of CMA area")
has a corresponding upstream version. Revert the out-of-tree change
first; we'll backport the upstream version to this repo.

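For reference, the behavioral difference at stake is the FOLL_LONGTERM
flag; a minimal sketch of the two pinning modes (illustrative only, not
part of this revert):

        /* Short-term pin: the pages may stay in CMA/ZONE_MOVABLE. */
        npinned = pin_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);

        /*
         * Long-term pin: GUP migrates the pages out of
         * MIGRATE_CMA/ZONE_MOVABLE before pinning them.
         */
        npinned = pin_user_pages_fast(uaddr, npages,
                                      FOLL_WRITE | FOLL_LONGTERM, pages);

The upstream version (patch 03 in this series) re-applies the long-term
pin, but only for registered regions that really stay pinned for the
guest's lifetime.
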
Fixes: 68c126a9564a ("KVM: SEV: Pin SEV guest memory out of CMA area")
Signed-off-by: hanliyang
---
 arch/x86/kvm/svm/sev.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 12e88c2d948c..900506775f70 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -516,7 +516,6 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
        unsigned long locked, lock_limit;
        struct page **pages;
        unsigned long first, last;
-       unsigned int flags = 0;
        int ret;

        lockdep_assert_held(&kvm->lock);
@@ -549,10 +548,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
        if (!pages)
                return ERR_PTR(-ENOMEM);

-       flags = write ? FOLL_WRITE : 0;
-
        /* Pin the user virtual address. */
-       npinned = pin_user_pages_fast(uaddr, npages, flags | FOLL_LONGTERM, pages);
+       npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
        if (npinned != npages) {
                pr_err("SEV: Failure locking %lu pages.\n", npages);
                ret = -ENOMEM;
--
Gitee

From d3a78d108d9e69d3f753b12140d1b008b8b8f39d Mon Sep 17 00:00:00 2001
From: Ge Yang
Date: Tue, 11 Feb 2025 10:37:03 +0800
Subject: [PATCH 03/10] KVM: SEV: Use long-term pin when registering encrypted
 memory regions

commit 7e066cb9b71a22c3e5ef233de63ff14525baf6f0 upstream.

When registering an encrypted memory region for SEV-MEM/SEV-ES guests,
pin the pages with FOLL_LONGTERM so that the pages are migrated out of
MIGRATE_CMA/ZONE_MOVABLE. Failure to do so violates the CMA/MOVABLE
mechanisms and can result in fragmentation due to unmovable pages, e.g.
can make CMA allocations fail.

Signed-off-by: Ge Yang
Reviewed-by: Tom Lendacky
Acked-by: David Hildenbrand
Link: https://lore.kernel.org/r/1739241423-14326-1-git-send-email-yangge1116@126.com
[sean: massage changelog, make @flags an unsigned int]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/csv.c |  6 +++---
 arch/x86/kvm/svm/csv.h |  2 +-
 arch/x86/kvm/svm/sev.c | 15 ++++++++-------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/svm/csv.c b/arch/x86/kvm/svm/csv.c
index 25e86a8c4c5e..27031f30271b 100644
--- a/arch/x86/kvm/svm/csv.c
+++ b/arch/x86/kvm/svm/csv.c
@@ -79,7 +79,7 @@ int csv_vm_attestation(struct kvm *kvm, unsigned long gpa, unsigned long len)
        }

        guest_uaddr = gfn_to_hva(kvm, gpa_to_gfn(gpa));
-       pages = hygon_kvm_hooks.sev_pin_memory(kvm, guest_uaddr, len, &n, 1);
+       pages = hygon_kvm_hooks.sev_pin_memory(kvm, guest_uaddr, len, &n, FOLL_WRITE);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

@@ -404,7 +404,7 @@ csv_receive_update_data_to_ringbuf(struct kvm *kvm,
        /* Pin guest memory */
        guest_page = hygon_kvm_hooks.sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
-                                                   PAGE_SIZE, &n, 1);
+                                                   PAGE_SIZE, &n, FOLL_WRITE);
        if (IS_ERR(guest_page)) {
                ret = PTR_ERR(guest_page);
                goto e_free;
        }
@@ -2649,7 +2649,7 @@ static int csv_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (!csv3_guest(kvm) ||
            !(csv->inuse_ext & KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET)) {
                pages = hygon_kvm_hooks.sev_pin_memory(kvm, params.guest_uaddr,
-                                                      params.guest_len, &n, 1);
+                                                      params.guest_len, &n, FOLL_WRITE);
                if (IS_ERR(pages))
                        return PTR_ERR(pages);

diff --git a/arch/x86/kvm/svm/csv.h b/arch/x86/kvm/svm/csv.h
index 9b0563062a94..fca2c43374a8 100644
--- a/arch/x86/kvm/svm/csv.h
+++ b/arch/x86/kvm/svm/csv.h
@@ -61,7 +61,7 @@ extern struct hygon_kvm_hooks_table {
                                        unsigned long npages);
        struct page **(*sev_pin_memory)(struct kvm *kvm, unsigned long uaddr,
                                        unsigned long ulen, unsigned long *n,
-                                       int write);
+                                       unsigned int flags);
        void (*sev_unpin_memory)(struct kvm *kvm, struct page **pages,
                                 unsigned long npages);
        void (*sev_clflush_pages)(struct page *pages[], unsigned long npages);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 900506775f70..f2e1326c09af 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -508,7 +508,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)

 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
                                     unsigned long ulen, unsigned long *n,
-                                    int write)
+                                    unsigned int flags)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        unsigned long npages, size;
@@ -549,7 +549,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
                return ERR_PTR(-ENOMEM);

        /* Pin the user virtual address. */
-       npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+       npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
        if (npinned != npages) {
                pr_err("SEV: Failure locking %lu pages.\n", npages);
                ret = -ENOMEM;
@@ -637,7 +637,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        vaddr_end = vaddr + size;

        /* Lock the user memory. */
-       inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+       inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
        if (IS_ERR(inpages))
                return PTR_ERR(inpages);

@@ -1100,7 +1100,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
                if (IS_ERR(src_p))
                        return PTR_ERR(src_p);

-               dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+               dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
                if (IS_ERR(dst_p)) {
                        sev_unpin_memory(kvm, src_p, n);
                        return PTR_ERR(dst_p);
@@ -1166,7 +1166,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;

-       pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+       pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

@@ -1648,7 +1648,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)

        /* Pin guest memory */
        guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
-                                   PAGE_SIZE, &n, 1);
+                                   PAGE_SIZE, &n, FOLL_WRITE);
        if (IS_ERR(guest_page)) {
                ret = PTR_ERR(guest_page);
                goto e_free_trans;
@@ -2106,7 +2106,8 @@ int sev_mem_enc_register_region(struct kvm *kvm,
                return -ENOMEM;

        mutex_lock(&kvm->lock);
-       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
+                                      FOLL_WRITE | FOLL_LONGTERM);
        if (IS_ERR(region->pages)) {
                ret = PTR_ERR(region->pages);
                mutex_unlock(&kvm->lock);
--
Gitee

From 0a0e7f8ac3ce221370334f934a41003dd1245c1f Mon Sep 17 00:00:00 2001
From: hanliyang
Date: Fri, 25 Apr 2025 17:42:33 +0800
Subject: [PATCH 04/10] Revert "x86/mm: CSV allows CMA allocation
 concurrently"

Upstream: no

Commit 33e23866287f ("x86/mm: CSV allows CMA allocation concurrently")
has a corresponding upstream version. Revert the out-of-tree change
first; we'll backport the upstream version to this repo.

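For context, the out-of-tree API being unwound here (and fully removed in
the next revert) simply let cma_alloc() skip the global cma_mutex for a
given area; a sketch of the removed behavior, taken from the revert that
follows:

        void cma_enable_concurrency(struct cma *cma)
        {
                if (!cma)
                        return;

                /* cma_alloc() skips the global cma_mutex for this area. */
                cma->no_mutex = true;
        }

The upstream replacement (patch 06 in this series) achieves the same
concurrency with a per-CMA alloc_mutex instead of a per-area opt-out
flag.
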
Fixes: 33e23866287f ("x86/mm: CSV allows CMA allocation concurrently")
Signed-off-by: hanliyang
---
 arch/x86/mm/mem_encrypt_hygon.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/mm/mem_encrypt_hygon.c b/arch/x86/mm/mem_encrypt_hygon.c
index 52ec3fa041fe..da42e32f66e0 100644
--- a/arch/x86/mm/mem_encrypt_hygon.c
+++ b/arch/x86/mm/mem_encrypt_hygon.c
@@ -283,7 +283,6 @@ static void __init csv_cma_reserve_mem(void)
                                        1 << CSV_CMA_SHIFT, node);
                        break;
                }
-               cma_enable_concurrency(csv_cma->cma);

                if (start > cma_get_base(csv_cma->cma) || !start)
                        start = cma_get_base(csv_cma->cma);
--
Gitee

From 31ed0344f2ac74a77914c75b8e6becbfe973281c Mon Sep 17 00:00:00 2001
From: hanliyang
Date: Fri, 25 Apr 2025 17:49:13 +0800
Subject: [PATCH 05/10] Revert "mm/cma: add API to enable concurrent
 allocation from the CMA"

Upstream: no

Commit 6757c7150f7d ("mm/cma: add API to enable concurrent allocation
from the CMA") has a corresponding upstream version. Revert the
out-of-tree change first; we'll backport the upstream version to this
repo.

Fixes: 6757c7150f7d ("mm/cma: add API to enable concurrent allocation from the CMA")
Signed-off-by: hanliyang
---
 include/linux/cma.h |  1 -
 mm/cma.c            | 14 ++------------
 mm/cma.h            |  1 -
 3 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 010c89f4b772..18c8d6495f08 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -58,5 +58,4 @@ extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
 extern void cma_reserve_pages_on_error(struct cma *cma);
 extern int __init cma_alloc_areas(unsigned int max_cma_size);
-extern void cma_enable_concurrency(struct cma *cma);
 #endif
diff --git a/mm/cma.c b/mm/cma.c
index 304a4e69180c..5af7642e607b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -492,12 +492,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
                spin_unlock_irq(&cma->lock);

                pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
-               if (!cma->no_mutex)
-                       mutex_lock(&cma_mutex);
+               mutex_lock(&cma_mutex);
                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
                                         GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
-               if (!cma->no_mutex)
-                       mutex_unlock(&cma_mutex);
+               mutex_unlock(&cma_mutex);
                if (ret == 0) {
                        page = pfn_to_page(pfn);
                        break;
@@ -611,11 +609,3 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)

        return 0;
 }
-
-void cma_enable_concurrency(struct cma *cma)
-{
-       if (!cma)
-               return;
-
-       cma->no_mutex = true;
-}
diff --git a/mm/cma.h b/mm/cma.h
index 50275c1d98cc..12aba820969c 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -16,7 +16,6 @@ struct cma {
        unsigned long *bitmap;
        unsigned int order_per_bit; /* Order of pages represented by one bit */
        spinlock_t lock;
-       bool no_mutex;
 #ifdef CONFIG_CMA_DEBUGFS
        struct hlist_head mem_head;
        spinlock_t mem_head_lock;
--
Gitee

From f072c8fdfd35e3141aa97db0a42f52537e1df10b Mon Sep 17 00:00:00 2001
From: Ge Yang
Date: Mon, 10 Feb 2025 09:56:06 +0800
Subject: [PATCH 06/10] mm/cma: using per-CMA locks to improve concurrent
 allocation performance

commit 24ac6fb6e3647fff3646b3ea1811095441380560 upstream.

For different CMAs, concurrent allocation of CMA memory ideally should
not require synchronization using locks. Currently, a global cma_mutex
lock is employed to synchronize all CMA allocations, which can impact the
performance of concurrent allocations across different CMAs.

To test the performance impact, follow these steps:
1. Boot the kernel with the command line argument hugetlb_cma=30G to
   allocate a 30GB CMA area specifically for huge page allocations.
   (Note: on my machine, which has 3 nodes, each node is initialized
   with 10G of CMA.)
2. Use the dd command with parameters if=/dev/zero of=/dev/shm/file
   bs=1G count=30 to fully utilize the CMA area by writing zeroes to a
   file in /dev/shm.
3. Open three terminals and execute the following commands
   simultaneously. (Note: each of these commands attempts to allocate
   10GB [2621440 * 4KB pages] of CMA memory.)
   On Terminal 1: time echo 2621440 > /sys/kernel/debug/cma/hugetlb1/alloc
   On Terminal 2: time echo 2621440 > /sys/kernel/debug/cma/hugetlb2/alloc
   On Terminal 3: time echo 2621440 > /sys/kernel/debug/cma/hugetlb3/alloc

We attempt to allocate pages through the CMA debug interface and use the
time command to measure the duration of each allocation.

Performance comparison:
             Without this patch    With this patch
Terminal1    ~7s                   ~7s
Terminal2    ~14s                  ~8s
Terminal3    ~21s                  ~7s

To solve the problem above, use per-CMA locks to improve concurrent
allocation performance. This allows each CMA to be managed independently,
reducing the need for a global lock and thus improving scalability and
performance.

Link: https://lkml.kernel.org/r/1739152566-744-1-git-send-email-yangge1116@126.com
Signed-off-by: Ge Yang
Reviewed-by: Barry Song
Acked-by: David Hildenbrand
Reviewed-by: Oscar Salvador
Cc: Aisheng Dong
Cc: Baolin Wang
Signed-off-by: Andrew Morton
---
 mm/cma.c | 7 ++++---
 mm/cma.h | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index 5af7642e607b..e523ba62a6f9 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -41,7 +41,6 @@ static unsigned int cma_areas_size = MAX_CMA_AREAS;
 struct cma *cma_areas = cma_areas_data;
 unsigned cma_area_count;
-static DEFINE_MUTEX(cma_mutex);

 phys_addr_t cma_get_base(const struct cma *cma)
 {
@@ -125,6 +124,8 @@ static void __init cma_activate_area(struct cma *cma)

        spin_lock_init(&cma->lock);

+       mutex_init(&cma->alloc_mutex);
+
 #ifdef CONFIG_CMA_DEBUGFS
        INIT_HLIST_HEAD(&cma->mem_head);
        spin_lock_init(&cma->mem_head_lock);
@@ -492,10 +493,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
                spin_unlock_irq(&cma->lock);

                pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
-               mutex_lock(&cma_mutex);
+               mutex_lock(&cma->alloc_mutex);
                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
                                         GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
-               mutex_unlock(&cma_mutex);
+               mutex_unlock(&cma->alloc_mutex);
                if (ret == 0) {
                        page = pfn_to_page(pfn);
                        break;
diff --git a/mm/cma.h b/mm/cma.h
index 12aba820969c..f63cd31123a9 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -16,6 +16,7 @@ struct cma {
        unsigned long *bitmap;
        unsigned int order_per_bit; /* Order of pages represented by one bit */
        spinlock_t lock;
+       struct mutex alloc_mutex;
 #ifdef CONFIG_CMA_DEBUGFS
        struct hlist_head mem_head;
        spinlock_t mem_head_lock;
--
Gitee

From 79bb3548369d070682d1e07291d783d5a36ca6bf Mon Sep 17 00:00:00 2001
From: yangge
Date: Sat, 11 Jan 2025 15:58:20 +0800
Subject: [PATCH 07/10] mm: replace free hugepage folios after migration

commit 04f13d241b8b146b23038bffd907cb8278391d07 upstream.

My machine has 4 NUMA nodes, each equipped with 32GB of memory. I have
configured each NUMA node with 16GB of CMA and 16GB of in-use hugetlb
pages. The allocation of contiguous memory via cma_alloc() can fail
probabilistically.

When there are free hugetlb folios in the hugetlb pool, during the
migration of in-use hugetlb folios, new folios are allocated from the
free hugetlb pool.
After the migration is completed, the old folios are released back to
the free hugetlb pool instead of being returned to the buddy system.
This can cause the test_pages_isolated() check to fail, ultimately
leading to the failure of cma_alloc().

Call trace:

cma_alloc()
    __alloc_contig_migrate_range() // migrate in-use hugepage
    test_pages_isolated()
        __test_page_isolated_in_pageblock()
             PageBuddy(page) // check if the page is in buddy

To address this issue, we introduce a function named
replace_free_hugepage_folios(). This function will replace the hugepage
in the free hugepage pool with a new one and release the old one to the
buddy system. After the migration of in-use hugetlb pages is completed,
we will invoke replace_free_hugepage_folios() to ensure that these
hugepages are properly released to the buddy system. Following this
step, when test_pages_isolated() is executed for inspection, it will
successfully pass.

Additionally, when alloc_contig_range() is used to migrate multiple
in-use hugetlb pages, it can result in some in-use hugetlb pages being
released back to the free hugetlb pool and subsequently being reallocated
and used again. For example:

[huge 0] [huge 1]

To migrate huge 0, we obtain huge x from the pool. After the migration is
completed, we return the now-freed huge 0 back to the pool. When it's
time to migrate huge 1, we can simply reuse the now-freed huge 0 from the
pool. As a result, when replace_free_hugepage_folios() is executed, it
cannot release huge 0 back to the buddy system. To address this issue, we
should prevent the reuse of isolated free hugepages during the migration
process.

Link: https://lkml.kernel.org/r/1734503588-16254-1-git-send-email-yangge1116@126.com
Link: https://lkml.kernel.org/r/1736582300-11364-1-git-send-email-yangge1116@126.com
Signed-off-by: yangge
Cc: Baolin Wang
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h |  7 +++++++
 mm/hugetlb.c            | 42 +++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c         | 12 +++++++++++-
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fc2023d07f69..3f49d02174fc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -745,6 +745,7 @@ struct huge_bootmem_page {
 };

 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1049,6 +1050,12 @@ static inline int isolate_or_dissolve_huge_page(struct page *page,
        return -ENOMEM;
 }

+static inline int replace_free_hugepage_folios(unsigned long start_pfn,
+               unsigned long end_pfn)
+{
+       return 0;
+}
+
 static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c196b754071..cfbba11d6406 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -47,6 +47,7 @@
 #include <linux/page_owner.h>
 #include "internal.h"
 #include "hugetlb_vmemmap.h"
+#include <linux/page-isolation.h>

 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -1341,6 +1342,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
                if (folio_test_hwpoison(folio))
                        continue;

+               if (is_migrate_isolate_page(&folio->page))
+                       continue;
+
                list_move(&folio->lru, &h->hugepage_activelist);
                folio_ref_unfreeze(folio, 1);
                folio_clear_hugetlb_freed(folio);
@@ -3014,6 +3018,44 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
        return ret;
 }

+/*
+ * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
+ * range with new folios.
+ * @start_pfn: start pfn of the given pfn range
+ * @end_pfn: end pfn of the given pfn range
+ * Returns 0 on success, otherwise negated error.
+ */
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
+{
+       struct hstate *h;
+       struct folio *folio;
+       int ret = 0;
+
+       LIST_HEAD(isolate_list);
+
+       while (start_pfn < end_pfn) {
+               folio = pfn_folio(start_pfn);
+               if (folio_test_hugetlb(folio)) {
+                       h = folio_hstate(folio);
+               } else {
+                       start_pfn++;
+                       continue;
+               }
+
+               if (!folio_ref_count(folio)) {
+                       ret = alloc_and_dissolve_hugetlb_folio(h, folio,
+                                                              &isolate_list);
+                       if (ret)
+                               break;
+
+                       putback_movable_pages(&isolate_list);
+               }
+               start_pfn++;
+       }
+
+       return ret;
+}
+
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 {
        struct hstate *h;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 786648d205f1..aa292df1c282 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6430,7 +6430,17 @@ int alloc_contig_range(unsigned long start, unsigned long end,
        ret = __alloc_contig_migrate_range(&cc, start, end);
        if (ret && ret != -EBUSY)
                goto done;
-       ret = 0;
+
+       /*
+        * When in-use hugetlb pages are migrated, they may simply be released
+        * back into the free hugepage pool instead of being returned to the
+        * buddy system. After the migration of in-use huge pages is completed,
+        * we will invoke replace_free_hugepage_folios() to ensure that these
+        * hugepages are properly released to the buddy system.
+        */
+       ret = replace_free_hugepage_folios(start, end);
+       if (ret)
+               goto done;

        /*
         * Pages from [start, end) are within a pageblock_nr_pages
--
Gitee

From 5da37f54612ae6c646ea0d7d8173e3c7219aabdc Mon Sep 17 00:00:00 2001
From: Ge Yang
Date: Wed, 19 Feb 2025 11:46:44 +0800
Subject: [PATCH 08/10] mm/hugetlb: wait for hugetlb folios to be freed

commit 67bab13307c83fb742c2556b06cdc39dbad27f07 upstream.

Since the introduction of commit c77c0a8ac4c52 ("mm/hugetlb: defer
freeing of huge pages if in non-task context"), which supports deferring
the freeing of hugetlb pages, the allocation of contiguous memory through
cma_alloc() may fail probabilistically.

In the CMA allocation process, if it is found that the CMA area is
occupied by in-use hugetlb folios, these in-use hugetlb folios need to be
migrated to another location. When there are no available hugetlb folios
in the free hugetlb pool during the migration of in-use hugetlb folios,
new folios are allocated from the buddy system. A temporary state is set
on the newly allocated folio. Upon completion of the hugetlb folio
migration, the temporary state is transferred from the new folios to the
old folios. Normally, when the old folios with the temporary state are
freed, they are directly released back to the buddy system. However, due
to the deferred freeing of hugetlb pages, the PageBuddy() check fails,
ultimately leading to the failure of cma_alloc().
Here is a simplified call trace illustrating the process:

cma_alloc()
    ->__alloc_contig_migrate_range() // Migrate in-use hugetlb folios
        ->unmap_and_move_huge_page()
            ->folio_putback_hugetlb() // Free old folios
    ->test_pages_isolated()
        ->__test_page_isolated_in_pageblock()
            ->PageBuddy(page) // Check if the page is in buddy

To resolve this issue, we have implemented a function named
wait_for_freed_hugetlb_folios(). This function ensures that the hugetlb
folios are properly released back to the buddy system after their
migration is completed. By invoking wait_for_freed_hugetlb_folios()
before calling PageBuddy(), we ensure that PageBuddy() will succeed.

Link: https://lkml.kernel.org/r/1739936804-18199-1-git-send-email-yangge1116@126.com
Fixes: c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in non-task context")
Signed-off-by: Ge Yang
Reviewed-by: Muchun Song
Acked-by: David Hildenbrand
Cc: Baolin Wang
Cc: Barry Song <21cnbao@gmail.com>
Cc: Oscar Salvador
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h |  5 +++++
 mm/hugetlb.c            |  8 ++++++++
 mm/page_isolation.c     | 10 ++++++++++
 3 files changed, 23 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3f49d02174fc..a3b0ad17c417 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -746,6 +746,7 @@ struct huge_bootmem_page {

 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
+void wait_for_freed_hugetlb_folios(void);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1056,6 +1057,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn,
        return 0;
 }

+static inline void wait_for_freed_hugetlb_folios(void)
+{
+}
+
 static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cfbba11d6406..60d3def1342d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3056,6 +3056,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
        return ret;
 }

+void wait_for_freed_hugetlb_folios(void)
+{
+       if (llist_empty(&hpage_freelist))
+               return;
+
+       flush_work(&free_hpage_work);
+}
+
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 {
        struct hstate *h;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index bcf99ba747a0..b8ae5fa32b1d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -659,6 +659,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
        struct zone *zone;
        int ret;

+       /*
+        * Due to the deferred freeing of hugetlb folios, the hugepage folios
+        * may not be immediately released to the buddy system. This can cause
+        * PageBuddy() to fail in __test_page_isolated_in_pageblock(). To
+        * ensure that the hugetlb folios are properly released back to the
+        * buddy system, we invoke the wait_for_freed_hugetlb_folios() function
+        * to wait for the release to complete.
+        */
+       wait_for_freed_hugetlb_folios();
+
        /*
         * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
         * are not aligned to pageblock_nr_pages.
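
For context, the deferred-freeing path that wait_for_freed_hugetlb_folios()
flushes looks roughly like this (a simplified sketch of the mechanism added
by commit c77c0a8ac4c5; the exact code in the base kernel may differ):

        /* mm/hugetlb.c -- pre-existing deferral infrastructure: */
        static LLIST_HEAD(hpage_freelist);
        static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

        /*
         * Freeing a hugetlb folio from a non-task context is punted to a
         * workqueue, so the folio is not yet PageBuddy() when
         * test_pages_isolated() runs:
         */
        if (!in_task()) {
                if (llist_add((struct llist_node *)&folio->mapping,
                              &hpage_freelist))
                        schedule_work(&free_hpage_work);
                return;
        }

flush_work(&free_hpage_work) therefore guarantees that any pending
deferred frees have reached the buddy allocator before the isolation
check runs.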
--
Gitee

From 37873182de5a40fd540615947c3832224e34a2f6 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:41 +0800
Subject: [PATCH 09/10] mm/compaction: factor out code to test if we should
 run compaction for target order

commit e19a3f595ae47bd8c034b98eb0b28a3877413387 upstream.

We always do the zone_watermark_ok check and the compaction_suitable
check together to test if compaction for the target order should be run.
Factor this code out to remove the repetition.

Link: https://lkml.kernel.org/r/20230901155141.249860-7-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Cc: Mel Gorman
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 66 +++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 27 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 8b889bee2ace..8779571b2e1e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2377,6 +2377,30 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
        return false;
 }

+/*
+ * Should we do compaction for target allocation order.
+ * Return COMPACT_SUCCESS if allocation for target order can be already
+ * satisfied
+ * Return COMPACT_SKIPPED if compaction for target order is likely to fail
+ * Return COMPACT_CONTINUE if compaction for target order should be run
+ */
+static enum compact_result
+compaction_suit_allocation_order(struct zone *zone, unsigned int order,
+                                int highest_zoneidx, unsigned int alloc_flags)
+{
+       unsigned long watermark;
+
+       watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+       if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+                             alloc_flags))
+               return COMPACT_SUCCESS;
+
+       if (!compaction_suitable(zone, order, highest_zoneidx))
+               return COMPACT_SKIPPED;
+
+       return COMPACT_CONTINUE;
+}
+
 static enum compact_result
 compact_zone(struct compact_control *cc, struct capture_control *capc)
 {
@@ -2402,19 +2426,11 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
        cc->migratetype = gfp_migratetype(cc->gfp_mask);

        if (!is_via_compact_memory(cc->order)) {
-               unsigned long watermark;
-
-               /* Allocation can already succeed, nothing to do */
-               watermark = wmark_pages(cc->zone,
-                                       cc->alloc_flags & ALLOC_WMARK_MASK);
-               if (zone_watermark_ok(cc->zone, cc->order, watermark,
-                                     cc->highest_zoneidx, cc->alloc_flags))
-                       return COMPACT_SUCCESS;
-
-               /* Compaction is likely to fail */
-               if (!compaction_suitable(cc->zone, cc->order,
-                                        cc->highest_zoneidx))
-                       return COMPACT_SKIPPED;
+               ret = compaction_suit_allocation_order(cc->zone, cc->order,
+                                                      cc->highest_zoneidx,
+                                                      cc->alloc_flags);
+               if (ret != COMPACT_CONTINUE)
+                       return ret;
        }

        /*
@@ -2908,6 +2924,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
        int zoneid;
        struct zone *zone;
        enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
+       enum compact_result ret;

        for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
                zone = &pgdat->node_zones[zoneid];
@@ -2915,14 +2932,10 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
                if (!populated_zone(zone))
                        continue;

-               /* Allocation can already succeed, check other zones */
-               if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
-                                     min_wmark_pages(zone),
-                                     highest_zoneidx, 0))
-                       continue;
-
-               if (compaction_suitable(zone, pgdat->kcompactd_max_order,
-                                       highest_zoneidx))
+               ret = compaction_suit_allocation_order(zone,
+                               pgdat->kcompactd_max_order,
+                               highest_zoneidx, ALLOC_WMARK_MIN);
+               if (ret == COMPACT_CONTINUE)
                        return true;
        }

@@ -2945,6 +2958,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                .ignore_skip_hint = false,
                .gfp_mask = GFP_KERNEL,
        };
+       enum compact_result ret;
+
        trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
                                                        cc.highest_zoneidx);
        count_compact_event(KCOMPACTD_WAKE);
@@ -2959,12 +2974,9 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                if (compaction_deferred(zone, cc.order))
                        continue;

-               /* Allocation can already succeed, nothing to do */
-               if (zone_watermark_ok(zone, cc.order,
-                                     min_wmark_pages(zone), zoneid, 0))
-                       continue;
-
-               if (!compaction_suitable(zone, cc.order, zoneid))
+               ret = compaction_suit_allocation_order(zone,
+                               cc.order, zoneid, ALLOC_WMARK_MIN);
+               if (ret != COMPACT_CONTINUE)
                        continue;

                if (kthread_should_stop())
--
Gitee

From 2c333276aa97e2e3510a6910f388bf41de61f683 Mon Sep 17 00:00:00 2001
From: yangge
Date: Sat, 25 Jan 2025 14:53:57 +0800
Subject: [PATCH 10/10] mm: compaction: use the proper flag to determine
 watermarks

commit 6268f0a166ebcf5a31577036f4c1e613d5ab4fb1 upstream.

There are 4 NUMA nodes on my machine, and each NUMA node has 32GB of
memory. I have configured 16GB of CMA memory on each NUMA node, and
starting a 32GB virtual machine with device passthrough is extremely
slow, taking almost an hour.

Long-term GUP cannot allocate memory from the CMA area, so at most 16GB
of non-CMA memory on a NUMA node can be used as virtual machine memory.
There is 16GB of free CMA memory on a NUMA node, which is sufficient to
pass the order-0 watermark check, causing the __compaction_suitable()
function to consistently return true.

For costly allocations, if the __compaction_suitable() function always
returns true, it causes the __alloc_pages_slowpath() function to fail to
exit at the appropriate point. This prevents a timely fallback to
allocating memory on other nodes, ultimately resulting in excessively
long virtual machine startup times.

Call trace:
__alloc_pages_slowpath
    if (compact_result == COMPACT_SKIPPED ||
        compact_result == COMPACT_DEFERRED)
        goto nopage; // should exit __alloc_pages_slowpath() from here

We could use the real unmovable allocation context to have
__zone_watermark_unusable_free() subtract CMA pages, and thus we won't
pass the order-0 check anymore once the non-CMA part is exhausted. There
is some risk that in some different scenario the compaction could in fact
migrate pages from the exhausted non-CMA part of the zone to the CMA part
and succeed, and we'll skip it instead. But only __GFP_NORETRY
allocations should be affected in the immediate "goto nopage" when
compaction is skipped, others will attempt with DEF_COMPACT_PRIORITY
anyway and won't fail without trying to compact-migrate the non-CMA
pageblocks into CMA pageblocks first, so it should be fine.

After this fix, it only takes a few tens of seconds to start a 32GB
virtual machine with device passthrough functionality.

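To make the effect concrete, here are hypothetical numbers for one zone
of the machine above (the helper names are real, the figures are
illustrative and not taken from the patch):

        /*
         * Zone state on one node once the non-CMA part is exhausted:
         *   NR_FREE_PAGES     ~ 16GB, all of it in CMA pageblocks
         *   NR_FREE_CMA_PAGES ~ 16GB
         *
         * For an unmovable allocation, alloc_flags lacks ALLOC_CMA, so
         * __zone_watermark_unusable_free() subtracts NR_FREE_CMA_PAGES:
         *   usable free = 16GB - 16GB = 0
         * The order-0 check now fails, compaction returns COMPACT_SKIPPED,
         * and __alloc_pages_slowpath() can fall back to another node.
         */
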
Link: https://lore.kernel.org/lkml/1736335854-548-1-git-send-email-yangge1116@126.com/
Link: https://lkml.kernel.org/r/1737788037-8439-1-git-send-email-yangge1116@126.com
Signed-off-by: yangge
Acked-by: Vlastimil Babka
Reviewed-by: Baolin Wang
Acked-by: Johannes Weiner
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 8779571b2e1e..eb77b1456012 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2386,7 +2386,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
  */
 static enum compact_result
 compaction_suit_allocation_order(struct zone *zone, unsigned int order,
-                                int highest_zoneidx, unsigned int alloc_flags)
+                                int highest_zoneidx, unsigned int alloc_flags,
+                                bool async)
 {
        unsigned long watermark;

@@ -2395,6 +2396,23 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
                              alloc_flags))
                return COMPACT_SUCCESS;

+       /*
+        * For unmovable allocations (without ALLOC_CMA), check if there is enough
+        * free memory in the non-CMA pageblocks. Otherwise compaction could form
+        * the high-order page in CMA pageblocks, which would not help the
+        * allocation to succeed. However, limit the check to costly order async
+        * compaction (such as opportunistic THP attempts) because there is the
+        * possibility that compaction would migrate pages from non-CMA to CMA
+        * pageblock.
+        */
+       if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
+           !(alloc_flags & ALLOC_CMA)) {
+               watermark = low_wmark_pages(zone) + compact_gap(order);
+               if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+                                        0, zone_page_state(zone, NR_FREE_PAGES)))
+                       return COMPACT_SKIPPED;
+       }
+
        if (!compaction_suitable(zone, order, highest_zoneidx))
                return COMPACT_SKIPPED;

@@ -2428,7 +2446,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
        if (!is_via_compact_memory(cc->order)) {
                ret = compaction_suit_allocation_order(cc->zone, cc->order,
                                                       cc->highest_zoneidx,
-                                                      cc->alloc_flags);
+                                                      cc->alloc_flags,
+                                                      cc->mode == MIGRATE_ASYNC);
                if (ret != COMPACT_CONTINUE)
                        return ret;
        }
@@ -2934,7 +2953,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)

                ret = compaction_suit_allocation_order(zone,
                                pgdat->kcompactd_max_order,
-                               highest_zoneidx, ALLOC_WMARK_MIN);
+                               highest_zoneidx, ALLOC_WMARK_MIN,
+                               false);
                if (ret == COMPACT_CONTINUE)
                        return true;
        }
@@ -2975,7 +2995,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                        continue;

                ret = compaction_suit_allocation_order(zone,
-                               cc.order, zoneid, ALLOC_WMARK_MIN);
+                               cc.order, zoneid, ALLOC_WMARK_MIN,
+                               false);
                if (ret != COMPACT_CONTINUE)
                        continue;

--
Gitee
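
Taken together, patches 09 and 10 leave all callers on a single helper; a
sketch of the resulting call pattern in compact_zone(), assembled from the
hunks above:

        ret = compaction_suit_allocation_order(cc->zone, cc->order,
                                               cc->highest_zoneidx,
                                               cc->alloc_flags,
                                               cc->mode == MIGRATE_ASYNC);
        if (ret != COMPACT_CONTINUE)
                return ret;

The two kcompactd callers pass ALLOC_WMARK_MIN and async=false, so the
new non-CMA watermark check only fires for costly-order, asynchronous
direct compaction.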