From cfd18ce7a2f5ee1f93f3fc02db26388ea2d4de45 Mon Sep 17 00:00:00 2001
From: Yuan Can
Date: Tue, 5 Sep 2023 20:45:03 +0800
Subject: [PATCH 01/10] mm: hugetlb_vmemmap: fix hugetlb page number decrease
 failed on movable nodes

commit 2eaa6c2abb9dd55041a05c20c451790c124d5cf0 upstream.

Decreasing the number of hugetlb pages failed, with the following message
given:

sh: page allocation failure: order:0, mode:0x204cc0(GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_THISNODE)
CPU: 1 PID: 112 Comm: sh Not tainted 6.5.0-rc7-... #45
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace.part.6+0x84/0xe4
 show_stack+0x18/0x24
 dump_stack_lvl+0x48/0x60
 dump_stack+0x18/0x24
 warn_alloc+0x100/0x1bc
 __alloc_pages_slowpath.constprop.107+0xa40/0xad8
 __alloc_pages+0x244/0x2d0
 hugetlb_vmemmap_restore+0x104/0x1e4
 __update_and_free_hugetlb_folio+0x44/0x1f4
 update_and_free_hugetlb_folio+0x20/0x68
 update_and_free_pages_bulk+0x4c/0xac
 set_max_huge_pages+0x198/0x334
 nr_hugepages_store_common+0x118/0x178
 nr_hugepages_store+0x18/0x24
 kobj_attr_store+0x18/0x2c
 sysfs_kf_write+0x40/0x54
 kernfs_fop_write_iter+0x164/0x1dc
 vfs_write+0x3a8/0x460
 ksys_write+0x6c/0x100
 __arm64_sys_write+0x1c/0x28
 invoke_syscall+0x44/0x100
 el0_svc_common.constprop.1+0x6c/0xe4
 do_el0_svc+0x38/0x94
 el0_svc+0x28/0x74
 el0t_64_sync_handler+0xa0/0xc4
 el0t_64_sync+0x174/0x178
Mem-Info:
...

The reason is that the hugetlb pages being released were allocated from
movable nodes, and with hugetlb_optimize_vmemmap enabled, vmemmap pages
must be allocated from the same node while the hugetlb pages are being
released. With GFP_KERNEL and __GFP_THISNODE set, allocating from a
movable node always fails. Fix this problem by removing __GFP_THISNODE.

Link: https://lkml.kernel.org/r/20230905124503.24899-1-yuancan@huawei.com
Fixes: ad2fa3717b74 ("mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page")
Signed-off-by: Yuan Can
Reviewed-by: Muchun Song
Cc: Kefeng Wang
Cc: Mike Kravetz
Signed-off-by: Andrew Morton
---
 mm/hugetlb_vmemmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4b9734777f69..446e9fc723ec 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -380,7 +380,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                    struct list_head *list)
 {
-       gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
+       gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;
--
Gitee

From 4ce3ca784b810d423f217addad9ba1447fc27088 Mon Sep 17 00:00:00 2001
From: hanliyang
Date: Fri, 25 Apr 2025 17:35:50 +0800
Subject: [PATCH 02/10] Revert "KVM: SEV: Pin SEV guest memory out of CMA
 area"

Upstream: no

Commit 68c126a9564a ("KVM: SEV: Pin SEV guest memory out of CMA area")
has a corresponding upstream version. Revert the out-of-tree change
first; we'll backport the upstream version to this repo.

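For reference, the behavioral difference at stake is the FOLL_LONGTERM
flag; a minimal sketch of the two pinning modes (illustrative only, not
part of this revert):

        /* Short-term pin: the pages may stay in CMA/ZONE_MOVABLE. */
        npinned = pin_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);

        /*
         * Long-term pin: GUP migrates the pages out of
         * MIGRATE_CMA/ZONE_MOVABLE before pinning them.
         */
        npinned = pin_user_pages_fast(uaddr, npages,
                                      FOLL_WRITE | FOLL_LONGTERM, pages);

The upstream version (patch 03 in this series) re-applies the long-term
pin, but only for registered regions that really stay pinned for the
guest's lifetime.
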
Fixes: 68c126a9564a ("KVM: SEV: Pin SEV guest memory out of CMA area")
Signed-off-by: hanliyang
---
 arch/x86/kvm/svm/sev.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 12e88c2d948c..900506775f70 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -516,7 +516,6 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
        unsigned long locked, lock_limit;
        struct page **pages;
        unsigned long first, last;
-       unsigned int flags = 0;
        int ret;

        lockdep_assert_held(&kvm->lock);
@@ -549,10 +548,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
        if (!pages)
                return ERR_PTR(-ENOMEM);

-       flags = write ? FOLL_WRITE : 0;
-
        /* Pin the user virtual address. */
-       npinned = pin_user_pages_fast(uaddr, npages, flags | FOLL_LONGTERM, pages);
+       npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
        if (npinned != npages) {
                pr_err("SEV: Failure locking %lu pages.\n", npages);
                ret = -ENOMEM;
--
Gitee

From d3a78d108d9e69d3f753b12140d1b008b8b8f39d Mon Sep 17 00:00:00 2001
From: Ge Yang
Date: Tue, 11 Feb 2025 10:37:03 +0800
Subject: [PATCH 03/10] KVM: SEV: Use long-term pin when registering encrypted
 memory regions

commit 7e066cb9b71a22c3e5ef233de63ff14525baf6f0 upstream.

When registering an encrypted memory region for SEV-MEM/SEV-ES guests,
pin the pages with FOLL_LONGTERM so that the pages are migrated out of
MIGRATE_CMA/ZONE_MOVABLE. Failure to do so violates the CMA/MOVABLE
mechanisms and can result in fragmentation due to unmovable pages, e.g.
can make CMA allocations fail.

Signed-off-by: Ge Yang
Reviewed-by: Tom Lendacky
Acked-by: David Hildenbrand
Link: https://lore.kernel.org/r/1739241423-14326-1-git-send-email-yangge1116@126.com
[sean: massage changelog, make @flags an unsigned int]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/csv.c |  6 +++---
 arch/x86/kvm/svm/csv.h |  2 +-
 arch/x86/kvm/svm/sev.c | 15 ++++++++-------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/svm/csv.c b/arch/x86/kvm/svm/csv.c
index 25e86a8c4c5e..27031f30271b 100644
--- a/arch/x86/kvm/svm/csv.c
+++ b/arch/x86/kvm/svm/csv.c
@@ -79,7 +79,7 @@ int csv_vm_attestation(struct kvm *kvm, unsigned long gpa, unsigned long len)
        }

        guest_uaddr = gfn_to_hva(kvm, gpa_to_gfn(gpa));
-       pages = hygon_kvm_hooks.sev_pin_memory(kvm, guest_uaddr, len, &n, 1);
+       pages = hygon_kvm_hooks.sev_pin_memory(kvm, guest_uaddr, len, &n, FOLL_WRITE);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

@@ -404,7 +404,7 @@ csv_receive_update_data_to_ringbuf(struct kvm *kvm,
        /* Pin guest memory */
        guest_page = hygon_kvm_hooks.sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
-                                                   PAGE_SIZE, &n, 1);
+                                                   PAGE_SIZE, &n, FOLL_WRITE);
        if (IS_ERR(guest_page)) {
                ret = PTR_ERR(guest_page);
                goto e_free;
        }
@@ -2649,7 +2649,7 @@ static int csv_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (!csv3_guest(kvm) ||
            !(csv->inuse_ext & KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET)) {
                pages = hygon_kvm_hooks.sev_pin_memory(kvm, params.guest_uaddr,
-                                                      params.guest_len, &n, 1);
+                                                      params.guest_len, &n, FOLL_WRITE);
                if (IS_ERR(pages))
                        return PTR_ERR(pages);

diff --git a/arch/x86/kvm/svm/csv.h b/arch/x86/kvm/svm/csv.h
index 9b0563062a94..fca2c43374a8 100644
--- a/arch/x86/kvm/svm/csv.h
+++ b/arch/x86/kvm/svm/csv.h
@@ -61,7 +61,7 @@ extern struct hygon_kvm_hooks_table {
                                        unsigned long npages);
        struct page **(*sev_pin_memory)(struct kvm *kvm, unsigned long uaddr,
                                        unsigned long ulen, unsigned long *n,
-                                       int write);
+                                       unsigned int flags);
        void (*sev_unpin_memory)(struct kvm *kvm, struct page **pages,
                                 unsigned long npages);
        void (*sev_clflush_pages)(struct page *pages[], unsigned long npages);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 900506775f70..f2e1326c09af 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -508,7 +508,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)

 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
                                     unsigned long ulen, unsigned long *n,
-                                    int write)
+                                    unsigned int flags)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        unsigned long npages, size;
@@ -549,7 +549,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
                return ERR_PTR(-ENOMEM);

        /* Pin the user virtual address. */
-       npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+       npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
        if (npinned != npages) {
                pr_err("SEV: Failure locking %lu pages.\n", npages);
                ret = -ENOMEM;
@@ -637,7 +637,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        vaddr_end = vaddr + size;

        /* Lock the user memory. */
-       inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+       inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
        if (IS_ERR(inpages))
                return PTR_ERR(inpages);

@@ -1100,7 +1100,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
                if (IS_ERR(src_p))
                        return PTR_ERR(src_p);

-               dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+               dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
                if (IS_ERR(dst_p)) {
                        sev_unpin_memory(kvm, src_p, n);
                        return PTR_ERR(dst_p);
@@ -1166,7 +1166,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;

-       pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+       pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

@@ -1648,7 +1648,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)

        /* Pin guest memory */
        guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
-                                   PAGE_SIZE, &n, 1);
+                                   PAGE_SIZE, &n, FOLL_WRITE);
        if (IS_ERR(guest_page)) {
                ret = PTR_ERR(guest_page);
                goto e_free_trans;
@@ -2106,7 +2106,8 @@ int sev_mem_enc_register_region(struct kvm *kvm,
                return -ENOMEM;

        mutex_lock(&kvm->lock);
-       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
+                                      FOLL_WRITE | FOLL_LONGTERM);
        if (IS_ERR(region->pages)) {
                ret = PTR_ERR(region->pages);
                mutex_unlock(&kvm->lock);
--
Gitee

From 0a0e7f8ac3ce221370334f934a41003dd1245c1f Mon Sep 17 00:00:00 2001
From: hanliyang
Date: Fri, 25 Apr 2025 17:42:33 +0800
Subject: [PATCH 04/10] Revert "x86/mm: CSV allows CMA allocation
 concurrently"

Upstream: no

Commit 33e23866287f ("x86/mm: CSV allows CMA allocation concurrently")
has a corresponding upstream version. Revert the out-of-tree change
first; we'll backport the upstream version to this repo.

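For context, the out-of-tree API being unwound here (and fully removed in
the next revert) simply let cma_alloc() skip the global cma_mutex for a
given area; a sketch of the removed behavior, taken from the revert that
follows:

        void cma_enable_concurrency(struct cma *cma)
        {
                if (!cma)
                        return;

                /* cma_alloc() skips the global cma_mutex for this area. */
                cma->no_mutex = true;
        }

The upstream replacement (patch 06 in this series) achieves the same
concurrency with a per-CMA alloc_mutex instead of a per-area opt-out
flag.
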
Fixes: 33e23866287f ("x86/mm: CSV allows CMA allocation concurrently")
Signed-off-by: hanliyang
---
 arch/x86/mm/mem_encrypt_hygon.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/mm/mem_encrypt_hygon.c b/arch/x86/mm/mem_encrypt_hygon.c
index 52ec3fa041fe..da42e32f66e0 100644
--- a/arch/x86/mm/mem_encrypt_hygon.c
+++ b/arch/x86/mm/mem_encrypt_hygon.c
@@ -283,7 +283,6 @@ static void __init csv_cma_reserve_mem(void)
                                        1 << CSV_CMA_SHIFT, node);
                        break;
                }
-               cma_enable_concurrency(csv_cma->cma);

                if (start > cma_get_base(csv_cma->cma) || !start)
                        start = cma_get_base(csv_cma->cma);
--
Gitee

From 31ed0344f2ac74a77914c75b8e6becbfe973281c Mon Sep 17 00:00:00 2001
From: hanliyang
Date: Fri, 25 Apr 2025 17:49:13 +0800
Subject: [PATCH 05/10] Revert "mm/cma: add API to enable concurrent
 allocation from the CMA"

Upstream: no

Commit 6757c7150f7d ("mm/cma: add API to enable concurrent allocation
from the CMA") has a corresponding upstream version. Revert the
out-of-tree change first; we'll backport the upstream version to this
repo.

Fixes: 6757c7150f7d ("mm/cma: add API to enable concurrent allocation from the CMA")
Signed-off-by: hanliyang
---
 include/linux/cma.h |  1 -
 mm/cma.c            | 14 ++------------
 mm/cma.h            |  1 -
 3 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 010c89f4b772..18c8d6495f08 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -58,5 +58,4 @@ extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
 extern void cma_reserve_pages_on_error(struct cma *cma);
 extern int __init cma_alloc_areas(unsigned int max_cma_size);
-extern void cma_enable_concurrency(struct cma *cma);
 #endif
diff --git a/mm/cma.c b/mm/cma.c
index 304a4e69180c..5af7642e607b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -492,12 +492,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
                spin_unlock_irq(&cma->lock);

                pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
-               if (!cma->no_mutex)
-                       mutex_lock(&cma_mutex);
+               mutex_lock(&cma_mutex);
                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
                                         GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
-               if (!cma->no_mutex)
-                       mutex_unlock(&cma_mutex);
+               mutex_unlock(&cma_mutex);
                if (ret == 0) {
                        page = pfn_to_page(pfn);
                        break;
@@ -611,11 +609,3 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)

        return 0;
 }
-
-void cma_enable_concurrency(struct cma *cma)
-{
-       if (!cma)
-               return;
-
-       cma->no_mutex = true;
-}
diff --git a/mm/cma.h b/mm/cma.h
index 50275c1d98cc..12aba820969c 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -16,7 +16,6 @@ struct cma {
        unsigned long *bitmap;
        unsigned int order_per_bit; /* Order of pages represented by one bit */
        spinlock_t lock;
-       bool no_mutex;
 #ifdef CONFIG_CMA_DEBUGFS
        struct hlist_head mem_head;
        spinlock_t mem_head_lock;
--
Gitee

From f072c8fdfd35e3141aa97db0a42f52537e1df10b Mon Sep 17 00:00:00 2001
From: Ge Yang
Date: Mon, 10 Feb 2025 09:56:06 +0800
Subject: [PATCH 06/10] mm/cma: using per-CMA locks to improve concurrent
 allocation performance

commit 24ac6fb6e3647fff3646b3ea1811095441380560 upstream.

For different CMAs, concurrent allocation of CMA memory ideally should
not require synchronization using locks. Currently, a global cma_mutex
lock is employed to synchronize all CMA allocations, which can impact the
performance of concurrent allocations across different CMAs.

To test the performance impact, follow these steps:
1. Boot the kernel with the command line argument hugetlb_cma=30G to
   allocate a 30GB CMA area specifically for huge page allocations.
   (Note: on my machine, which has 3 nodes, each node is initialized
   with 10G of CMA.)
2. Use the dd command with parameters if=/dev/zero of=/dev/shm/file
   bs=1G count=30 to fully utilize the CMA area by writing zeroes to a
   file in /dev/shm.
3. Open three terminals and execute the following commands
   simultaneously. (Note: each of these commands attempts to allocate
   10GB [2621440 * 4KB pages] of CMA memory.)
   On Terminal 1: time echo 2621440 > /sys/kernel/debug/cma/hugetlb1/alloc
   On Terminal 2: time echo 2621440 > /sys/kernel/debug/cma/hugetlb2/alloc
   On Terminal 3: time echo 2621440 > /sys/kernel/debug/cma/hugetlb3/alloc

We attempt to allocate pages through the CMA debug interface and use the
time command to measure the duration of each allocation.

Performance comparison:
             Without this patch    With this patch
Terminal1    ~7s                   ~7s
Terminal2    ~14s                  ~8s
Terminal3    ~21s                  ~7s

To solve the problem above, use per-CMA locks to improve concurrent
allocation performance. This allows each CMA to be managed independently,
reducing the need for a global lock and thus improving scalability and
performance.

Link: https://lkml.kernel.org/r/1739152566-744-1-git-send-email-yangge1116@126.com
Signed-off-by: Ge Yang
Reviewed-by: Barry Song
Acked-by: David Hildenbrand
Reviewed-by: Oscar Salvador
Cc: Aisheng Dong
Cc: Baolin Wang
Signed-off-by: Andrew Morton
---
 mm/cma.c | 7 ++++---
 mm/cma.h | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index 5af7642e607b..e523ba62a6f9 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -41,7 +41,6 @@ static unsigned int cma_areas_size = MAX_CMA_AREAS;
 struct cma *cma_areas = cma_areas_data;
 unsigned cma_area_count;
-static DEFINE_MUTEX(cma_mutex);

 phys_addr_t cma_get_base(const struct cma *cma)
 {
@@ -125,6 +124,8 @@ static void __init cma_activate_area(struct cma *cma)

        spin_lock_init(&cma->lock);

+       mutex_init(&cma->alloc_mutex);
+
 #ifdef CONFIG_CMA_DEBUGFS
        INIT_HLIST_HEAD(&cma->mem_head);
        spin_lock_init(&cma->mem_head_lock);
@@ -492,10 +493,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
                spin_unlock_irq(&cma->lock);

                pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
-               mutex_lock(&cma_mutex);
+               mutex_lock(&cma->alloc_mutex);
                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
                                         GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
-               mutex_unlock(&cma_mutex);
+               mutex_unlock(&cma->alloc_mutex);
                if (ret == 0) {
                        page = pfn_to_page(pfn);
                        break;
diff --git a/mm/cma.h b/mm/cma.h
index 12aba820969c..f63cd31123a9 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -16,6 +16,7 @@ struct cma {
        unsigned long *bitmap;
        unsigned int order_per_bit; /* Order of pages represented by one bit */
        spinlock_t lock;
+       struct mutex alloc_mutex;
 #ifdef CONFIG_CMA_DEBUGFS
        struct hlist_head mem_head;
        spinlock_t mem_head_lock;
--
Gitee

From 79bb3548369d070682d1e07291d783d5a36ca6bf Mon Sep 17 00:00:00 2001
From: yangge
Date: Sat, 11 Jan 2025 15:58:20 +0800
Subject: [PATCH 07/10] mm: replace free hugepage folios after migration

commit 04f13d241b8b146b23038bffd907cb8278391d07 upstream.

My machine has 4 NUMA nodes, each equipped with 32GB of memory. I have
configured each NUMA node with 16GB of CMA and 16GB of in-use hugetlb
pages. The allocation of contiguous memory via cma_alloc() can fail
probabilistically.

When there are free hugetlb folios in the hugetlb pool, during the
migration of in-use hugetlb folios, new folios are allocated from the
free hugetlb pool.
After the migration is completed, the old folios are released back to
the free hugetlb pool instead of being returned to the buddy system.
This can cause the test_pages_isolated() check to fail, ultimately
leading to the failure of cma_alloc().

Call trace:

cma_alloc()
    __alloc_contig_migrate_range() // migrate in-use hugepage
    test_pages_isolated()
        __test_page_isolated_in_pageblock()
             PageBuddy(page) // check if the page is in buddy

To address this issue, we introduce a function named
replace_free_hugepage_folios(). This function will replace the hugepage
in the free hugepage pool with a new one and release the old one to the
buddy system. After the migration of in-use hugetlb pages is completed,
we will invoke replace_free_hugepage_folios() to ensure that these
hugepages are properly released to the buddy system. Following this
step, when test_pages_isolated() is executed for inspection, it will
successfully pass.

Additionally, when alloc_contig_range() is used to migrate multiple
in-use hugetlb pages, it can result in some in-use hugetlb pages being
released back to the free hugetlb pool and subsequently being reallocated
and used again. For example:

[huge 0] [huge 1]

To migrate huge 0, we obtain huge x from the pool. After the migration is
completed, we return the now-freed huge 0 back to the pool. When it's
time to migrate huge 1, we can simply reuse the now-freed huge 0 from the
pool. As a result, when replace_free_hugepage_folios() is executed, it
cannot release huge 0 back to the buddy system. To address this issue, we
should prevent the reuse of isolated free hugepages during the migration
process.

Link: https://lkml.kernel.org/r/1734503588-16254-1-git-send-email-yangge1116@126.com
Link: https://lkml.kernel.org/r/1736582300-11364-1-git-send-email-yangge1116@126.com
Signed-off-by: yangge
Cc: Baolin Wang
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h |  7 +++++++
 mm/hugetlb.c            | 42 +++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c         | 12 +++++++++++-
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fc2023d07f69..3f49d02174fc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -745,6 +745,7 @@ struct huge_bootmem_page {
 };

 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1049,6 +1050,12 @@ static inline int isolate_or_dissolve_huge_page(struct page *page,
        return -ENOMEM;
 }

+static inline int replace_free_hugepage_folios(unsigned long start_pfn,
+               unsigned long end_pfn)
+{
+       return 0;
+}
+
 static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c196b754071..cfbba11d6406 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -47,6 +47,7 @@
 #include <linux/page_owner.h>
 #include "internal.h"
 #include "hugetlb_vmemmap.h"
+#include <linux/page-isolation.h>

 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -1341,6 +1342,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
                if (folio_test_hwpoison(folio))
                        continue;

+               if (is_migrate_isolate_page(&folio->page))
+                       continue;
+
                list_move(&folio->lru, &h->hugepage_activelist);
                folio_ref_unfreeze(folio, 1);
                folio_clear_hugetlb_freed(folio);
@@ -3014,6 +3018,44 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
        return ret;
 }

+/*
+ * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
+ * range with new folios.
+ * @start_pfn: start pfn of the given pfn range
+ * @end_pfn: end pfn of the given pfn range
+ * Returns 0 on success, otherwise negated error.
+ */
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
+{
+       struct hstate *h;
+       struct folio *folio;
+       int ret = 0;
+
+       LIST_HEAD(isolate_list);
+
+       while (start_pfn < end_pfn) {
+               folio = pfn_folio(start_pfn);
+               if (folio_test_hugetlb(folio)) {
+                       h = folio_hstate(folio);
+               } else {
+                       start_pfn++;
+                       continue;
+               }
+
+               if (!folio_ref_count(folio)) {
+                       ret = alloc_and_dissolve_hugetlb_folio(h, folio,
+                                                              &isolate_list);
+                       if (ret)
+                               break;
+
+                       putback_movable_pages(&isolate_list);
+               }
+               start_pfn++;
+       }
+
+       return ret;
+}
+
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 {
        struct hstate *h;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 786648d205f1..aa292df1c282 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6430,7 +6430,17 @@ int alloc_contig_range(unsigned long start, unsigned long end,
        ret = __alloc_contig_migrate_range(&cc, start, end);
        if (ret && ret != -EBUSY)
                goto done;
-       ret = 0;
+
+       /*
+        * When in-use hugetlb pages are migrated, they may simply be released
+        * back into the free hugepage pool instead of being returned to the
+        * buddy system. After the migration of in-use huge pages is completed,
+        * we will invoke replace_free_hugepage_folios() to ensure that these
+        * hugepages are properly released to the buddy system.
+        */
+       ret = replace_free_hugepage_folios(start, end);
+       if (ret)
+               goto done;

        /*
         * Pages from [start, end) are within a pageblock_nr_pages
--
Gitee

From 5da37f54612ae6c646ea0d7d8173e3c7219aabdc Mon Sep 17 00:00:00 2001
From: Ge Yang
Date: Wed, 19 Feb 2025 11:46:44 +0800
Subject: [PATCH 08/10] mm/hugetlb: wait for hugetlb folios to be freed

commit 67bab13307c83fb742c2556b06cdc39dbad27f07 upstream.

Since the introduction of commit c77c0a8ac4c52 ("mm/hugetlb: defer
freeing of huge pages if in non-task context"), which supports deferring
the freeing of hugetlb pages, the allocation of contiguous memory through
cma_alloc() may fail probabilistically.

In the CMA allocation process, if it is found that the CMA area is
occupied by in-use hugetlb folios, these in-use hugetlb folios need to be
migrated to another location. When there are no available hugetlb folios
in the free hugetlb pool during the migration of in-use hugetlb folios,
new folios are allocated from the buddy system. A temporary state is set
on the newly allocated folio. Upon completion of the hugetlb folio
migration, the temporary state is transferred from the new folios to the
old folios. Normally, when the old folios with the temporary state are
freed, they are directly released back to the buddy system. However, due
to the deferred freeing of hugetlb pages, the PageBuddy() check fails,
ultimately leading to the failure of cma_alloc().
Here is a simplified call trace illustrating the process:

cma_alloc()
    ->__alloc_contig_migrate_range() // Migrate in-use hugetlb folios
        ->unmap_and_move_huge_page()
            ->folio_putback_hugetlb() // Free old folios
    ->test_pages_isolated()
        ->__test_page_isolated_in_pageblock()
            ->PageBuddy(page) // Check if the page is in buddy

To resolve this issue, we have implemented a function named
wait_for_freed_hugetlb_folios(). This function ensures that the hugetlb
folios are properly released back to the buddy system after their
migration is completed. By invoking wait_for_freed_hugetlb_folios()
before calling PageBuddy(), we ensure that PageBuddy() will succeed.

Link: https://lkml.kernel.org/r/1739936804-18199-1-git-send-email-yangge1116@126.com
Fixes: c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in non-task context")
Signed-off-by: Ge Yang
Reviewed-by: Muchun Song
Acked-by: David Hildenbrand
Cc: Baolin Wang
Cc: Barry Song <21cnbao@gmail.com>
Cc: Oscar Salvador
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h |  5 +++++
 mm/hugetlb.c            |  8 ++++++++
 mm/page_isolation.c     | 10 ++++++++++
 3 files changed, 23 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3f49d02174fc..a3b0ad17c417 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -746,6 +746,7 @@ struct huge_bootmem_page {

 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
+void wait_for_freed_hugetlb_folios(void);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1056,6 +1057,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn,
        return 0;
 }

+static inline void wait_for_freed_hugetlb_folios(void)
+{
+}
+
 static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cfbba11d6406..60d3def1342d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3056,6 +3056,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
        return ret;
 }

+void wait_for_freed_hugetlb_folios(void)
+{
+       if (llist_empty(&hpage_freelist))
+               return;
+
+       flush_work(&free_hpage_work);
+}
+
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 {
        struct hstate *h;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index bcf99ba747a0..b8ae5fa32b1d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -659,6 +659,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
        struct zone *zone;
        int ret;

+       /*
+        * Due to the deferred freeing of hugetlb folios, the hugepage folios
+        * may not be immediately released to the buddy system. This can cause
+        * PageBuddy() to fail in __test_page_isolated_in_pageblock(). To
+        * ensure that the hugetlb folios are properly released back to the
+        * buddy system, we invoke the wait_for_freed_hugetlb_folios() function
+        * to wait for the release to complete.
+        */
+       wait_for_freed_hugetlb_folios();
+
        /*
         * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
         * are not aligned to pageblock_nr_pages.
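
For context, the deferred-freeing path that wait_for_freed_hugetlb_folios()
flushes looks roughly like this (a simplified sketch of the mechanism added
by commit c77c0a8ac4c5; the exact code in the base kernel may differ):

        /* mm/hugetlb.c -- pre-existing deferral infrastructure: */
        static LLIST_HEAD(hpage_freelist);
        static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

        /*
         * Freeing a hugetlb folio from a non-task context is punted to a
         * workqueue, so the folio is not yet PageBuddy() when
         * test_pages_isolated() runs:
         */
        if (!in_task()) {
                if (llist_add((struct llist_node *)&folio->mapping,
                              &hpage_freelist))
                        schedule_work(&free_hpage_work);
                return;
        }

flush_work(&free_hpage_work) therefore guarantees that any pending
deferred frees have reached the buddy allocator before the isolation
check runs.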
--
Gitee

From 37873182de5a40fd540615947c3832224e34a2f6 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:41 +0800
Subject: [PATCH 09/10] mm/compaction: factor out code to test if we should
 run compaction for target order

commit e19a3f595ae47bd8c034b98eb0b28a3877413387 upstream.

We always do the zone_watermark_ok check and the compaction_suitable
check together to test if compaction for the target order should be run.
Factor this code out to remove the repetition.

Link: https://lkml.kernel.org/r/20230901155141.249860-7-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Cc: Mel Gorman
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 66 +++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 27 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 8b889bee2ace..8779571b2e1e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2377,6 +2377,30 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
        return false;
 }

+/*
+ * Should we do compaction for target allocation order.
+ * Return COMPACT_SUCCESS if allocation for target order can be already
+ * satisfied
+ * Return COMPACT_SKIPPED if compaction for target order is likely to fail
+ * Return COMPACT_CONTINUE if compaction for target order should be run
+ */
+static enum compact_result
+compaction_suit_allocation_order(struct zone *zone, unsigned int order,
+                                int highest_zoneidx, unsigned int alloc_flags)
+{
+       unsigned long watermark;
+
+       watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+       if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+                             alloc_flags))
+               return COMPACT_SUCCESS;
+
+       if (!compaction_suitable(zone, order, highest_zoneidx))
+               return COMPACT_SKIPPED;
+
+       return COMPACT_CONTINUE;
+}
+
 static enum compact_result
 compact_zone(struct compact_control *cc, struct capture_control *capc)
 {
@@ -2402,19 +2426,11 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
        cc->migratetype = gfp_migratetype(cc->gfp_mask);

        if (!is_via_compact_memory(cc->order)) {
-               unsigned long watermark;
-
-               /* Allocation can already succeed, nothing to do */
-               watermark = wmark_pages(cc->zone,
-                                       cc->alloc_flags & ALLOC_WMARK_MASK);
-               if (zone_watermark_ok(cc->zone, cc->order, watermark,
-                                     cc->highest_zoneidx, cc->alloc_flags))
-                       return COMPACT_SUCCESS;
-
-               /* Compaction is likely to fail */
-               if (!compaction_suitable(cc->zone, cc->order,
-                                        cc->highest_zoneidx))
-                       return COMPACT_SKIPPED;
+               ret = compaction_suit_allocation_order(cc->zone, cc->order,
+                                                      cc->highest_zoneidx,
+                                                      cc->alloc_flags);
+               if (ret != COMPACT_CONTINUE)
+                       return ret;
        }

        /*
@@ -2908,6 +2924,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
        int zoneid;
        struct zone *zone;
        enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
+       enum compact_result ret;

        for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
                zone = &pgdat->node_zones[zoneid];
@@ -2915,14 +2932,10 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
                if (!populated_zone(zone))
                        continue;

-               /* Allocation can already succeed, check other zones */
-               if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
-                                     min_wmark_pages(zone),
-                                     highest_zoneidx, 0))
-                       continue;
-
-               if (compaction_suitable(zone, pgdat->kcompactd_max_order,
-                                       highest_zoneidx))
+               ret = compaction_suit_allocation_order(zone,
+                               pgdat->kcompactd_max_order,
+                               highest_zoneidx, ALLOC_WMARK_MIN);
+               if (ret == COMPACT_CONTINUE)
                        return true;
        }

@@ -2945,6 +2958,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                .ignore_skip_hint = false,
                .gfp_mask = GFP_KERNEL,
        };
+       enum compact_result ret;
+
        trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
                                                        cc.highest_zoneidx);
        count_compact_event(KCOMPACTD_WAKE);
@@ -2959,12 +2974,9 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                if (compaction_deferred(zone, cc.order))
                        continue;

-               /* Allocation can already succeed, nothing to do */
-               if (zone_watermark_ok(zone, cc.order,
-                                     min_wmark_pages(zone), zoneid, 0))
-                       continue;
-
-               if (!compaction_suitable(zone, cc.order, zoneid))
+               ret = compaction_suit_allocation_order(zone,
+                               cc.order, zoneid, ALLOC_WMARK_MIN);
+               if (ret != COMPACT_CONTINUE)
                        continue;

                if (kthread_should_stop())
--
Gitee

From 2c333276aa97e2e3510a6910f388bf41de61f683 Mon Sep 17 00:00:00 2001
From: yangge
Date: Sat, 25 Jan 2025 14:53:57 +0800
Subject: [PATCH 10/10] mm: compaction: use the proper flag to determine
 watermarks

commit 6268f0a166ebcf5a31577036f4c1e613d5ab4fb1 upstream.

There are 4 NUMA nodes on my machine, and each NUMA node has 32GB of
memory. I have configured 16GB of CMA memory on each NUMA node, and
starting a 32GB virtual machine with device passthrough is extremely
slow, taking almost an hour.

Long-term GUP cannot allocate memory from the CMA area, so at most 16GB
of non-CMA memory on a NUMA node can be used as virtual machine memory.
There is 16GB of free CMA memory on a NUMA node, which is sufficient to
pass the order-0 watermark check, causing the __compaction_suitable()
function to consistently return true.

For costly allocations, if the __compaction_suitable() function always
returns true, it causes the __alloc_pages_slowpath() function to fail to
exit at the appropriate point. This prevents a timely fallback to
allocating memory on other nodes, ultimately resulting in excessively
long virtual machine startup times.

Call trace:
__alloc_pages_slowpath
    if (compact_result == COMPACT_SKIPPED ||
        compact_result == COMPACT_DEFERRED)
        goto nopage; // should exit __alloc_pages_slowpath() from here

We could use the real unmovable allocation context to have
__zone_watermark_unusable_free() subtract CMA pages, and thus we won't
pass the order-0 check anymore once the non-CMA part is exhausted. There
is some risk that in some different scenario the compaction could in fact
migrate pages from the exhausted non-CMA part of the zone to the CMA part
and succeed, and we'll skip it instead. But only __GFP_NORETRY
allocations should be affected in the immediate "goto nopage" when
compaction is skipped, others will attempt with DEF_COMPACT_PRIORITY
anyway and won't fail without trying to compact-migrate the non-CMA
pageblocks into CMA pageblocks first, so it should be fine.

After this fix, it only takes a few tens of seconds to start a 32GB
virtual machine with device passthrough functionality.

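To make the effect concrete, here are hypothetical numbers for one zone
of the machine above (the helper names are real, the figures are
illustrative and not taken from the patch):

        /*
         * Zone state on one node once the non-CMA part is exhausted:
         *   NR_FREE_PAGES     ~ 16GB, all of it in CMA pageblocks
         *   NR_FREE_CMA_PAGES ~ 16GB
         *
         * For an unmovable allocation, alloc_flags lacks ALLOC_CMA, so
         * __zone_watermark_unusable_free() subtracts NR_FREE_CMA_PAGES:
         *   usable free = 16GB - 16GB = 0
         * The order-0 check now fails, compaction returns COMPACT_SKIPPED,
         * and __alloc_pages_slowpath() can fall back to another node.
         */
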
Link: https://lore.kernel.org/lkml/1736335854-548-1-git-send-email-yangge1116@126.com/
Link: https://lkml.kernel.org/r/1737788037-8439-1-git-send-email-yangge1116@126.com
Signed-off-by: yangge
Acked-by: Vlastimil Babka
Reviewed-by: Baolin Wang
Acked-by: Johannes Weiner
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 8779571b2e1e..eb77b1456012 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2386,7 +2386,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
  */
 static enum compact_result
 compaction_suit_allocation_order(struct zone *zone, unsigned int order,
-                                int highest_zoneidx, unsigned int alloc_flags)
+                                int highest_zoneidx, unsigned int alloc_flags,
+                                bool async)
 {
        unsigned long watermark;

@@ -2395,6 +2396,23 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
                              alloc_flags))
                return COMPACT_SUCCESS;

+       /*
+        * For unmovable allocations (without ALLOC_CMA), check if there is enough
+        * free memory in the non-CMA pageblocks. Otherwise compaction could form
+        * the high-order page in CMA pageblocks, which would not help the
+        * allocation to succeed. However, limit the check to costly order async
+        * compaction (such as opportunistic THP attempts) because there is the
+        * possibility that compaction would migrate pages from non-CMA to CMA
+        * pageblock.
+        */
+       if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
+           !(alloc_flags & ALLOC_CMA)) {
+               watermark = low_wmark_pages(zone) + compact_gap(order);
+               if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+                                        0, zone_page_state(zone, NR_FREE_PAGES)))
+                       return COMPACT_SKIPPED;
+       }
+
        if (!compaction_suitable(zone, order, highest_zoneidx))
                return COMPACT_SKIPPED;

@@ -2428,7 +2446,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
        if (!is_via_compact_memory(cc->order)) {
                ret = compaction_suit_allocation_order(cc->zone, cc->order,
                                                       cc->highest_zoneidx,
-                                                      cc->alloc_flags);
+                                                      cc->alloc_flags,
+                                                      cc->mode == MIGRATE_ASYNC);
                if (ret != COMPACT_CONTINUE)
                        return ret;
        }
@@ -2934,7 +2953,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)

                ret = compaction_suit_allocation_order(zone,
                                pgdat->kcompactd_max_order,
-                               highest_zoneidx, ALLOC_WMARK_MIN);
+                               highest_zoneidx, ALLOC_WMARK_MIN,
+                               false);
                if (ret == COMPACT_CONTINUE)
                        return true;
        }
@@ -2975,7 +2995,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                        continue;

                ret = compaction_suit_allocation_order(zone,
-                               cc.order, zoneid, ALLOC_WMARK_MIN);
+                               cc.order, zoneid, ALLOC_WMARK_MIN,
+                               false);
                if (ret != COMPACT_CONTINUE)
                        continue;

--
Gitee
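
Taken together, patches 09 and 10 leave all callers on a single helper; a
sketch of the resulting call pattern in compact_zone(), assembled from the
hunks above:

        ret = compaction_suit_allocation_order(cc->zone, cc->order,
                                               cc->highest_zoneidx,
                                               cc->alloc_flags,
                                               cc->mode == MIGRATE_ASYNC);
        if (ret != COMPACT_CONTINUE)
                return ret;

The two kcompactd callers pass ALLOC_WMARK_MIN and async=false, so the
new non-CMA watermark check only fires for costly-order, asynchronous
direct compaction.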