From f49b96278179b669d90fec03d3f9160f1b60eb25 Mon Sep 17 00:00:00 2001
From: leizongkun
Date: Mon, 3 Nov 2025 21:13:13 +0800
Subject: [PATCH 1/2] kvm: support hugepage POD when migrating a VM

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/ID49LF?from=project-issue
CVE: NA
Reference: NA

--------------------------------

memlink: support hugepage POD when migrating a VM.

Add a KVM_POD_TOUCHED_LOG vm ioctl that walks every memslot with dirty
logging enabled and sets the dirty bitmap only for guest pages that are
already mapped (touched) on the host, so that unpopulated POD hugepages
are not transferred during live migration. Memslots carrying the new
KVM_MEM_HUGE_POD flag have their dirty bitmap cleared before the touched
pages are marked.

Signed-off-by: wangzhigang
Signed-off-by: zhangliang
Signed-off-by: leizongkun
---
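
Usage sketch (below the fold, not part of the commit): a minimal view of
the intended userspace flow, assuming vm_fd is an open KVM VM fd and the
memslot was registered with KVM_MEM_LOG_DIRTY_PAGES (optionally also
KVM_MEM_HUGE_POD) with its bitmap buffer allocated by the VMM. The helper
name is illustrative only.

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int fetch_touched_log(int vm_fd, __u32 slot_id, void *bitmap)
{
	struct kvm_dirty_log log = { .slot = slot_id, .dirty_bitmap = bitmap };

	/*
	 * New ioctl added by this patch: mark the already-mapped (touched)
	 * guest pages in the dirty bitmaps of all dirty-log memslots.
	 */
	if (ioctl(vm_fd, KVM_POD_TOUCHED_LOG) < 0)
		return -1;

	/* Existing KVM API: retrieve the per-slot dirty bitmap as usual. */
	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}
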
 arch/arm64/kvm/arm.c     |   3 +
 arch/arm64/kvm/mmu.c     | 128 +++++++++++++++++++++++++++++++++++++++
 include/linux/hugetlb.h  |   9 +++
 include/linux/kvm_host.h |   2 +
 include/uapi/linux/kvm.h |   3 +
 mm/hugetlb.c             |   4 +-
 virt/kvm/kvm_main.c      |   1 +
 7 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index d3831c273843..5f277922b3b9 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2073,6 +2073,9 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 
 		return kvm_vm_set_attr(kvm, &attr);
 	}
+	case KVM_POD_TOUCHED_LOG: {
+		return kvm_mmu_mark_touched_log(kvm);
+	}
 #ifdef CONFIG_VIRT_PLAT_DEV
 	case KVM_CREATE_SHADOW_DEV: {
 		struct kvm_master_dev_info *mdi;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 9873838ef342..0efcc6ee70b5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2285,3 +2285,131 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
 
 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
 }
+
+static int __kvm_mmu_hva_is_mapped(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	int mapped = 1;
+	unsigned long vma_page_size;
+
+	if (kvm_is_error_hva(addr))
+		return -1;
+
+	down_read(&current->mm->mmap_lock);
+	vma = find_vma_intersection(current->mm, addr, addr + 1);
+	if (vma == NULL)
+		goto out;
+
+	vma_page_size = vma_kernel_pagesize(vma);
+	mapped = (vma_page_size - (addr % vma_page_size)) >> PAGE_SHIFT;
+
+	if (is_vm_hugetlb_page(vma)) {
+		if (!hugetlbfs_pagecache_present(hstate_vma(vma), vma, addr))
+			mapped = -mapped;
+	} else {
+		/*
+		 * Currently, only hugetlb file mapping is supported.
+		 * Print a warning for other file mappings to remind
+		 * developers to adapt.
+		 */
+		WARN_ON(vma->vm_file);
+	}
+out:
+	up_read(&current->mm->mmap_lock);
+	return mapped;
+}
+
+static void kvm_mmu_mark_memslot_touched_log_range(
+				struct kvm_memory_slot *slot,
+				gfn_t start, gfn_t end)
+{
+	struct page *page = NULL;
+	struct page *hpage = NULL;
+	int npages;
+
+	while (start < end) {
+		npages = gfn_to_page_many_atomic(slot, start, &page, 1);
+		if (npages <= 0) {
+			npages = __kvm_mmu_hva_is_mapped(gfn_to_hva_memslot(slot, start));
+		} else {
+			if (PageCompound(page)) {
+				hpage = compound_head(page);
+				npages = 1 << compound_order(hpage);
+				npages -= start % npages;
+			} else {
+				npages = 1;
+			}
+			put_page(page);
+		}
+		if (WARN_ON(npages == 0))
+			npages = 1;
+		if (npages < 0) {
+			/* skip -npages gfns, which are unmapped */
+			start -= npages;
+			continue;
+		}
+
+		bitmap_set(slot->dirty_bitmap, start - slot->base_gfn,
+			   min(end - start, (gfn_t)npages));
+		start += npages;
+	}
+}
+
+#define MARK_TOUCHED_STEP_GFN	(512 * 256)
+static void clear_memslot_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+	unsigned long n = kvm_dirty_bitmap_bytes(memslot);
+
+	memslot->flags &= ~KVM_MEM_HUGE_POD;
+	memset(memslot->dirty_bitmap, 0, n);
+}
+
+int kvm_mmu_mark_touched_log(struct kvm *kvm)
+{
+	int i, bkt;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	bool has_next = true;
+	unsigned long gfn_offset = 0;
+
+	mutex_lock(&kvm->slots_lock);
+
+	while (has_next) {
+		has_next = false;
+		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+			slots = __kvm_memslots(kvm, i);
+			kvm_for_each_memslot(memslot, bkt, slots) {
+				gfn_t start, end;
+
+				/* skip invalid memslot */
+				if (memslot->flags & KVM_MEMSLOT_INVALID)
+					continue;
+
+				/* only memslots with dirty logging need their bitmaps set */
+				if (!memslot->dirty_bitmap)
+					continue;
+
+				if (memslot->flags & KVM_MEM_HUGE_POD)
+					clear_memslot_dirty_bitmap(memslot);
+
+				start = max(memslot->base_gfn + gfn_offset, memslot->base_gfn);
+				end = min(start + MARK_TOUCHED_STEP_GFN,
+					  memslot->base_gfn + memslot->npages);
+				if (start >= end)
+					continue;
+				has_next = true;
+				kvm_mmu_mark_memslot_touched_log_range(memslot, start, end);
+			}
+		}
+		if (has_next) {
+			mutex_unlock(&kvm->slots_lock);
+			gfn_offset += MARK_TOUCHED_STEP_GFN;
+			cond_resched();
+			mutex_lock(&kvm->slots_lock);
+		} else {
+			mutex_unlock(&kvm->slots_lock);
+		}
+	}
+
+	return 0;
+}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 93da3db9d7ce..5a28d76b2845 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -174,6 +174,9 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
 
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud);
+bool hugetlbfs_pagecache_present(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long address);
 
 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
 
@@ -1295,6 +1298,12 @@ static inline void hugetlb_register_node(struct node *node)
 static inline void hugetlb_unregister_node(struct node *node)
 {
 }
+
+static inline bool hugetlbfs_pagecache_present(
+		struct hstate *h, struct vm_area_struct *vma, unsigned long address)
+{
+	return false;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a5cebf76aaa5..f74a8afe3953 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2447,6 +2447,8 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
 
+int kvm_mmu_mark_touched_log(struct kvm *kvm);
+
 /*
  * If more than one page is being (un)accounted, @virt must be the address of
  * the first page of a block of pages what were allocated together (i.e
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3178fbd16892..0036cfaf5d69 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -102,6 +102,7 @@ struct kvm_userspace_memory_region {
  */
 #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
 #define KVM_MEM_READONLY	(1UL << 1)
+#define KVM_MEM_HUGE_POD	(1UL << 9)
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
@@ -1568,6 +1569,8 @@ struct kvm_user_data {
 	struct kvm_numa_info numa_info;
 };
 
+#define KVM_POD_TOUCHED_LOG	_IO(KVMIO, 0xfe)
+
 /* enable ucontrol for s390 */
 struct kvm_s390_ucas_mapping {
 	__u64 user_addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1cd1196e0d66..40bfffb1d0b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6080,8 +6080,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 /*
  * Return whether there is a pagecache page to back given address within VMA.
  */
-static bool hugetlbfs_pagecache_present(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address)
+bool hugetlbfs_pagecache_present(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	pgoff_t idx = linear_page_index(vma, address);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9e351bce483e..053b42ec258a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1586,6 +1586,7 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m
 #ifdef __KVM_HAVE_READONLY_MEM
 	valid_flags |= KVM_MEM_READONLY;
 #endif
+	valid_flags |= KVM_MEM_HUGE_POD;
 
 	if (mem->flags & ~valid_flags)
 		return -EINVAL;
-- 
Gitee

From 3fa71f36757203b5a2f08f8ecbd59bda8616e8dc Mon Sep 17 00:00:00 2001
From: leizongkun
Date: Tue, 4 Nov 2025 09:19:22 +0800
Subject: [PATCH 2/2] reclaim_notify: support hugepage reclaim notify

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/ID49LF?from=project-issue
CVE: NA
Reference: NA

--------------------------------

memlink: support hugepage reclaim notify.

When a hugetlb page fault in a KVM process cannot allocate a huge page
(-ENOSPC), drop the mmap lock, send an RR_HUGEPAGE_RECLAIM notification
for the faulting NUMA node, and return VM_FAULT_RETRY so the fault is
retried after reclaim has had a chance to free huge pages. Concurrent
notifications for the same node are serialized with a per-node mutex.

Signed-off-by: wangzhigang
Signed-off-by: zhangliang
Signed-off-by: leizongkun
---
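
Usage note (below the fold, not part of the commit): with the new
RR_HUGEPAGE_RECLAIM reason, do_reclaim_notify() interprets its context
argument as a pointer to the target NUMA node id, handles the request
synchronously, and drops it for remote nodes. A minimal hypothetical
caller, assuming the do_reclaim_notify() declaration is visible:

	/* Ask the reclaim notifier to free memory on one NUMA node. */
	static unsigned long notify_hugepage_reclaim(int nid)
	{
		return do_reclaim_notify(RR_HUGEPAGE_RECLAIM, &nid);
	}
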
 include/linux/mm.h       |  1 +
 include/linux/mm_types.h |  2 +-
 mm/hugetlb.c             | 86 ++++++++++++++++++++++++++++++++++++++++
 mm/reclaim_notify.c      | 11 +++++
 virt/kvm/kvm_main.c      |  4 +-
 5 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 55bb6ba97a63..3365f2056b3f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4215,6 +4215,7 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma);
 enum reclaim_reason {
 	RR_KSWAPD,
 	RR_DIRECT_RECLAIM,
+	RR_HUGEPAGE_RECLAIM,
 	RR_TYPES
 };
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 64c38b09e18d..262641e1af75 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1006,7 +1006,7 @@ struct mm_struct {
 		/* total used reliable pages */
 		atomic_long_t reliable_nr_page;
 #endif
-#if IS_ENABLED(CONFIG_ETMEM) && IS_ENABLED(CONFIG_KVM)
+#if IS_ENABLED(CONFIG_ETMEM) || IS_ENABLED(CONFIG_KVM)
 		struct kvm *kvm;
 #endif
 	} __randomize_layout;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 40bfffb1d0b0..22d77129f75e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3211,6 +3211,77 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 	return ret;
 }
 
+static struct mutex *reclaim_notify_mutex_table;
+
+static void hugetlb_reclaim_notify_init(void)
+{
+	int i;
+
+	if (!IS_ENABLED(CONFIG_RECLAIM_NOTIFY))
+		return;
+
+	if (!IS_ENABLED(CONFIG_KVM))
+		return;
+
+	reclaim_notify_mutex_table = kmalloc_array(MAX_NUMNODES, sizeof(struct mutex), GFP_KERNEL);
+	if (!reclaim_notify_mutex_table)
+		return;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		mutex_init(&reclaim_notify_mutex_table[i]);
+}
+
+static bool try_hugetlb_reclaim_notify(struct hstate *h, struct mm_struct *mm,
+				       struct vm_area_struct *vma, unsigned long address)
+{
+	struct mempolicy *mpol;
+	gfp_t gfp_mask;
+	int nid;
+	nodemask_t *nodemask;
+
+	if (!IS_ENABLED(CONFIG_RECLAIM_NOTIFY))
+		return false;
+
+	if (!IS_ENABLED(CONFIG_KVM))
+		return false;
+
+	if (!reclaim_notify_mutex_table)
+		return false;
+
+	if (hstate_is_gigantic(h))
+		return false;
+
+#if IS_ENABLED(CONFIG_KVM)
+	if (!mm->kvm)
+		return false;
+#endif
+
+	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
+		gfp_mask = htlb_alloc_mask(h);
+		nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+		mpol_cond_put(mpol);
+
+		if (!mutex_trylock(&reclaim_notify_mutex_table[nid])) {
+			/* release mm lock for VM_FAULT_RETRY */
+			mmap_read_unlock(mm);
+
+			/* wait for the other reclaim notify to complete */
+			mutex_lock(&reclaim_notify_mutex_table[nid]);
+			mutex_unlock(&reclaim_notify_mutex_table[nid]);
+			return true;
+		}
+
+		/* release mm lock for VM_FAULT_RETRY */
+		mmap_read_unlock(mm);
+
+		do_reclaim_notify(RR_HUGEPAGE_RECLAIM, &nid);
+
+		mutex_unlock(&reclaim_notify_mutex_table[nid]);
+		return true;
+	}
+	return false;
+}
+
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				  unsigned long addr, int avoid_reserve)
 {
@@ -4618,6 +4689,8 @@ static int __init hugetlb_init(void)
 	hugetlb_cgroup_file_init();
 	hugetlb_sysctl_init();
 
+	hugetlb_reclaim_notify_init();
+
 #ifdef CONFIG_SMP
 	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
 #else
@@ -6261,6 +6334,19 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			ret = vmf_error(PTR_ERR(folio));
 		else
 			ret = 0;
+
+		if (ret && PTR_ERR(folio) == -ENOSPC &&
+		    (flags & FAULT_FLAG_ALLOW_RETRY) &&
+		    !(flags & FAULT_FLAG_TRIED) &&
+		    !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			hugetlb_vma_unlock_read(vma);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+			if (try_hugetlb_reclaim_notify(h, mm, vma, address))
+				return VM_FAULT_RETRY;
+			return ret;
+		}
+
 		goto out;
 	}
 	clear_huge_page(&folio->page, address, pages_per_huge_page(h));
diff --git a/mm/reclaim_notify.c b/mm/reclaim_notify.c
index d910e2956102..386cecc1e672 100644
--- a/mm/reclaim_notify.c
+++ b/mm/reclaim_notify.c
@@ -70,6 +70,17 @@ unsigned long do_reclaim_notify(enum reclaim_reason reason,
 
 		data.nr_nid = idx;
 		data.sync = true;
+	} else if (reason == RR_HUGEPAGE_RECLAIM) {
+		if (WARN_ON((int *)reclaim_context == NULL))
+			return 0;
+
+		nid = *(int *)reclaim_context;
+		if (numa_is_remote_node(nid))
+			return 0;
+
+		data.nid[0] = nid;
+		data.nr_nid = 1;
+		data.sync = true;
 	} else {
 		pg_data_t *pgdat = (pg_data_t *)reclaim_context;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 053b42ec258a..0a8e12c9ba0b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1324,7 +1324,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_destroy_pm_notifier(kvm);
 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
 
-#if IS_ENABLED(CONFIG_ETMEM)
+#if IS_ENABLED(CONFIG_ETMEM) || IS_ENABLED(CONFIG_KVM)
 	if (mm->kvm == kvm)
 		mm->kvm = NULL;
 #endif
@@ -5151,7 +5151,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 		goto put_kvm;
 	}
 
-#if IS_ENABLED(CONFIG_ETMEM)
+#if IS_ENABLED(CONFIG_ETMEM) || IS_ENABLED(CONFIG_KVM)
 	if (kvm->mm->kvm == NULL)
 		kvm->mm->kvm = kvm;
 #endif
-- 
Gitee