From 77a95858870947e016cad5e817ed167a40a5bf07 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Tue, 24 Jun 2025 17:11:35 +0800 Subject: [PATCH 01/34] mm: gmem: Introduce CONFIG_GMEM euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Introduce config GMEM in preparation for isolation code for gmem. Signed-off-by: nicunshu --- mm/Kconfig | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index bf162fee0b5f..56e0df21bc85 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1489,6 +1489,21 @@ config NUMABALANCING_MEM_SAMPLING if unsure, say N to disable the NUMABALANCING_MEM_SAMPLING. +config GMEM + bool "gmem subsystem for multi-MMU cooperative management" + depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + default y + help + This provides a high-level interface that decouples MMUspecific functions. + Device drivers can thus attach themselves to a process’s address space and + let the OS take charge of their memory management. This eliminates + the need for device drivers to reinvent the wheel and allows them to + benefit from general memory optimizations integrated by GMEM. + + say Y here to enable gmem subsystem + + source "mm/damon/Kconfig" config THP_CONTROL -- Gitee From 270d4bb93ebb4afac4754e35321ab820de5d2a77 Mon Sep 17 00:00:00 2001 From: Ni Cunshu Date: Tue, 24 Jun 2025 17:32:20 +0800 Subject: [PATCH 02/34] mm: gmem: Introduce new node state N_HETEROGENEOUS euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Introduce new node state N_HETEROGENEOUS to indicate heterogeneous memory devices. Co-developed-by: Jiangtian Feng Signed-off-by: Jiangtian Feng Co-developed-by: liuzixian Signed-off-by: liuzixian Signed-off-by: Ni Cunshu --- drivers/base/node.c | 6 ++++++ include/linux/nodemask.h | 12 ++++++++++++ mm/page_alloc.c | 3 +++ 3 files changed, 21 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 75f78552da5a..0ece939cc5f6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -935,6 +935,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), +#endif }; static struct attribute *node_state_attrs[] = { @@ -947,6 +950,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_GMEM + &node_state_attr[N_HETEROGENEOUS].attr.attr, +#endif NULL }; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1..f005f3d903ae 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,6 +407,11 @@ enum node_states { N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_GMEM +#ifndef __GENKSYMS__ + N_HETEROGENEOUS, /* The node has heterogeneous memory */ +#endif +#endif NR_NODE_STATES }; @@ -536,6 +541,13 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#ifdef CONFIG_GMEM 
+/* For h-NUMA topology */ +#define hnode_map node_states[N_HETEROGENEOUS] +#define num_hnodes() num_node_state(N_HETEROGENEOUS) +#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) +#endif + /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce0203f660e8..32b3921949cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -195,6 +195,9 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = NODE_MASK_NONE, +#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM -- Gitee From b179186ab273350897758b0444ec7ed598e480e9 Mon Sep 17 00:00:00 2001 From: Yang Yanchao Date: Tue, 24 Jun 2025 19:21:37 +0800 Subject: [PATCH 03/34] mm: gmem: Introduce gmem related madvise euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues//ICHFJN --------------------------------------------- Introduce hmadvise via ioctl in order to specific gmem behavior. Introduce new madvise opcode for hmadvise: MADV_PREFETCH: prefetch pages for hNUMA node MADV_PINNED: pin pages In order to avoid conflict to existing or new madvise opcode, make the new one begin with 0x1000. Signed-off-by: Yang Yanchao --- include/uapi/asm-generic/mman-common.h | 5 +++++ init/main.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 14e5498efd7a..5bd675448f53 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,11 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_GMEM_BASE 0x1000 +#define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ +#define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ + #define MADV_ETMEM_BASE 0x1100 #define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ #define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) diff --git a/init/main.c b/init/main.c index f97f06547078..51395ee7a27d 100644 --- a/init/main.c +++ b/init/main.c @@ -102,6 +102,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -905,6 +909,10 @@ void start_kernel(void) smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); +#ifdef CONFIG_GMEM + hnuma_init(); +#endif + pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); -- Gitee From a2f8d9879ba4162d5c67e1328b8c30cc6b7544ae Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Tue, 24 Jun 2025 19:39:12 +0800 Subject: [PATCH 04/34] mm: gmem: Introduce vm_object in preparation for gmem euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Defines a centrailized logical mapping table that reflects the mapping information regardless of the underlying arch-specific MMUs. 
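For illustration, the lookup-or-create pattern that later patches in this
series apply to the logical page table (see gm_dev_fault() in mm/gmem.c)
looks like the sketch below. lookup_or_create() is only an illustrative
wrapper, not a helper added by this series; obj is the vm_object attached
to a peer-shared VMA and addr is a huge-page-aligned virtual address:

	static struct gm_mapping *lookup_or_create(struct vm_object *obj,
						   unsigned long addr)
	{
		struct gm_mapping *gm_mapping;

		/* The xarray lock serializes updates to the logical page table. */
		xa_lock(obj->logical_page_table);
		gm_mapping = vm_object_lookup(obj, addr);
		if (!gm_mapping) {
			/* No logical mapping yet: create one, then re-lookup. */
			vm_object_mapping_create(obj, addr);
			gm_mapping = vm_object_lookup(obj, addr);
		}
		xa_unlock(obj->logical_page_table);

		return gm_mapping;
	}

Note that this patch only introduces the data structure and inline stubs in
include/linux/vm_object.h; the real vm_object_lookup() and
vm_object_mapping_create() bodies arrive with mm/vm_object.c later in the
series.
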
Co-developed-by: Ni Cunshu Signed-off-by: Ni Cunshu Signed-off-by: Liu Chao --- include/linux/mm_types.h | 44 +++++++++++++++++++++++++++++++++++++++ include/linux/vm_object.h | 16 ++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 include/linux/vm_object.h diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 64c38b09e18d..bcfbaa36bbbb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -612,6 +616,43 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +#ifdef CONFIG_GMEM +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. + */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; +#endif + struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. 
*/ @@ -732,6 +773,9 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_GMEM + struct vm_object *vm_obj; +#endif #ifdef CONFIG_SHARE_POOL struct sp_area *spa; #endif diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h new file mode 100644 index 000000000000..d39b461799f2 --- /dev/null +++ b/include/linux/vm_object.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VM_OBJECT_H +#define _VM_OBJECT_H + +#include +#include + +#ifdef CONFIG_GMEM +/* vm_object KAPI */ +static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, + unsigned long va) { return NULL; } +static inline void vm_object_mapping_create(struct vm_object *obj, + unsigned long start) { return 0; } +#endif + +#endif /* _VM_OBJECT_H */ -- Gitee From fa595d88caef62da2a76a7d5027367662925bc4d Mon Sep 17 00:00:00 2001 From: wangbin Date: Tue, 24 Jun 2025 19:45:30 +0800 Subject: [PATCH 05/34] mm: gmem: Introduce GMEM euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- The functions of the GMEM can be summarized as follows: First, the accelerator driver can directly reuse the core VM code of Linux by providing the MMU operation function required by the GMEM, and no independent memory management mechanism is required. Second, the GMEM can coordinate a page table between multiple heterogeneous MMUs, so as to implement memory coherence (memory coherence) between the CPU and the accelerator in a same address space. From a kernel point of view, the driver code for memory management with repetitive functions is greatly reduced. From the perspective of driver programming, the development and maintenance workload of driver code is greatly reduced. From the perspective of application development, the same address space greatly reduces programming complexity, while GMEM provides heterogeneous memory semantics to enhance flexibility and ease of use in performance tuning. To enable gmem, add "gmem=on" in kernel commandline. Co-developed-by: Yang Yanchao Signed-off-by: Yang Yanchao Co-developed-by: Ni Cunshu Signed-off-by: Ni Cunshu Co-developed-by: luochunsheng Signed-off-by: luochunsheng Co-developed-by: Weixi Zhu Signed-off-by: Weixi Zhu Signed-off-by: wangbin --- include/linux/gmem.h | 347 ++++++++++++++++ include/linux/gmem_as.h | 36 ++ include/linux/mm.h | 41 ++ include/linux/vm_object.h | 1 + mm/Makefile | 2 +- mm/gmem.c | 836 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 1262 insertions(+), 1 deletion(-) create mode 100644 include/linux/gmem.h create mode 100644 include/linux/gmem_as.h create mode 100644 mm/gmem.c diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 000000000000..3216b55d659d --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#include + +struct hnode; + +/* + * enum gm_ret - The return value of GMEM KPI that can be used to tell + * the core VM or peripheral driver whether the GMEM KPI was + * executed successfully. + * + * @GM_RET_SUCCESS: The invoked GMEM KPI behaved as expected. + * @GM_RET_FAILURE_UNKNOWN: The GMEM KPI failed with unknown reason. + * Any external status related to this KPI invocation changes must be rolled back. 
+ */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_DMA_ERROR, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, + GM_RET_UNIMPLEMENTED, +}; + +/* + * Defines a contiguous range of virtual addresses inside a struct gm_as + * As an analogy, this is conceptually similar as virtual_address_struct + */ +struct gm_region { + unsigned long start_va; + unsigned long end_va; + struct rb_node node; + struct gm_as *as; /* The address space that it belongs to */ + + /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ + struct list_head mapping_set_link; + + void (*callback_op)(void *args); + void *cb_args; +}; + +/* This holds a list of regions that must not be concurrently manipulated. */ +struct gm_mapping_set { + unsigned int region_cnt; + struct list_head gm_region_list; +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. + * + * @GM_MMU_MODE_SHARE: Literally share a physical page table with another + * attached device's MMU. Nothing is guaranteed about the allocated address. + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger fault-driven + * migration for automatic data locality optimizations. + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical + * mapping entries whenever a physical mapping is installed inside the address space, so + * that it may minimize the page faults to be triggered by this device. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +/* + * This is the parameter list of peer_map/unmap mmu operations. + * if device should copy data to/from host, set copy and dma_addr + */ +struct gm_fault_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long va; + unsigned long size; + unsigned long prot; + bool copy; + dma_addr_t dma_addr; + int behavior; +}; + +struct gm_memcpy_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long src; + unsigned long dest; + dma_addr_t dma_addr; + size_t size; +}; + +/** + * + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. + * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. + */ +struct gm_mmu { + /* + * Each bit indicates a supported page size for page-based TLB. + * Currently we do not consider range TLBs. + */ + unsigned long pgsize_bitmap; + + /* + * cookie identifies the type of the MMU. If two gm_mmu shares the same cookie, + * then it means their page table formats are compatible. + * In that case, they can share the same void *pmap as the input arg. + */ + unsigned long cookie; + + /* Synchronize VMA in a peer OS to interact with the host OS */ + enum gm_ret (*peer_va_alloc_fixed)(struct mm_struct *mm, unsigned long va, + unsigned long size, unsigned long prot); + enum gm_ret (*peer_va_free)(struct mm_struct *mm, unsigned long va, + unsigned long size); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + enum gm_ret (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. 
+ * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + enum gm_ret (*peer_unmap)(struct gm_fault_t *gmf); + + /* Create or destroy a device's physical page table. */ + enum gm_ret (*pmap_create)(struct gm_dev *dev, void **pmap); + enum gm_ret (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + enum gm_ret (*pmap_enter)(void *pmap, unsigned long va, unsigned long size, + unsigned long pa, unsigned long prot); + enum gm_ret (*pmap_release)(void *pmap, unsigned long va, unsigned long size); + + /* Change the protection of a virtual page */ + enum gm_ret (*pmap_protect)(void *pmap, unsigned long va, unsigned long size, + unsigned long new_prot); + + /* Invalidation functions of the MMU TLB */ + enum gm_ret (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); + enum gm_ret (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); +}; + +/** + * unsigned long defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. + * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + struct gm_as *as; + struct gm_dev *dev; + void *pmap; + /* + * consider a better container to maintain multiple ctx inside a device or multiple ctx + * inside a va space. + * A device may simultaneously have multiple contexts for time-sliced ctx switching + */ + struct list_head gm_dev_link; + + /* A va space may have multiple gm_context */ + struct list_head gm_as_link; +}; +#define get_gm_context(head) (list_entry((head)->prev, struct gm_context, ctx_link)) + +struct gm_dev { + int id; + + /* identifies the device capability + * For example, whether the device supports page faults or whether it has its + * own OS that manages the VA and PA resources. + */ + unsigned long capability; + struct gm_mmu *mmu; + void *dev_data; + /* + * TODO: Use a better container of struct gm_context to support time-sliced context switch. + * A collection of device contexts. If the device does not support time-sliced context + * switch, then the size of the collection should never be greater than one. + * We need to think about what operators should the container be optimized for. + * A list, a radix-tree or what? What would gm_dev_activate require? + * Are there any accelerators that are really going to support time-sliced context switch? + */ + struct gm_context *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. */ + nodemask_t registered_hnodes; + struct device *dma_dev; + + struct gm_mapping *gm_mapping; +}; + +#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ +#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. 
*/ +#define GM_PAGE_DEVICE 0x20 +#define GM_PAGE_NOMAP 0x40 +#define GM_PAGE_PINNED 0x80 +#define GM_PAGE_WILLNEED 0x100 + +#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) + +/* Records the status of a page-size physical page */ +struct gm_mapping { + unsigned int flag; + + union { + struct page *page; /* CPU node */ + struct gm_dev *dev; /* hetero-node */ + unsigned long pfn; + }; + + struct mutex lock; +}; + +static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) +{ + if (flags & GM_PAGE_TYPE_MASK) + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + + gm_mapping->flag |= flags; +} + +static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int flags) +{ + gm_mapping->flag &= ~flags; +} + +static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_CPU); +} + +static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_DEVICE); +} + +static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_NOMAP); +} + +static inline bool gm_mapping_willneed(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_WILLNEED); +} + +static inline bool gm_mapping_pinned(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_PINNED); +} + +#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } + +/* GMEM Device KPI */ +extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev); +extern enum gm_ret gm_dev_destroy(struct gm_dev *dev); +extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); +extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); +extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end); +enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size); + +/* GMEM address space KPI */ +extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end); +extern void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid); +extern struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order); +extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as); +extern enum gm_ret gm_as_destroy(struct gm_as *as); +extern enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx); +extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned long size, + unsigned long align, unsigned long no_cross, unsigned long max_va, + struct gm_region **new_region); + +extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); + +enum gmem_stat_item { + NR_PAGE_MIGRATING, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_state_counter(enum gmem_stat_item item, int val); +extern void gmem_state_counter_show(void); + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + + struct gm_dev *dev; + + struct xarray pages; +}; + +extern struct hnode *hnodes[]; + +static inline bool is_hnode(int node) +{ + return (node < 
MAX_NUMNODES) && !node_isset(node, node_possible_map) && + node_isset(node, hnode_map); +} + +static inline bool is_hnode_allowed(int node) +{ + return (node < MAX_NUMNODES) && is_hnode(node) && + node_isset(node, current->mems_allowed); +} + +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +void __init hnuma_init(void); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); +void hnode_deinit(unsigned int hnid, struct gm_dev *dev); + +#endif /* _GMEM_H */ diff --git a/include/linux/gmem_as.h b/include/linux/gmem_as.h new file mode 100644 index 000000000000..d691de1162eb --- /dev/null +++ b/include/linux/gmem_as.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _GMEM_AS_H +#define _GMEM_AS_H + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. */ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of struct gm_as */ + struct rb_root rbroot; /*root of gm_region_t */ + enum gm_as_alloc policy; + unsigned long start_va; + unsigned long end_va; + /* defines the VA unit size if an object cache is applied */ + unsigned long cache_quantum; + /* tracks device contexts attached to this va space, using gm_as_link */ + struct list_head gm_ctx_list; +}; + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 55bb6ba97a63..5b13113a0925 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -342,6 +342,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) + +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED BIT(56) +#else +#define VM_PEER_SHARED VM_NONE +#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -576,6 +582,13 @@ struct vm_fault { KABI_RESERVE(3) }; +/* page entry size for vm->huge_fault() */ +enum page_entry_size { + PE_SIZE_PTE = 0, + PE_SIZE_PMD, + PE_SIZE_PUD, +}; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -3404,6 +3417,10 @@ unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long get_unmapped_area_aligned(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags, unsigned long align); + extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); @@ -4297,4 +4314,28 @@ int set_linear_mapping_invalid(unsigned long start_pfn, unsigned long end_pfn, return -EINVAL; } #endif + +#ifdef CONFIG_GMEM +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool gmem_is_enabled(void) +{ + 
return static_branch_likely(&gmem_status); +} + +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); +} +#else +static inline bool gmem_is_enabled(void) { return false; } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index d39b461799f2..083a1278901a 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -7,6 +7,7 @@ #ifdef CONFIG_GMEM /* vm_object KAPI */ +static inline int __init vm_object_init(void) { return 0; } static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) { return NULL; } static inline void vm_object_mapping_create(struct vm_object *obj, diff --git a/mm/Makefile b/mm/Makefile index 49ec4f839fe7..46d2d12fad3e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o - +mmu-$(CONFIG_GMEM) += gmem.o ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 000000000000..add5062296b6 --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,836 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static struct kmem_cache *gm_region_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; + +static inline unsigned long pe_mask(unsigned int order) +{ + if (order == 0) + return PAGE_MASK; + if (order == PMD_ORDER) + return HPAGE_PMD_MASK; + if (order == PUD_ORDER) + return HPAGE_PUD_MASK; + return ~0; +} + +static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; + +void gmem_state_counter(enum gmem_stat_item item, int val) +{ + if (!gmem_is_enabled()) + return; + + if (WARN_ON_ONCE(unlikely(item >= NR_GMEM_STAT_ITEMS))) + return; + + percpu_counter_add(&g_gmem_stats[item], val); +} + +static int gmem_stat_init(void) +{ + int i, rc; + + for (i = 0; i < NR_GMEM_STAT_ITEMS; i++) { + rc = percpu_counter_init(&g_gmem_stats[i], 0, GFP_KERNEL); + if (rc) { + for (i--; i >= 0; i--) + percpu_counter_destroy(&g_gmem_stats[i]); + + break; /* break the initialization process */ + } + } + + return rc; +} + +#ifdef CONFIG_PROC_FS +static int gmemstat_show(struct seq_file *m, void *arg) +{ + if (!gmem_is_enabled()) + return 0; + + seq_printf( + m, "migrating : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING])); + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = KMEM_CACHE(gm_as, 0); + if 
(!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + gm_region_cache = KMEM_CACHE(gm_region, 0); + if (!gm_region_cache) + goto free_ctx; + + err = vm_object_init(); + if (err) + goto free_ctx; + + err = gmem_stat_init(); + if (err) + goto free_ctx; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | + WQ_CPU_INTENSIVE, + GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + pr_info("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_ctx; + } + +#ifdef CONFIG_PROC_FS + proc_create_single("gmemstat", 0444, NULL, gmemstat_show); +#endif + + static_branch_enable(&gmem_status); + + return 0; + +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + strtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. + * The returned device pointer will be passed by new_dev. + * A unique id will be assigned to the GMEM device, using Linux's xarray. + */ +gm_ret_t gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev) +{ + struct gm_dev *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, + GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +// Destroy a GMEM device and reclaim the resources. 
+gm_ret_t gm_dev_destroy(struct gm_dev *dev) +{ + // TODO: implement it + xa_erase(&gm_dev_id_pool, dev->id); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_destroy); + +/* Handle the page fault triggered by a given device */ +gm_ret_t gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior) +{ + gm_ret_t ret = GM_RET_SUCCESS; + struct gm_mmu *mmu = dev->mmu; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + vm_object_t *obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_fault_t gmf = { .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior }; + struct page *page = NULL; + + mmap_read_lock(mm); + + vma = find_vma(mm, addr); + if (!vma) { + pr_info("gmem: %s no vma\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + obj = vma->vm_obj; + if (!obj) { + pr_info("gmem: %s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { + goto peer_map; + } else { + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + goto unlock; + } + get_page(page); + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = + dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + pr_info("gmem: dma map failed\n"); + + gmf.copy = true; + } + +peer_map: + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + if (ret == GM_RET_MIGRATING) { + /* + * gmem page is migrating due to overcommit. 
+ * update page to willneed and this will stop page evicting + */ + gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); + gmem_state_counter(NR_PAGE_MIGRATING, 1); + ret = GM_RET_SUCCESS; + } else { + pr_err("gmem: peer map failed\n"); + if (page) { + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + put_page(page); + } + } + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + put_page(page); + } + + gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); + gm_mapping->dev = dev; +unlock: + mutex_unlock(&gm_mapping->lock); +mmap_unlock: + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, + unsigned int order) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & pe_mask(order); + vm_object_t *obj = vma->vm_obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_dev *dev; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + pr_err("gmem: host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + dma_dev = dev->dma_dev; + gmf.dma_addr = + dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + pr_err("gmem: host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + pr_err("gmem: peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return ret; +} + +/* + * Register the local physical memory of a gmem device. + * This implies dynamically creating + * the struct page data structures. 
+ */ +gm_ret_t gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) +{ + struct gm_mapping *mapping; + unsigned long addr = PAGE_ALIGN(begin); + unsigned int nid; + int i, page_num = (end - addr) >> PAGE_SHIFT; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + + if (!hnode) + goto err; + + nid = alloc_hnode_id(); + if (nid == MAX_NUMNODES) + goto free_hnode; + hnode_init(hnode, nid, dev); + + mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); + if (!mapping) + goto deinit_hnode; + + for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { + mapping[i].pfn = addr >> PAGE_SHIFT; + mapping[i].flag = 0; + } + + xa_lock(&hnode->pages); + for (i = 0; i < page_num; i++) { + if (xa_err(__xa_store(&hnode->pages, i, mapping + i, + GFP_KERNEL))) { + /* Probably nomem */ + kvfree(mapping); + xa_unlock(&hnode->pages); + goto deinit_hnode; + } + __xa_set_mark(&hnode->pages, i, XA_MARK_0); + } + xa_unlock(&hnode->pages); + + return GM_RET_SUCCESS; + +deinit_hnode: + hnode_deinit(nid, dev); + free_hnode_id(nid); +free_hnode: + kfree(hnode); +err: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gm_dev_register_physmem); + +void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) +{ + struct hnode *hnode = get_hnode(nid); + struct gm_mapping *mapping = xa_load(&hnode->pages, 0); + + kvfree(mapping); + hnode_deinit(nid, dev); + free_hnode_id(nid); + kfree(hnode); +} +EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); + +struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order) +{ + struct gm_mapping *mapping; + struct hnode *node = get_hnode(nid); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return ERR_PTR(-EINVAL); + + xa_lock(&node->pages); + mapping = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0); + if (!mapping) { + xa_unlock(&node->pages); + return ERR_PTR(-ENOMEM); + } + + xas_clear_mark(&xas, XA_MARK_0); + xa_unlock(&node->pages); + + return mapping; +} +EXPORT_SYMBOL_GPL(gm_mappings_alloc); + +/* GMEM Virtual Address Space API */ +gm_ret_t gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as) +{ + struct gm_as *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->rbtree_lock); + as->rbroot = RB_ROOT; + as->start_va = begin; + as->end_va = end; + as->policy = policy; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + *new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +gm_ret_t gm_as_destroy(struct gm_as *as) +{ + struct gm_context *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +gm_ret_t gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx) +{ + struct gm_context *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, &ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should 
really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively + * flushing the CR3 value. Currently we do not care time-sliced context switch, + * unless someone wants to support it. + */ + dev->current_ctx = ctx; + } + *out_ctx = ctx; + + /* + * gm_as_attach will be used to attach device to process address space. + * Handle this case and add hnodes registered by device to process mems_allowed. + */ + for_each_node_mask(nid, dev->registered_hnodes) + node_set(nid, current->mems_allowed); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_attach); + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + for_each_node(node) + node_set(node, hnode_map); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + spin_unlock(&hnode_lock); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid] = hnode; + hnodes[hnid]->id = hnid; + hnodes[hnid]->dev = dev; + node_set(hnid, dev->registered_hnodes); + xa_init(&hnodes[hnid]->pages); +} + +void hnode_deinit(unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + xa_destroy(&hnodes[hnid]->pages); + hnodes[hnid] = NULL; +} + +struct prefetch_data { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* MADV_WILLNEED: dev will soon access this addr. */ + ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); + if (ret == GM_RET_PAGE_EXIST) { + pr_info("%s: device has done page fault, ignore prefetch\n", + __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + pr_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) + return -EINVAL; + + old_start = end; + + /* Align addr by rounding outward to make page cover addr. 
*/ + end = round_up(end, page_size); + start = round_down(addr, page_size); + size = end - start; + + if (!end && old_start) + return -EINVAL; + + if (size == 0) + return 0; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->res = &res; + if (per_size == 0) + data->size = size; + else + /* Process (1.x * per_size) for the last time */ + data->size = (end - start < 2 * per_size) ? + (end - start) : + per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int page_size) +{ + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + struct gm_mapping *gm_mapping; + vm_object_t *obj; + int ret; + + obj = vma->vm_obj; + if (!obj) { + pr_err("gmem: peer-shared vma should have vm_object\n"); + return -EINVAL; + } + + for (; start < end; start += page_size) { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + pr_err("gmem: peer_unmap failed. ret %d\n", + ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + } + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_unlock(&gm_mapping->lock); + } + + return 0; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + unsigned long start, end, i_start, i_end; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) + return -EINVAL; + + old_start = addr; + + /* Align addr by rounding inward to avoid excessive page release. */ + end = round_down(end, page_size); + start = round_up(addr, page_size); + if (start >= end) + return ret; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (old_start && !start) + return -EINVAL; + + mmap_read_lock(current->mm); + do { + vma = find_vma_intersection(current->mm, start, end); + if (!vma) { + pr_info("gmem: there is no valid vma\n"); + break; + } + + if (!vma_is_peer_shared(vma)) { + pr_debug("gmem: not peer-shared vma, skip dontneed\n"); + start = vma->vm_end; + continue; + } + + i_start = start > vma->vm_start ? start : vma->vm_start; + i_end = end < vma->vm_end ? 
end : vma->vm_end; + ret = gmem_unmap_vma_pages(vma, i_start, i_end, page_size); + if (ret) + break; + + start = vma->vm_end; + } while (start < end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) +{ + int error = -EINVAL; + struct hnode *node; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + pr_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) + return error; + + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + return error; + + node = get_hnode(hnid); + if (!node) { + pr_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + return hmadvise_do_prefetch(node->dev, start, len_in); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + pr_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} +EXPORT_SYMBOL_GPL(hmadvise_inner); -- Gitee From 8022b5ef72474aed5105df3af9935022e47287fb Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Tue, 24 Jun 2025 19:49:28 +0800 Subject: [PATCH 06/34] mm: gmem: Add gm_dev in struct device euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Add gm_dev in struct device to keep track on gmem device. Co-developed-by: Jiangtian Feng Signed-off-by: Jiangtian Feng Co-developed-by: luochunsheng Signed-off-by: luochunsheng Signed-off-by: Chen Jun --- include/linux/device.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 54a4967c496c..94262735406a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -826,7 +826,13 @@ struct device { KABI_RESERVE(2) KABI_RESERVE(3) #endif + +#ifdef CONFIG_GMEM + KABI_USE(4, void *gm_dev) +#else KABI_RESERVE(4) +#endif + KABI_RESERVE(5) KABI_RESERVE(6) KABI_RESERVE(7) -- Gitee From eb1183db4d9e9a74b9b2ed78b8ca89cca2f9a73f Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Wed, 25 Jun 2025 10:36:28 +0800 Subject: [PATCH 07/34] mm: gmem: Introduce vm_object for gmem euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Introduce vm_object for gmem. 
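Besides the vm_object implementation, this patch wires MAP_PEER_SHARED into
the mmap() path: when gmem is enabled (gmem=on), such requests are rounded
up to 2MB, placed via get_unmapped_area_aligned(), and the resulting VMA is
marked VM_PEER_SHARED. A minimal userspace sketch follows; the flag value
is taken from the asm-generic/mman-common.h hunk below, the local #define
is only needed because libc headers do not (yet) define it, and
peer_shared_alloc() is just an illustrative helper name:

	#include <stddef.h>
	#include <sys/mman.h>

	#ifndef MAP_PEER_SHARED
	#define MAP_PEER_SHARED 0x8000000	/* from asm-generic/mman-common.h */
	#endif

	/*
	 * Request an anonymous peer-shared region. With gmem=on the kernel
	 * rounds the length up to 2MB and returns a 2MB-aligned address;
	 * mmap() returns MAP_FAILED on error as usual.
	 */
	static void *peer_shared_alloc(size_t len)
	{
		return mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED,
			    -1, 0);
	}
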
Co-developed-by: fangchuangchuang Signed-off-by: fangchuangchuang Co-developed-by: Lemmy Huang Signed-off-by: Lemmy Huang Signed-off-by: Liu Chao --- include/linux/vm_object.h | 25 ++- include/uapi/asm-generic/mman-common.h | 2 + kernel/fork.c | 12 ++ mm/Makefile | 2 +- mm/gmem.c | 6 +- mm/huge_memory.c | 114 ++++++++++- mm/memory.c | 82 ++++++-- mm/mempolicy.c | 4 + mm/mmap.c | 260 ++++++++++++++++++++++++- mm/vm_object.c | 228 ++++++++++++++++++++++ 10 files changed, 709 insertions(+), 26 deletions(-) create mode 100644 mm/vm_object.c diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index 083a1278901a..e5327665b6b7 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -6,12 +6,31 @@ #include #ifdef CONFIG_GMEM -/* vm_object KAPI */ -static inline int __init vm_object_init(void) { return 0; } +/* vm_object KPI */ +int __init vm_object_init(void); +struct vm_object *vm_object_create(struct vm_area_struct *vma); +void vm_object_drop_locked(struct vm_area_struct *vma); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end); + +gm_mapping_t *alloc_gm_mapping(void); +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); +void vm_object_mapping_create(struct vm_object *obj, unsigned long start); +void free_gm_mappings(struct vm_area_struct *vma); +#else +static inline void __init vm_object_init(void) {} +static inline struct vm_object *vm_object_create(struct vm_area_struct *vma) { return NULL; } +static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end) {} + +static inline gm_mapping_t *alloc_gm_mapping(void) { return NULL; } static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) { return NULL; } static inline void vm_object_mapping_create(struct vm_object *obj, - unsigned long start) { return 0; } + unsigned long start) {} +static inline void free_gm_mappings(struct vm_area_struct *vma) {} #endif #endif /* _VM_OBJECT_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 5bd675448f53..cdcb59fbfe7f 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,6 +33,8 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x8000000 + /* * Flags for mlock */ diff --git a/kernel/fork.c b/kernel/fork.c index 78663ca68160..d984d93b3d39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,6 +99,11 @@ #include #include #include + +#ifdef CONFIG_GMEM +#include +#endif + #ifdef CONFIG_QOS_SCHED_SMART_GRID #include #endif @@ -526,6 +531,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(orig)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + new->vm_obj = vm_object_create(new); + } +#endif + return new; } diff --git a/mm/Makefile b/mm/Makefile index 46d2d12fad3e..cedd58296019 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o -mmu-$(CONFIG_GMEM) += gmem.o +mmu-$(CONFIG_GMEM) += gmem.o vm_object.o ifdef 
CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c index add5062296b6..ebf6a93bc33a 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -231,7 +231,7 @@ gm_ret_t gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *d struct gm_mmu *mmu = dev->mmu; struct device *dma_dev = dev->dma_dev; struct vm_area_struct *vma; - vm_object_t *obj; + struct vm_object *obj; struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; struct gm_fault_t gmf = { .mm = mm, @@ -334,7 +334,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, vm_fault_t ret = 0; struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & pe_mask(order); - vm_object_t *obj = vma->vm_obj; + struct vm_object *obj = vma->vm_obj; struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; struct gm_dev *dev; @@ -697,7 +697,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, .copy = false, }; struct gm_mapping *gm_mapping; - vm_object_t *obj; + struct vm_object *obj; int ret; obj = vma->vm_obj; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a28dda799978..8db8ec9dcc37 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,6 +38,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1344,6 +1348,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, const int order = HPAGE_PMD_ORDER; struct folio *folio; +#ifdef CONFIG_GMEM + /* always try to compact hugepage for peer shared vma */ + if (vma_is_peer_shared(vma)) + gfp = GFP_TRANSHUGE; +#endif + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); if (unlikely(!folio)) { @@ -1391,6 +1401,99 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } +#ifdef CONFIG_GMEM + +struct gm_mapping *vma_prepare_gm_mapping(struct vm_area_struct *vma, unsigned long haddr) +{ + struct gm_mapping *gm_mapping; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + + return gm_mapping; +} + +static vm_fault_t __do_peer_shared_anonymous_page(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio = NULL; + bool is_new_folio = false; + pgtable_t pgtable = NULL; + struct gm_mapping *gm_mapping; + vm_fault_t ret = 0; + + gm_mapping = vma_prepare_gm_mapping(vma, haddr); + if (!gm_mapping) + return VM_FAULT_OOM; + + mutex_lock(&gm_mapping->lock); + + if (gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + folio = vma_alloc_anon_folio_pmd(vma, haddr); + is_new_folio = true; + } + + if (unlikely(!folio)) { + ret = VM_FAULT_FALLBACK; + goto release; + } + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + + /** + * if page is mapped in device, release device mapping and + * deliver the page content to host. 
+ */ + if (gm_mapping_device(gm_mapping)) { + vmf->page = &folio->page; + ret = gm_host_fault_locked(vmf, PMD_ORDER); + if (ret) + goto release; + } + + /* map page in pgtable */ + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + + BUG_ON(!pmd_none(*vmf->pmd)); + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(vmf->ptl); + + /* finally setup cpu mapping */ + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + gm_mapping->page = &folio->page; + mutex_unlock(&gm_mapping->lock); + + return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + if (is_new_folio) + folio_put(folio); + mutex_unlock(&gm_mapping->lock); + return ret; +} + +#endif + static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) { unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -1424,7 +1527,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + goto gm_mapping_release; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); @@ -1496,16 +1599,19 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - vm_fault_t ret; - if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; ret = vmf_anon_prepare(vmf); if (ret) return ret; + khugepaged_enter_vma(vma, vma->vm_flags); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + return __do_peer_shared_anonymous_page(vmf); +#endif + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { diff --git a/mm/memory.c b/mm/memory.c index 4bb3acfc3dd9..9aa4d8174724 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1710,6 +1714,47 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; } +#ifdef CONFIG_GMEM +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + gm_mapping_t *gm_mapping = NULL; + struct page *page = NULL; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + + if (gm_mapping && gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(vma->vm_obj->logical_page_table); +} + +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + zap_logic_pmd_range(vma, addr, next); + } while (addr = next, addr != end); +} +#else +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +#endif + static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, @@ -1724,10 +1769,8 @@ static 
inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - else if (zap_huge_pmd(tlb, vma, pmd, addr)) { - addr = next; - continue; - } + else if (zap_huge_pmd(tlb, vma, pmd, addr)) + goto next; /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && @@ -1740,18 +1783,30 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } - if (pmd_none(*pmd)) { - addr = next; - continue; + + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_lock in read + * mode. + */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pmd_range(vma, addr, next); + goto next; } - addr = zap_pte_range(tlb, vma, pmd, addr, next, details); - if (addr != next) - pmd--; - } while (pmd++, cond_resched(), addr != end); + + next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: + cond_resched(); + } while (pmd++, addr = next, addr != end); return addr; } + + static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, @@ -1813,8 +1868,11 @@ void unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8d7732e276f3..4ee93da660cd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1903,7 +1903,11 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) +#endif return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index fb54df419ea2..a69e32ffb6b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,9 @@ #include #include #include + +#include + #include #include @@ -644,6 +647,10 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(dst)) + dup_vm_object(dst, src); +#endif int ret; vma_assert_write_locked(dst); @@ -760,6 +767,39 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_GMEM +struct gmem_vma_list { + struct vm_area_struct *vma; + struct list_head list; +}; + +void gmem_reserve_vma(struct vm_area_struct *value, struct list_head *head) +{ + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + if (!node) + return; + + node->vma = value; + list_add_tail(&node->list, head); +} + +void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + struct vm_area_struct *vma = node->vma; + + if (vma != NULL) + vm_area_free(vma); + + list_del(&node->list); + kfree(node); + } +} +#endif + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates @@ -1082,6 +1122,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(adjust)) + vm_object_adjust(adjust, adjust->vm_start + adj_start, + adjust->vm_end); +#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -1316,7 +1361,17 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + len = round_up(len, SZ_2M); + addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, + SZ_2M); + } else { + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } +#else addr = get_unmapped_area(file, addr, len, pgoff, flags); +#endif if (IS_ERR_VALUE(addr)) return addr; @@ -1439,6 +1494,10 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) + vm_flags |= VM_PEER_SHARED; +#endif addr = __mmap_region_ext(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && @@ -1447,6 +1506,7 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon *populate = len; return addr; } +EXPORT_SYMBOL(__do_mmap_mm); unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1933,6 +1993,27 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +EXPORT_SYMBOL(get_unmapped_area_aligned); + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
@@ -2472,6 +2553,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_mpol; +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + dup_vm_object(new, vma); +#endif + if (new->vm_file) get_file(new->vm_file); @@ -2486,6 +2572,18 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + if (new_below) { + vm_object_adjust(new, new->vm_start, addr); + vm_object_adjust(vma, addr, vma->vm_end); + } else { + vm_object_adjust(vma, vma->vm_start, addr); + vm_object_adjust(new, addr, new->vm_end); + } + } +#endif + if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; @@ -2523,6 +2621,68 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +#ifdef CONFIG_GMEM +static void munmap_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + unsigned long addr = start; + struct vm_object *obj = vma->vm_obj; + gm_ret_t ret; + gm_context_t *ctx, *tmp; + gm_mapping_t *gm_mapping; + + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + if (!obj) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = addr; + gmf.size = HPAGE_SIZE; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_err("%s: call dev peer_unmap error %d\n", __func__, ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); + + if (!mm->gm_as) + return; + + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + if (!ctx->dev->mmu->peer_va_free) + continue; + + ret = ctx->dev->mmu->peer_va_free(mm, start, end - start); + if (ret != GM_RET_SUCCESS) + pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", + start, end - start, ret); + } +} +#endif + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 
* @vmi: The vma iterator @@ -2653,6 +2813,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, prev = vma_iter_prev_range(vmi); next = vma_next(vmi); +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_in_peer_devices(mm, vma, start, end); +#endif if (next) vma_iter_prev_range(vmi); @@ -2711,6 +2875,17 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; + if (gmem_is_enabled()) { + vma = find_vma_intersection(mm, start, start + len); + if (!vma) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + + len = round_up(len, SZ_2M); + } + } if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2748,6 +2923,48 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } +#ifdef CONFIG_GMEM +static int alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, unsigned long len, + vm_flags_t vm_flags) +{ + gm_context_t *ctx, *tmp; + gm_prot_t prot = VM_NONE; + gm_ret_t ret; + + pr_debug("gmem: start mmap, as %p\n", mm->gm_as); + if (!mm->gm_as) + return -ENODEV; + + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + /* + * TODO: consider the concurrency problem of device + * attaching/detaching from the gm_as. + */ + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + pr_debug("gmem: call vma_alloc\n"); + ret = ctx->dev->mmu->peer_va_alloc_fixed(mm, addr, len, vm_flags); + if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + return ret; + } + } + + return GM_RET_SUCCESS; +} +#endif + static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, @@ -2762,7 +2979,12 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); +retry: +#endif /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -2774,21 +2996,33 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -2931,6 +3165,23 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, file = vma->vm_file; ksm_add_vma(vma); expanded: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + gm_ret_t ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gmem_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gmem_release_vma(mm, &reserve_list); + } +#endif perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -2974,6 +3225,9 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unacct_error: if (charged) vm_unacct_memory(charged); +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return error; } diff --git a/mm/vm_object.c b/mm/vm_object.c new file mode 100644 index 000000000000..8d3d6b121649 --- /dev/null +++ b/mm/vm_object.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Logical Mapping Management + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi zhu, chao Liu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Sine VM_OBJECT maintains the logical page table under each VMA, and each VMA + * points to a VM_OBJECT. 
Ultimately VM_OBJECTs must be maintained as long as VMA + * gets changed: merge, split, adjust + */ +static struct kmem_cache *vm_object_cachep; +static struct kmem_cache *gm_mapping_cachep; + +/* gm_mapping will not be release dynamically */ +gm_mapping_t *alloc_gm_mapping(void) +{ + gm_mapping_t *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + + if (!gm_mapping) + return NULL; + + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_init(&gm_mapping->lock); + + return gm_mapping; +} +EXPORT_SYMBOL(alloc_gm_mapping); + +static inline void release_gm_mapping(gm_mapping_t *mapping) +{ + kmem_cache_free(gm_mapping_cachep, mapping); +} + +static inline gm_mapping_t *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) +{ + return xa_load(obj->logical_page_table, pindex); +} + +int __init vm_object_init(void) +{ + vm_object_cachep = KMEM_CACHE(vm_object, 0); + if (!vm_object_cachep) + goto out; + + gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0); + if (!gm_mapping_cachep) + goto free_vm_object; + + return 0; +free_vm_object: + kmem_cache_destroy(vm_object_cachep); +out: + return -ENOMEM; +} + +/* + * Create a VM_OBJECT and attach it to a VMA + * This should be called when a VMA is created. + */ +struct vm_object *vm_object_create(struct vm_area_struct *vma) +{ + struct vm_object *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL); + + if (!obj) + return NULL; + + spin_lock_init(&obj->lock); + obj->vma = vma; + + /* + * The logical page table maps linear_page_index(obj->vma, va) + * to pointers of struct gm_mapping. + */ + obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL); + if (!obj->logical_page_table) { + kmem_cache_free(vm_object_cachep, obj); + return NULL; + } + + xa_init(obj->logical_page_table); + atomic_set(&obj->nr_pages, 0); + atomic_set(&obj->ref_count, 1); + + return obj; +} + +/* This should be called when a VMA no longer refers to a VM_OBJECT */ +void vm_object_drop_locked(struct vm_area_struct *vma) +{ + struct vm_object *obj = vma->vm_obj; + + if (!obj) { + pr_err("vm_object: vm_obj of the vma is NULL\n"); + return; + } + + /* + * We must enter this with VMA write-locked, which is unfortunately a giant lock. 
+ * Note that Linux 6.0 has per-VMA lock: + * https://lwn.net/Articles/906852/ + * https://lwn.net/Articles/906833/ + */ + free_gm_mappings(vma); + mmap_assert_write_locked(vma->vm_mm); + vma->vm_obj = NULL; + + if (atomic_dec_and_test(&obj->ref_count)) { + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); + } +} + +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) +{ + unsigned long index; + gm_mapping_t *mapping; + unsigned long moved_pages = 0; + + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); + + xa_lock(dst->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, mapping, linear_page_index(src, src->vm_end)) { + index = xas.xa_index - src->vm_pgoff + dst->vm_pgoff + + ((src->vm_start - dst->vm_start) >> PAGE_SHIFT); + __xa_store(dst->vm_obj->logical_page_table, index, mapping, GFP_KERNEL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &dst->vm_obj->nr_pages); + xa_unlock(dst->vm_obj->logical_page_table); +} + +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + /* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */ + unsigned long removed_pages = 0; + gm_mapping_t *mapping; + + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xas_lock(&xas); + if (vma->vm_start < start) { + xas_for_each(&xas, mapping, linear_page_index(vma, start)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + + if (vma->vm_end > end) { + xas_set(&xas, linear_page_index(vma, end)); + + xas_for_each(&xas, mapping, linear_page_index(vma, vma->vm_end)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + atomic_sub(removed_pages, &vma->vm_obj->nr_pages); + xas_unlock(&xas); +} + +/* + * Given a VA, the page_index is computed by + * page_index = linear_page_index(struct vm_area_struct *vma, unsigned long address) + */ +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) +{ + return lookup_gm_mapping(obj, linear_page_index(obj->vma, va)); +} +EXPORT_SYMBOL_GPL(vm_object_lookup); + +void vm_object_mapping_create(struct vm_object *obj, unsigned long start) +{ + pgoff_t index = linear_page_index(obj->vma, start); + gm_mapping_t *gm_mapping; + + gm_mapping = alloc_gm_mapping(); + if (!gm_mapping) + return; + + __xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL); +} + +void free_gm_mappings(struct vm_area_struct *vma) +{ + gm_mapping_t *gm_mapping; + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end)) { + release_gm_mapping(gm_mapping); + xas_store(&xas, NULL); + } + xa_unlock(vma->vm_obj->logical_page_table); +} -- Gitee From e1a0e71181c91723d9d71ed83ef59912b24c1da4 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 25 Jun 2025 10:49:25 +0800 Subject: [PATCH 08/34] openeuler_defconfig: Enable gmem related configs euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Enable gmem related configs. 
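For reference, the resulting fragment is identical in both defconfig files; the
neighbouring options are unchanged and shown only for context (taken from the
hunks below):

  CONFIG_PER_VMA_LOCK=y
  CONFIG_GMEM=y
  CONFIG_LOCK_MM_AND_FIND_VMA=y
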
Signed-off-by: Ma Wupeng --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 8a94aa5182fc..c88e3a8314be 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1236,6 +1236,7 @@ CONFIG_LRU_GEN=y CONFIG_ARM64_HAFT=y CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y # CONFIG_ASCEND_FEATURES is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 2d20a95852e0..2216209d62ba 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1206,6 +1206,7 @@ CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_STATS is not set CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y CONFIG_PAGE_CACHE_LIMIT=y -- Gitee From 4f623cccc42fbbdd6799f29b72cbe6c924e4c4a5 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 25 Jun 2025 10:57:25 +0800 Subject: [PATCH 09/34] mm: gmem: Display VM_PEER_SHARED as ps during smaps euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Display VM_PEER_SHARED as ps during smaps. Signed-off-by: Ma Wupeng --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a691365061c..84faaddafddf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -698,6 +698,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_GMEM + [ilog2(VM_PEER_SHARED)] = "ps", +#endif #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif -- Gitee From aab7e5b92496143085b24b327c325ffb55904939 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 10 Jul 2025 20:32:48 +0800 Subject: [PATCH 10/34] mm: gmem: use thp_vma_suitable_order instead of transhuge_vma_suitable euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- use thp_vma_suitable_order instead of transhuge_vma_suitable and add config isolation huge_mm.h Signed-off-by: nicunshu --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8db8ec9dcc37..86ad7f44aef4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1527,7 +1527,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - goto gm_mapping_release; + return ret; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); -- Gitee From 065f64c54625e71ae6f2f47fcec4b5e2578eea79 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Sat, 28 Jun 2025 10:17:48 +0800 Subject: [PATCH 11/34] mm: gmem: change conflict flag euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- change conflict mmap flag used in gmem Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- include/uapi/asm-generic/mman-common.h | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index cdcb59fbfe7f..d8857c71d4bb 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,7 +33,7 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ -#define MAP_PEER_SHARED 0x8000000 +#define MAP_PEER_SHARED 0x1000000 /* * Flags for mlock -- Gitee From a1161cfa6b559ca6081ebee0a629d0f75a3c18ba Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 10 Jul 2025 20:40:51 +0800 Subject: [PATCH 12/34] mm: gmem: use kabi_reserve to avoid kabi breakage euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- use kabi_reserve to avoid kabi breakage Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- include/linux/mm_types.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bcfbaa36bbbb..28b308a7d5c6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1060,7 +1060,11 @@ struct mm_struct { #else KABI_RESERVE(1) #endif +#ifdef CONFIG_GMEM + KABI_USE(2, gm_as_t *gm_as) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) -- Gitee From be6afa2ffb2210e598c8938bad1a54aaa2157d0c Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 10 Jul 2025 21:19:16 +0800 Subject: [PATCH 13/34] mm: gmem: remove deprecated function euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- remove deprecated function pmd_none_or_trans_huge_or_clear_bad. Use pmd_none_or_clear_bad and pmd_trans_huge instead. remove deprecated page size function and avoid kabi problem. 
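In zap_pmd_range() the deprecated helper is open-coded with the two
non-deprecated checks; a minimal sketch of the resulting pattern (locking and
the surrounding loop are exactly as in the hunk below):

  /* was: if (pmd_none_or_trans_huge_or_clear_bad(pmd)) */
  if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) {
          if (vma_is_peer_shared(vma))
                  zap_logic_pmd_range(vma, addr, next);
  }
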
Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- include/linux/gmem.h | 2 +- include/linux/mm.h | 7 ------- include/linux/mm_types.h | 9 +++++---- include/linux/vm_object.h | 4 ++-- mm/gmem.c | 18 +++++++++--------- mm/huge_memory.c | 1 - mm/memory.c | 31 ++++++++++++++++++------------- mm/mmap.c | 15 ++++++++------- mm/vm_object.c | 16 ++++++++-------- 9 files changed, 51 insertions(+), 52 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 3216b55d659d..fefe17d6f50d 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -284,7 +284,7 @@ extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long beg unsigned long end); enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior); -vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); /* GMEM address space KPI */ extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b13113a0925..11d6a86b3aab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -582,13 +582,6 @@ struct vm_fault { KABI_RESERVE(3) }; -/* page entry size for vm->huge_fault() */ -enum page_entry_size { - PE_SIZE_PTE = 0, - PE_SIZE_PMD, - PE_SIZE_PUD, -}; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 28b308a7d5c6..e9cd4439e08d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -773,13 +773,14 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -#ifdef CONFIG_GMEM - struct vm_object *vm_obj; -#endif #ifdef CONFIG_SHARE_POOL struct sp_area *spa; #endif +#ifdef CONFIG_GMEM + KABI_USE(1, struct vm_object *vm_obj) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -1061,7 +1062,7 @@ struct mm_struct { KABI_RESERVE(1) #endif #ifdef CONFIG_GMEM - KABI_USE(2, gm_as_t *gm_as) + KABI_USE(2, struct gm_as *gm_as) #else KABI_RESERVE(2) #endif diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index e5327665b6b7..f17d78a62416 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -14,7 +14,7 @@ void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end); -gm_mapping_t *alloc_gm_mapping(void); +struct gm_mapping *alloc_gm_mapping(void); struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); void vm_object_mapping_create(struct vm_object *obj, unsigned long start); void free_gm_mappings(struct vm_area_struct *vma); @@ -25,7 +25,7 @@ static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) {} -static inline gm_mapping_t *alloc_gm_mapping(void) { return NULL; } +static inline struct gm_mapping *alloc_gm_mapping(void) { return NULL; } static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) { return NULL; } static inline void vm_object_mapping_create(struct vm_object *obj, diff --git a/mm/gmem.c b/mm/gmem.c index ebf6a93bc33a..adf640790df5 100644 --- 
a/mm/gmem.c +++ b/mm/gmem.c @@ -61,7 +61,7 @@ static inline unsigned long pe_mask(unsigned int order) return HPAGE_PMD_MASK; if (order == PUD_ORDER) return HPAGE_PUD_MASK; - return ~0; + return 0; } static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; @@ -185,7 +185,7 @@ __setup("gmem=", setup_gmem); * The returned device pointer will be passed by new_dev. * A unique id will be assigned to the GMEM device, using Linux's xarray. */ -gm_ret_t gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, +enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, struct gm_dev **new_dev) { struct gm_dev *dev; @@ -215,7 +215,7 @@ gm_ret_t gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, EXPORT_SYMBOL_GPL(gm_dev_create); // Destroy a GMEM device and reclaim the resources. -gm_ret_t gm_dev_destroy(struct gm_dev *dev) +enum gm_ret gm_dev_destroy(struct gm_dev *dev) { // TODO: implement it xa_erase(&gm_dev_id_pool, dev->id); @@ -224,10 +224,10 @@ gm_ret_t gm_dev_destroy(struct gm_dev *dev) EXPORT_SYMBOL_GPL(gm_dev_destroy); /* Handle the page fault triggered by a given device */ -gm_ret_t gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, +enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior) { - gm_ret_t ret = GM_RET_SUCCESS; + enum gm_ret ret = GM_RET_SUCCESS; struct gm_mmu *mmu = dev->mmu; struct device *dma_dev = dev->dma_dev; struct vm_area_struct *vma; @@ -376,7 +376,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, * This implies dynamically creating * the struct page data structures. */ -gm_ret_t gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) +enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) { struct gm_mapping *mapping; unsigned long addr = PAGE_ALIGN(begin); @@ -463,7 +463,7 @@ struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order) EXPORT_SYMBOL_GPL(gm_mappings_alloc); /* GMEM Virtual Address Space API */ -gm_ret_t gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, +enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as) { struct gm_as *as; @@ -488,7 +488,7 @@ gm_ret_t gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc p } EXPORT_SYMBOL_GPL(gm_as_create); -gm_ret_t gm_as_destroy(struct gm_as *as) +enum gm_ret gm_as_destroy(struct gm_as *as) { struct gm_context *ctx, *tmp_ctx; @@ -501,7 +501,7 @@ gm_ret_t gm_as_destroy(struct gm_as *as) } EXPORT_SYMBOL_GPL(gm_as_destroy); -gm_ret_t gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, +enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, bool activate, struct gm_context **out_ctx) { struct gm_context *ctx; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86ad7f44aef4..59e2ca2478d3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1543,7 +1543,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); folio_put(folio); return ret; - } /* diff --git a/mm/memory.c b/mm/memory.c index 9aa4d8174724..70d92ba26cc2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1719,7 +1719,7 @@ static inline void zap_logic_pmd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { - gm_mapping_t *gm_mapping = NULL; + struct gm_mapping *gm_mapping = NULL; struct 
page *page = NULL; xa_lock(vma->vm_obj->logical_page_table); @@ -1769,8 +1769,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - else if (zap_huge_pmd(tlb, vma, pmd, addr)) - goto next; + else if (zap_huge_pmd(tlb, vma, pmd, addr)) { + addr = next; + continue; + } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && @@ -1783,7 +1785,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } - +#ifdef CONFIG_GMEM /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is @@ -1791,22 +1793,23 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. */ - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) { + if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) { if (vma_is_peer_shared(vma)) zap_logic_pmd_range(vma, addr, next); - goto next; } - - next = zap_pte_range(tlb, vma, pmd, addr, next, details); -next: - cond_resched(); - } while (pmd++, addr = next, addr != end); +#endif + if (pmd_none(*pmd)) { + addr = next; + continue; + } + addr = zap_pte_range(tlb, vma, pmd, addr, next, details); + if (addr != next) + pmd--; + } while (pmd++, cond_resched(), addr != end); return addr; } - - static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, @@ -1869,8 +1872,10 @@ void unmap_page_range(struct mmu_gather *tlb, do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { +#ifdef CONFIG_GMEM if (vma_is_peer_shared(vma)) zap_logic_pud_range(vma, addr, next); +#endif continue; } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); diff --git a/mm/mmap.c b/mm/mmap.c index a69e32ffb6b0..1504db3e74c2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1993,6 +1993,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +#ifdef CONFIG_GMEM unsigned long get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long align) @@ -2013,6 +2014,7 @@ get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long l return addr; } EXPORT_SYMBOL(get_unmapped_area_aligned); +#endif /** * find_vma_intersection() - Look up the first VMA which intersects the interval @@ -2627,9 +2629,9 @@ static void munmap_in_peer_devices(struct mm_struct *mm, { unsigned long addr = start; struct vm_object *obj = vma->vm_obj; - gm_ret_t ret; - gm_context_t *ctx, *tmp; - gm_mapping_t *gm_mapping; + enum gm_ret ret; + struct gm_context *ctx, *tmp; + struct gm_mapping *gm_mapping; struct gm_fault_t gmf = { .mm = mm, @@ -2928,9 +2930,8 @@ static int alloc_va_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long len, vm_flags_t vm_flags) { - gm_context_t *ctx, *tmp; - gm_prot_t prot = VM_NONE; - gm_ret_t ret; + struct gm_context *ctx, *tmp; + enum gm_ret ret; pr_debug("gmem: start mmap, as %p\n", mm->gm_as); if (!mm->gm_as) @@ -3167,7 +3168,7 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, expanded: #ifdef CONFIG_GMEM if (vma_is_peer_shared(vma)) { - gm_ret_t ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + enum gm_ret 
ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { retry_times++; diff --git a/mm/vm_object.c b/mm/vm_object.c index 8d3d6b121649..6ac4c172cfdd 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -44,9 +44,9 @@ static struct kmem_cache *vm_object_cachep; static struct kmem_cache *gm_mapping_cachep; /* gm_mapping will not be release dynamically */ -gm_mapping_t *alloc_gm_mapping(void) +struct gm_mapping *alloc_gm_mapping(void) { - gm_mapping_t *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + struct gm_mapping *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); if (!gm_mapping) return NULL; @@ -58,12 +58,12 @@ gm_mapping_t *alloc_gm_mapping(void) } EXPORT_SYMBOL(alloc_gm_mapping); -static inline void release_gm_mapping(gm_mapping_t *mapping) +static inline void release_gm_mapping(struct gm_mapping *mapping) { kmem_cache_free(gm_mapping_cachep, mapping); } -static inline gm_mapping_t *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) +static inline struct gm_mapping *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) { return xa_load(obj->logical_page_table, pindex); } @@ -146,7 +146,7 @@ void vm_object_drop_locked(struct vm_area_struct *vma) void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) { unsigned long index; - gm_mapping_t *mapping; + struct gm_mapping *mapping; unsigned long moved_pages = 0; XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); @@ -168,7 +168,7 @@ void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned { /* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */ unsigned long removed_pages = 0; - gm_mapping_t *mapping; + struct gm_mapping *mapping; XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); @@ -205,7 +205,7 @@ EXPORT_SYMBOL_GPL(vm_object_lookup); void vm_object_mapping_create(struct vm_object *obj, unsigned long start) { pgoff_t index = linear_page_index(obj->vma, start); - gm_mapping_t *gm_mapping; + struct gm_mapping *gm_mapping; gm_mapping = alloc_gm_mapping(); if (!gm_mapping) @@ -216,7 +216,7 @@ void vm_object_mapping_create(struct vm_object *obj, unsigned long start) void free_gm_mappings(struct vm_area_struct *vma) { - gm_mapping_t *gm_mapping; + struct gm_mapping *gm_mapping; XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); xa_lock(vma->vm_obj->logical_page_table); -- Gitee From ddec706019b44a41e4a11a25ef622ffd84234a55 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 17 Jul 2025 19:41:25 +0800 Subject: [PATCH 14/34] mm: gmem: fix code sytle problems euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- fix multiple code sytle problems Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- include/linux/gmem.h | 9 +++++---- include/linux/vm_object.h | 6 +++++- init/main.c | 8 -------- kernel/fork.c | 5 +---- mm/Kconfig | 2 +- mm/gmem.c | 15 +++++++++------ mm/huge_memory.c | 6 +++++- mm/memory.c | 4 ++-- mm/mempolicy.c | 5 +---- mm/mm_init.c | 6 ++++++ mm/mmap.c | 17 ++++++----------- mm/vm_object.c | 18 +++++++++++++++++- 12 files changed, 58 insertions(+), 43 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index fefe17d6f50d..b0cdb6d0ab9a 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -302,13 +302,14 @@ 
extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); -enum gmem_stat_item { - NR_PAGE_MIGRATING, +enum gmem_stats_item { + NR_PAGE_MIGRATING_H2D, + NR_PAGE_MIGRATING_D2H, NR_GMEM_STAT_ITEMS }; -extern void gmem_state_counter(enum gmem_stat_item item, int val); -extern void gmem_state_counter_show(void); +extern void gmem_stats_counter(enum gmem_stats_item item, int val); +extern void gmem_stats_counter_show(void); /* h-NUMA topology */ struct hnode { diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index f17d78a62416..ca82642eb2df 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -10,9 +10,10 @@ int __init vm_object_init(void); struct vm_object *vm_object_create(struct vm_area_struct *vma); void vm_object_drop_locked(struct vm_area_struct *vma); -void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared); void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void dup_peer_shared_vma(struct vm_area_struct *vma); struct gm_mapping *alloc_gm_mapping(void); struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); @@ -22,6 +23,9 @@ void free_gm_mappings(struct vm_area_struct *vma); static inline void __init vm_object_init(void) {} static inline struct vm_object *vm_object_create(struct vm_area_struct *vma) { return NULL; } static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void dup_vm_object(struct vm_area_struct *dst, + struct vm_area_struct *src, bool dst_peer_shared) {} +static inline void dup_peer_shared_vma(struct vm_area_struct *vma) {} static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) {} diff --git a/init/main.c b/init/main.c index 51395ee7a27d..f97f06547078 100644 --- a/init/main.c +++ b/init/main.c @@ -102,10 +102,6 @@ #include #include -#ifdef CONFIG_GMEM -#include -#endif - #include #include #include @@ -909,10 +905,6 @@ void start_kernel(void) smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); -#ifdef CONFIG_GMEM - hnuma_init(); -#endif - pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); diff --git a/kernel/fork.c b/kernel/fork.c index d984d93b3d39..cf44a02680d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,10 +532,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) dup_anon_vma_name(orig, new); #ifdef CONFIG_GMEM - if (vma_is_peer_shared(orig)) { - pr_debug("gmem: peer-shared vma should not be dup\n"); - new->vm_obj = vm_object_create(new); - } + dup_peer_shared_vma(new); #endif return new; diff --git a/mm/Kconfig b/mm/Kconfig index 56e0df21bc85..85a8979276c5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1495,7 +1495,7 @@ config GMEM select ARCH_USES_HIGH_VMA_FLAGS default y help - This provides a high-level interface that decouples MMUspecific functions. + This provides a high-level interface that decouples MMU-specific functions. Device drivers can thus attach themselves to a process’s address space and let the OS take charge of their memory management. 
This eliminates the need for device drivers to reinvent the wheel and allows them to diff --git a/mm/gmem.c b/mm/gmem.c index adf640790df5..c484c2c40101 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -66,7 +66,7 @@ static inline unsigned long pe_mask(unsigned int order) static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; -void gmem_state_counter(enum gmem_stat_item item, int val) +void gmem_stats_counter(enum gmem_stats_item item, int val) { if (!gmem_is_enabled()) return; @@ -95,14 +95,17 @@ static int gmem_stat_init(void) } #ifdef CONFIG_PROC_FS -static int gmemstat_show(struct seq_file *m, void *arg) +static int gmem_stats_show(struct seq_file *m, void *arg) { if (!gmem_is_enabled()) return 0; seq_printf( - m, "migrating : %lld\n", - percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING])); + m, "migrating H2D : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_H2D])); + seq_printf( + m, "migrating D2H : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_D2H])); return 0; } @@ -154,7 +157,7 @@ static int __init gmem_init(void) } #ifdef CONFIG_PROC_FS - proc_create_single("gmemstat", 0444, NULL, gmemstat_show); + proc_create_single("gmemstat", 0444, NULL, gmem_stats_show); #endif static_branch_enable(&gmem_status); @@ -301,7 +304,7 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev * update page to willneed and this will stop page evicting */ gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); - gmem_state_counter(NR_PAGE_MIGRATING, 1); + gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); ret = GM_RET_SUCCESS; } else { pr_err("gmem: peer map failed\n"); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 59e2ca2478d3..05a3c7f800ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1543,6 +1543,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); folio_put(folio); return ret; + } /* @@ -1598,12 +1599,14 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; ret = vmf_anon_prepare(vmf); if (ret) return ret; - khugepaged_enter_vma(vma, vma->vm_flags); #ifdef CONFIG_GMEM @@ -1654,6 +1657,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return __do_huge_pmd_anonymous_page(vmf); } + static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, pgtable_t pgtable) diff --git a/mm/memory.c b/mm/memory.c index 70d92ba26cc2..ef556a62670e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1793,8 +1793,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. 
*/ - if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) { - if (vma_is_peer_shared(vma)) + if (vma_is_peer_shared(vma)) { + if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) zap_logic_pmd_range(vma, addr, next); } #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ee93da660cd..db78ce14658a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1903,11 +1903,8 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { -#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) -#else - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) -#endif return false; /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 6677aaa5972d..1a3d3b6e52c9 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,6 +30,9 @@ #include "internal.h" #include "slab.h" #include "shuffle.h" +#ifdef CONFIG_GMEM +#include +#endif #include @@ -2797,6 +2800,9 @@ static void __init mem_init_print_info(void) */ void __init mm_core_init(void) { +#ifdef CONFIG_GMEM + hnuma_init(); +#endif /* Initializations relying on SMP setup */ build_all_zonelists(NULL); page_alloc_init_cpuhp(); diff --git a/mm/mmap.c b/mm/mmap.c index 1504db3e74c2..b23b7d92908c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -647,12 +647,10 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. */ if (src->anon_vma && !dst->anon_vma) { + int ret; #ifdef CONFIG_GMEM - if (vma_is_peer_shared(dst)) - dup_vm_object(dst, src); + dup_vm_object(dst, src, true); #endif - int ret; - vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); @@ -1363,9 +1361,9 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon */ #ifdef CONFIG_GMEM if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { - len = round_up(len, SZ_2M); + len = round_up(len, PMD_SIZE); addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, - SZ_2M); + PMD_SIZE); } else { addr = get_unmapped_area(file, addr, len, pgoff, flags); } @@ -2554,12 +2552,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; - -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) - dup_vm_object(new, vma); +#ifdef COFNIG_GMEM + dup_vm_object(new, vma, false); #endif - if (new->vm_file) get_file(new->vm_file); diff --git a/mm/vm_object.c b/mm/vm_object.c index 6ac4c172cfdd..25af359def56 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -143,12 +143,20 @@ void vm_object_drop_locked(struct vm_area_struct *vma) } } -void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared) { unsigned long index; struct gm_mapping *mapping; unsigned long moved_pages = 0; + if (dst_peer_shared) { + if (!vma_is_peer_shared(dst)) + return; + } else { + if (!vma_is_peer_shared(src)) + return; + } + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); xa_lock(dst->vm_obj->logical_page_table); @@ -164,6 +172,14 @@ void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) xa_unlock(dst->vm_obj->logical_page_table); } +void dup_peer_shared_vma(struct vm_area_struct *vma) +{ + if (vma_is_peer_shared(vma)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + vma->vm_obj = vm_object_create(vma); + } +} + void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) { /* remove logical mapping in [vma->vm_start, start) and 
[end, vm->vm_end) */ -- Gitee From 3486a2f3c6424261b1cba7dca0eb56c5795f4397 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 17 Jul 2025 21:59:35 +0800 Subject: [PATCH 15/34] mm: gmem: introduce hmemcpy and add multiple protection euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- add multiple param check for protection Signed-off-by: nicunshu --- include/linux/gmem.h | 15 +- include/linux/gmem_as.h | 36 ---- include/linux/mm.h | 2 + include/linux/mm_types.h | 35 +++- include/linux/printk.h | 6 + kernel/fork.c | 8 + mm/gmem.c | 363 ++++++++++++++++++++++++++++++++++----- mm/huge_memory.c | 1 - mm/memory.c | 10 +- mm/mmap.c | 148 +++++++++++----- 10 files changed, 495 insertions(+), 129 deletions(-) delete mode 100644 include/linux/gmem_as.h diff --git a/include/linux/gmem.h b/include/linux/gmem.h index b0cdb6d0ab9a..d37e79a7052d 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -121,10 +121,8 @@ struct gm_mmu { unsigned long cookie; /* Synchronize VMA in a peer OS to interact with the host OS */ - enum gm_ret (*peer_va_alloc_fixed)(struct mm_struct *mm, unsigned long va, - unsigned long size, unsigned long prot); - enum gm_ret (*peer_va_free)(struct mm_struct *mm, unsigned long va, - unsigned long size); + enum gm_ret (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); + enum gm_ret (*peer_va_free)(struct gm_fault_t *gmf); /* Create physical mappings on peer host. * If copy is set, copy data [dma_addr, dma_addr + size] to peer host @@ -152,6 +150,9 @@ struct gm_mmu { /* Invalidation functions of the MMU TLB */ enum gm_ret (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); enum gm_ret (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); + + // copy one area of memory from device to host or from host to device + enum gm_ret (*peer_hmemcpy)(struct gm_memcpy_t *gmc); }; /** @@ -301,6 +302,7 @@ extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned struct gm_region **new_region); extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); +extern int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size); enum gmem_stats_item { NR_PAGE_MIGRATING_H2D, @@ -339,6 +341,11 @@ static inline struct hnode *get_hnode(unsigned int hnid) return hnodes[hnid]; } +static inline int get_hnuma_id(struct gm_dev *gm_dev) +{ + return first_node(gm_dev->registered_hnodes); +} + void __init hnuma_init(void); unsigned int alloc_hnode_id(void); void free_hnode_id(unsigned int nid); diff --git a/include/linux/gmem_as.h b/include/linux/gmem_as.h deleted file mode 100644 index d691de1162eb..000000000000 --- a/include/linux/gmem_as.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _GMEM_AS_H -#define _GMEM_AS_H - -#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ - -/** - * enum gm_as_alloc - defines different allocation policy for virtual addresses. - * - * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. - * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. - * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. - * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, - * beginning where the previous search ended. - */ -enum gm_as_alloc { - GM_AS_ALLOC_DEFAULT = 0, - GM_AS_ALLOC_FIRSTFIT, - GM_AS_ALLOC_BESTFIT, - GM_AS_ALLOC_NEXTFIT, -}; - -/* Defines an address space. 
*/ -struct gm_as { - spinlock_t rbtree_lock; /* spinlock of struct gm_as */ - struct rb_root rbroot; /*root of gm_region_t */ - enum gm_as_alloc policy; - unsigned long start_va; - unsigned long end_va; - /* defines the VA unit size if an object cache is applied */ - unsigned long cache_quantum; - /* tracks device contexts attached to this va space, using gm_as_link */ - struct list_head gm_ctx_list; -}; - -#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 11d6a86b3aab..30222ae6daa5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3410,9 +3410,11 @@ unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#ifdef CONFIG_GMEM extern unsigned long get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long align); +#endif extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e9cd4439e08d..f012f7c7c4d4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,10 +20,6 @@ #include #include -#ifdef CONFIG_GMEM -#include -#endif - #include #include @@ -651,6 +647,37 @@ struct vm_object { */ atomic_t ref_count; }; + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. */ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of struct gm_as */ + struct rb_root rbroot; /*root of gm_region_t */ + enum gm_as_alloc policy; + unsigned long start_va; + unsigned long end_va; + /* defines the VA unit size if an object cache is applied */ + unsigned long cache_quantum; + /* tracks device contexts attached to this va space, using gm_as_link */ + struct list_head gm_ctx_list; +}; #endif struct anon_vma_name { diff --git a/include/linux/printk.h b/include/linux/printk.h index e4878bb58f66..c4fc04998932 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -761,3 +761,9 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) #endif + +#ifdef CONFIG_GMEM +#define gmem_err(fmt, ...) 
\ + ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) + +#endif diff --git a/kernel/fork.c b/kernel/fork.c index cf44a02680d6..bce23359c908 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -119,6 +119,10 @@ #include #endif +#ifdef CONFIG_GMEM +#include +#endif + #include #define CREATE_TRACE_POINTS @@ -560,6 +564,10 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) void vm_area_free(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + vm_object_drop_locked(vma); +#endif #ifdef CONFIG_PER_VMA_LOCK call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); #else diff --git a/mm/gmem.c b/mm/gmem.c index c484c2c40101..ce591b9ed8ca 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -87,7 +87,7 @@ static int gmem_stat_init(void) for (i--; i >= 0; i--) percpu_counter_destroy(&g_gmem_stats[i]); - break; /* break the initialization process */ + break; /* break the initialization process */ } } @@ -112,6 +112,7 @@ static int gmem_stats_show(struct seq_file *m, void *arg) #endif /* CONFIG_PROC_FS */ static struct workqueue_struct *prefetch_wq; +static struct workqueue_struct *hmemcpy_wq; #define GM_WORK_CONCURRENCY 4 @@ -140,20 +141,27 @@ static int __init gmem_init(void) err = vm_object_init(); if (err) - goto free_ctx; + goto free_region; err = gmem_stat_init(); if (err) - goto free_ctx; + goto free_region; prefetch_wq = alloc_workqueue("prefetch", - __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | - WQ_CPU_INTENSIVE, - GM_WORK_CONCURRENCY); + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); if (!prefetch_wq) { - pr_info("fail to alloc workqueue prefetch_wq\n"); + gmem_err("fail to alloc workqueue prefetch_wq\n"); err = -EFAULT; - goto free_ctx; + goto free_region; + } + + hmemcpy_wq = alloc_workqueue("hmemcpy", __WQ_LEGACY | WQ_UNBOUND + | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); + if (!hmemcpy_wq) { + gmem_err("fail to alloc workqueue hmemcpy_wq\n"); + err = -EFAULT; + destroy_workqueue(prefetch_wq); + goto free_region; } #ifdef CONFIG_PROC_FS @@ -164,6 +172,8 @@ static int __init gmem_init(void) return 0; +free_region: + kmem_cache_destroy(gm_region_cache); free_ctx: kmem_cache_destroy(gm_ctx_cache); free_dev: @@ -237,25 +247,28 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev struct vm_object *obj; struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; - struct gm_fault_t gmf = { .mm = mm, - .va = addr, - .dev = dev, - .size = size, - .copy = false, - .behavior = behavior }; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior + }; struct page *page = NULL; mmap_read_lock(mm); vma = find_vma(mm, addr); - if (!vma) { + if (!vma || vma->vm_start > addr) { + gmem_err("%s failed to find vma by addr %p\n", __func__, (void *)addr); pr_info("gmem: %s no vma\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto mmap_unlock; } obj = vma->vm_obj; if (!obj) { - pr_info("gmem: %s no vm_obj\n", __func__); + gmem_err("%s no vm_obj\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto mmap_unlock; } @@ -268,6 +281,11 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev } xa_unlock(obj->logical_page_table); + if (unlikely(!gm_mapping)) { + gmem_err("OOM when creating vm_obj!\n"); + ret = GM_RET_NOMEM; + goto mmap_unlock; + } mutex_lock(&gm_mapping->lock); if (gm_mapping_nomap(gm_mapping)) { goto peer_map; @@ -281,16 +299,17 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev } else if 
(gm_mapping_cpu(gm_mapping)) { page = gm_mapping->page; if (!page) { - pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + gmem_err("host gm_mapping page is NULL. Set nomap\n"); gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); goto unlock; } get_page(page); + /* zap_page_range_single can be used in Linux 6.4 and later versions. */ zap_page_range_single(vma, addr, size, NULL); gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) - pr_info("gmem: dma map failed\n"); + gmem_err("dma map failed\n"); gmf.copy = true; } @@ -307,7 +326,7 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); ret = GM_RET_SUCCESS; } else { - pr_err("gmem: peer map failed\n"); + gmem_err("peer map failed\n"); if (page) { gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); put_page(page); @@ -351,7 +370,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, gm_mapping = vm_object_lookup(obj, addr); if (!gm_mapping) { - pr_err("gmem: host fault gm_mapping should not be NULL\n"); + gmem_err("host fault gm_mapping should not be NULL\n"); return VM_FAULT_SIGBUS; } @@ -361,11 +380,11 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) { - pr_err("gmem: host fault dma mapping error\n"); + gmem_err("host fault dma mapping error\n"); return VM_FAULT_SIGBUS; } if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { - pr_err("gmem: peer unmap failed\n"); + gmem_err("peer unmap failed\n"); dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); return VM_FAULT_SIGBUS; } @@ -467,7 +486,7 @@ EXPORT_SYMBOL_GPL(gm_mappings_alloc); /* GMEM Virtual Address Space API */ enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, - unsigned long cache_quantum, struct gm_as **new_as) + unsigned long cache_quantum, struct gm_as **new_as) { struct gm_as *as; @@ -505,7 +524,7 @@ enum gm_ret gm_as_destroy(struct gm_as *as) EXPORT_SYMBOL_GPL(gm_as_destroy); enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, - bool activate, struct gm_context **out_ctx) + bool activate, struct gm_context **out_ctx) { struct gm_context *ctx; int nid; @@ -617,11 +636,11 @@ static void prefetch_work_cb(struct work_struct *work) /* MADV_WILLNEED: dev will soon access this addr. 
*/ ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); if (ret == GM_RET_PAGE_EXIST) { - pr_info("%s: device has done page fault, ignore prefetch\n", + gmem_err("%s: device has done page fault, ignore prefetch\n", __func__); } else if (ret != GM_RET_SUCCESS) { *d->res = -EFAULT; - pr_err("%s: call dev fault error %d\n", __func__, ret); + gmem_err("%s: call dev fault error %d\n", __func__, ret); } } while (addr += page_size, addr != end); @@ -638,8 +657,10 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s unsigned long old_start; /* overflow */ - if (check_add_overflow(addr, size, &end)) + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); return -EINVAL; + } old_start = end; @@ -648,8 +669,10 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s start = round_down(addr, page_size); size = end - start; - if (!end && old_start) + if (!end && old_start) { + gmem_err("end addr align up 2M causes invalid addr %p\n", (void *)end); return -EINVAL; + } if (size == 0) return 0; @@ -658,6 +681,12 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s vma = find_vma(current->mm, start); if (!vma || start < vma->vm_start || end > vma->vm_end) { mmap_read_unlock(current->mm); + gmem_err("failed to find vma by invalid start %p or size 0x%zx.\n", + (void *)start, size); + return GM_RET_FAILURE_UNKNOWN; + } else if (!vma_is_peer_shared(vma)) { + mmap_read_unlock(current->mm); + gmem_err("%s the vma does not use VM_PEER_SHARED\n", __func__); return GM_RET_FAILURE_UNKNOWN; } mmap_read_unlock(current->mm); @@ -705,7 +734,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, obj = vma->vm_obj; if (!obj) { - pr_err("gmem: peer-shared vma should have vm_object\n"); + gmem_err("peer-shared vma should have vm_object\n"); return -EINVAL; } @@ -728,8 +757,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, gmf.dev = gm_mapping->dev; ret = gm_mapping->dev->mmu->peer_unmap(&gmf); if (ret) { - pr_err("gmem: peer_unmap failed. ret %d\n", - ret); + gmem_err("peer_unmap failed. ret %d\n", ret); mutex_unlock(&gm_mapping->lock); continue; } @@ -750,31 +778,38 @@ static int hmadvise_do_eagerfree(unsigned long addr, size_t size) unsigned long old_start; /* overflow */ - if (check_add_overflow(addr, size, &end)) + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); return -EINVAL; + } old_start = addr; /* Align addr by rounding inward to avoid excessive page release. 
*/ end = round_down(end, page_size); start = round_up(addr, page_size); - if (start >= end) + if (start >= end) { + pr_debug("gmem:start align up 2M >= end align down 2M.\n"); return ret; + } /* Check to see whether len was rounded up from small -ve to zero */ - if (old_start && !start) + if (old_start && !start) { + gmem_err("start addr align up 2M causes invalid addr %p", (void *)start); return -EINVAL; + } mmap_read_lock(current->mm); do { vma = find_vma_intersection(current->mm, start, end); if (!vma) { - pr_info("gmem: there is no valid vma\n"); + gmem_err("gmem: there is no valid vma\n"); break; } if (!vma_is_peer_shared(vma)) { - pr_debug("gmem: not peer-shared vma, skip dontneed\n"); + pr_debug("gmem:not peer-shared vma %p-%p, skip dontneed\n", + (void *)vma->vm_start, (void *)vma->vm_end); start = vma->vm_end; continue; } @@ -806,21 +841,25 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) if (check_hmadvise_behavior(behavior)) { goto no_hnid; } else { - pr_err("hmadvise: behavior %d need hnid or is invalid\n", - behavior); + gmem_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); return error; } } - if (hnid < 0) + if (hnid < 0) { + gmem_err("hmadvise: invalid hnid %d < 0\n", hnid); return error; + } - if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + gmem_err("hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return error; + } node = get_hnode(hnid); if (!node) { - pr_err("hmadvise: hnode id %d is invalid\n", hnid); + gmem_err("hmadvise: hnode id %d is invalid\n", hnid); return error; } @@ -831,9 +870,249 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) case MADV_DONTNEED: return hmadvise_do_eagerfree(start, len_in); default: - pr_err("hmadvise: unsupported behavior %d\n", behavior); + gmem_err("hmadvise: unsupported behavior %d\n", behavior); } return error; } EXPORT_SYMBOL_GPL(hmadvise_inner); + +struct hmemcpy_data { + struct mm_struct *mm; + int hnid; + unsigned long src; + unsigned long dest; + size_t size; + struct work_struct work; +}; + +static bool hnid_match_dest(int hnid, struct gm_mapping *dest) +{ + return (hnid < 0) ? 
gm_mapping_cpu(dest) : gm_mapping_device(dest); +} + +static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, + unsigned long src, size_t size) +{ + enum gm_ret ret; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma_dest, *vma_src; + struct gm_mapping *gm_mmaping_dest, *gm_mmaping_src; + struct gm_dev *dev = NULL; + struct hnode *node; + struct gm_memcpy_t gmc = {0}; + + if (size == 0) + return; + + vma_dest = find_vma(mm, dest); + vma_src = find_vma(mm, src); + + gm_mmaping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mmaping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + + if (!gm_mmaping_src) { + gmem_err("%s: gm_mmaping_src is NULL, src=%p; size=0x%zx\n", + __func__, (void *)src, size); + return; + } + + if (hnid != -1) { + node = get_hnode(hnid); + if (node) + dev = node->dev; + if (!dev) { + gmem_err("%s: hnode's dev is NULL\n", __func__); + return; + } + } + + // Trigger dest page fault on host or device + if (!gm_mmaping_dest || gm_mapping_nomap(gm_mmaping_dest) + || !hnid_match_dest(hnid, gm_mmaping_dest)) { + if (hnid == -1) { + mmap_read_lock(mm); + handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | + FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); + mmap_read_unlock(mm); + } else { + ret = gm_dev_fault(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); + if (ret != GM_RET_SUCCESS) { + gmem_err("%s: gm_dev_fault failed\n", __func__); + return; + } + } + } + if (!gm_mmaping_dest) + gm_mmaping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); + + if (gm_mmaping_dest && gm_mmaping_dest != gm_mmaping_src) + mutex_lock(&gm_mmaping_dest->lock); + mutex_lock(&gm_mmaping_src->lock); + // Use memcpy when there is no device address, otherwise use peer_memcpy + if (hnid == -1) { + if (gm_mapping_cpu(gm_mmaping_src)) { // host to host + memcpy(page_to_virt(gm_mmaping_dest->page) + (dest & (page_size - 1)), + page_to_virt(gm_mmaping_src->page) + (src & (page_size - 1)), + size); + goto unlock; + } else { // device to host + dev = gm_mmaping_src->dev; + gmc.dma_addr = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mmaping_dest->page) + (dest & (page_size - 1))); + gmc.src = src; + } + } else { + if (gm_mapping_cpu(gm_mmaping_src)) { // host to device + gmc.dest = dest; + gmc.dma_addr = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mmaping_src->page) + (src & (page_size - 1))); + } else { // device to device + if (dev == gm_mmaping_src->dev) { // same device + gmc.dest = dest; + gmc.src = src; + } else { // TODO: different devices + gmem_err("%s: device to device is unimplemented\n", __func__); + goto unlock; + } + } + } + gmc.mm = mm; + gmc.dev = dev; + gmc.size = size; + dev->mmu->peer_hmemcpy(&gmc); + +unlock: + mutex_unlock(&gm_mmaping_src->lock); + if (gm_mmaping_dest && gm_mmaping_dest != gm_mmaping_src) + mutex_unlock(&gm_mmaping_dest->lock); +} + +/* + * Each page needs to be copied in three parts when the address is not aligned. 
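+ * Here a is the smaller of the two distances from src/dest to their next 2M
+ * boundary and b is the larger, so splitting each 2M chunk at offsets a and b
+ * keeps every sub-copy inside a single huge page on both the source and the
+ * destination side (editorial note derived from the a/b computation below).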
+ * | <--a-->| | + * | -------|--------- | + * | / /| / / | + * | / / | / / | + * | / / |/ / | + * | ----------|------ | + * | <----b--->| | + * |<----page x---->|<----page y---->| + */ + +static void hmemcpy_work_cb(struct work_struct *work) +{ + size_t i; + int remain, a, b, page_size = HPAGE_SIZE; + struct hmemcpy_data *d = container_of(work, struct hmemcpy_data, work); + unsigned long src = d->src, dest = d->dest; + + a = min(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1))); + b = max(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1))); + + for (i = page_size; i < d->size; i += page_size) { + if (a != 0) + do_hmemcpy(d->mm, d->hnid, dest, src, a); + if (b - a != 0) + do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a); + if (page_size - b != 0) + do_hmemcpy(d->mm, d->hnid, dest + b, src + b, page_size - b); + src += page_size; + dest += page_size; + } + + remain = d->size + page_size - i; + if (remain == 0) + goto out; + + if (remain < a) { + do_hmemcpy(d->mm, d->hnid, dest, src, remain); + } else if (remain < b) { + do_hmemcpy(d->mm, d->hnid, dest, src, a); + do_hmemcpy(d->mm, d->hnid, dest + a, src + a, remain - a); + } else { + do_hmemcpy(d->mm, d->hnid, dest, src, a); + do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a); + do_hmemcpy(d->mm, d->hnid, dest + b, src + b, remain - b); + } + +out: + kfree(d); +} + +int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) +{ + int page_size = HPAGE_SIZE; + unsigned long per_size, copied = 0; + struct hmemcpy_data *data; + struct vm_area_struct *vma_dest, *vma_src; + + if (hnid < 0) { + if (hnid != -1) { + gmem_err("hmadvise: invalid hnid %d < 0\n", hnid); + return -EINVAL; + } + } else if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + gmem_err( + "hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", + hnid); + return -EINVAL; + } + + vma_dest = find_vma(current->mm, dest); + vma_src = find_vma(current->mm, src); + + if (!vma_src || vma_src->vm_start > src || !vma_is_peer_shared(vma_src) + || vma_src->vm_end < (src + size)) { + gmem_err("failed to find peer_shared vma by invalid src:%p or size :0x%zx", + (void *)src, size); + return -EINVAL; + } + + if (!vma_dest || vma_dest->vm_start > dest || !vma_is_peer_shared(vma_dest) + || vma_dest->vm_end < (dest + size)) { + gmem_err("failed to find peer_shared vma by invalid dest:%p or size :0x%zx", + (void *)dest, size); + return -EINVAL; + } + + if (!(vma_dest->vm_flags & VM_WRITE)) { + gmem_err("dest is not writable.\n"); + return -EINVAL; + } + + if (!(vma_dest->vm_flags & VM_WRITE)) { + gmem_err("dest is not writable.\n"); + return -EINVAL; + } + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (copied < size) { + data = kzalloc(sizeof(struct hmemcpy_data), GFP_KERNEL); + if (data == NULL) { + flush_workqueue(hmemcpy_wq); + return GM_RET_NOMEM; + } + INIT_WORK(&data->work, hmemcpy_work_cb); + data->mm = current->mm; + data->hnid = hnid; + data->src = src; + data->dest = dest; + if (per_size == 0) { + data->size = size; + } else { + // Process (1.x * per_size) for the last time + data->size = (size - copied < 2 * per_size) ? 
(size - copied) : per_size; + } + + queue_work(hmemcpy_wq, &data->work); + src += data->size; + dest += data->size; + copied += data->size; + } + + flush_workqueue(hmemcpy_wq); + return 0; +} +EXPORT_SYMBOL_GPL(hmemcpy); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 05a3c7f800ee..086f5b692973 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1657,7 +1657,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return __do_huge_pmd_anonymous_page(vmf); } - static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, pgtable_t pgtable) diff --git a/mm/memory.c b/mm/memory.c index ef556a62670e..568f3e295fdb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1829,8 +1829,11 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, goto next; /* fall through */ } - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); @@ -1850,8 +1853,11 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none_or_clear_bad(p4d)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); diff --git a/mm/mmap.c b/mm/mmap.c index b23b7d92908c..242110526d16 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1523,7 +1523,26 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_SHARED) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_SHARE cannot be used together.\n"); + goto out_fput; + } + if (gmem_is_enabled() && (flags & MAP_HUGETLB) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_HUGETLB cannot be used together.\n"); + goto out_fput; + } +#endif if (!(flags & MAP_ANONYMOUS)) { +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED cannot map file page.\n"); + goto out_fput; + } +#endif audit_mmap_fd(fd, flags); file = fget(fd); if (!file) @@ -2619,10 +2638,10 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, } #ifdef CONFIG_GMEM -static void munmap_in_peer_devices(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long start, unsigned long end) +static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) { - unsigned long addr = start; + unsigned long start, end, addr; struct vm_object *obj = vma->vm_obj; enum gm_ret ret; struct gm_context *ctx, *tmp; @@ -2633,9 +2652,20 @@ static void munmap_in_peer_devices(struct mm_struct *mm, .copy = false, }; + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + if (!obj) return; + if (!mm->gm_as) + return; + do { xa_lock(obj->logical_page_table); gm_mapping = vm_object_lookup(obj, addr); @@ -2656,28 +2686,40 @@ static void munmap_in_peer_devices(struct mm_struct *mm, gmf.dev = gm_mapping->dev; ret = gm_mapping->dev->mmu->peer_unmap(&gmf); if (ret != 
GM_RET_SUCCESS) { - pr_err("%s: call dev peer_unmap error %d\n", __func__, ret); + gmem_err("%s: call dev peer_unmap error %d\n", __func__, ret); mutex_unlock(&gm_mapping->lock); continue; } mutex_unlock(&gm_mapping->lock); } while (addr += HPAGE_SIZE, addr != end); - if (!mm->gm_as) - return; - list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { if (!gm_dev_is_peer(ctx->dev)) continue; if (!ctx->dev->mmu->peer_va_free) continue; - ret = ctx->dev->mmu->peer_va_free(mm, start, end - start); + gmf.va = start; + gmf.size = end - start; + gmf.dev = ctx->dev; + + ret = ctx->dev->mmu->peer_va_free(&gmf); if (ret != GM_RET_SUCCESS) pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", start, end - start, ret); } } + +static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + if (vma_is_peer_shared(vma)) + munmap_single_vma_in_peer_devices(mm, vma, start, end); + } +} #endif /* @@ -2754,6 +2796,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, locked_vm += vma_pages(next); count++; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_single_vma_in_peer_devices(mm, vma, start, end); +#endif if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2812,7 +2858,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, next = vma_next(vmi); #ifdef CONFIG_GMEM if (gmem_is_enabled()) - munmap_in_peer_devices(mm, vma, start, end); + munmap_in_peer_devices(mm, start, end); #endif if (next) vma_iter_prev_range(vmi); @@ -2872,17 +2918,23 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; +#ifdef CONFIG_GMEM + struct vm_area_struct *vma_end; if (gmem_is_enabled()) { vma = find_vma_intersection(mm, start, start + len); + vma_end = find_vma(mm, start + len); if (!vma) return 0; if (vma_is_peer_shared(vma)) { if (!IS_ALIGNED(start, PMD_SIZE)) return -EINVAL; - - len = round_up(len, SZ_2M); } + /* Prevents partial release of the peer_share page. */ + if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) + len = round_up(len, SZ_2M); } +#endif + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2921,16 +2973,19 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } #ifdef CONFIG_GMEM -static int alloc_va_in_peer_devices(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, unsigned long len, - vm_flags_t vm_flags) +static int alloc_va_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, vm_flags_t vm_flags) { struct gm_context *ctx, *tmp; enum gm_ret ret; - pr_debug("gmem: start mmap, as %p\n", mm->gm_as); - if (!mm->gm_as) - return -ENODEV; + if (!mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, PAGE_SIZE, + &mm->gm_as); + if (ret) + return ret; + } + pr_debug("gmem: start mmap, as %p\n", (void *)mm->gm_as); if (!vma->vm_obj) vma->vm_obj = vm_object_create(vma); @@ -2940,7 +2995,16 @@ static int alloc_va_in_peer_devices(struct mm_struct *mm, * TODO: consider the concurrency problem of device * attaching/detaching from the gm_as. 
*/ + ret = -ENODEV; list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + struct gm_fault_t gmf = { + .mm = mm, + .dev = ctx->dev, + .va = addr, + .size = len, + .prot = vm_flags, + }; + if (!gm_dev_is_peer(ctx->dev)) continue; @@ -2948,16 +3012,15 @@ static int alloc_va_in_peer_devices(struct mm_struct *mm, pr_debug("gmem: mmu ops has no alloc_vma\n"); continue; } - pr_debug("gmem: call vma_alloc\n"); - ret = ctx->dev->mmu->peer_va_alloc_fixed(mm, addr, len, vm_flags); + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); if (ret != GM_RET_SUCCESS) { pr_debug("gmem: alloc_vma ret %d\n", ret); return ret; } } - return GM_RET_SUCCESS; + return ret; } #endif @@ -2994,7 +3057,8 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) { #ifdef CONFIG_GMEM - gmem_release_vma(mm, &reserve_list); + if (gmem_is_enabled()) + gmem_release_vma(mm, &reserve_list); #endif return -ENOMEM; } @@ -3003,7 +3067,8 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, /* Unmap any existing mapping in the area */ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { #ifdef CONFIG_GMEM - gmem_release_vma(mm, &reserve_list); + if (gmem_is_enabled()) + gmem_release_vma(mm, &reserve_list); #endif return -ENOMEM; } @@ -3015,7 +3080,8 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, charged = len >> PAGE_SHIFT; if (security_vm_enough_memory_mm(mm, charged)) { #ifdef CONFIG_GMEM - gmem_release_vma(mm, &reserve_list); + if (gmem_is_enabled()) + gmem_release_vma(mm, &reserve_list); #endif return -ENOMEM; } @@ -3082,6 +3148,24 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + enum gm_ret ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gmem_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gmem_release_vma(mm, &reserve_list); + } +#endif + if (vma_iter_prealloc(&vmi, vma)) { error = -ENOMEM; goto free_vma; @@ -3161,23 +3245,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, file = vma->vm_file; ksm_add_vma(vma); expanded: -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) { - enum gm_ret ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); - - if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { - retry_times++; - addr = get_unmapped_area(file, addr, len, pgoff, 0); - gmem_reserve_vma(vma, &reserve_list); - goto retry; - } else if (ret != GM_RET_SUCCESS) { - pr_debug("gmem: alloc_vma ret %d\n", ret); - error = -ENOMEM; - goto free_vma; - } - gmem_release_vma(mm, &reserve_list); - } -#endif perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -3222,6 +3289,7 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (charged) vm_unacct_memory(charged); #ifdef CONFIG_GMEM + if (gmem_is_enabled()) gmem_release_vma(mm, &reserve_list); #endif return error; -- Gitee From 1e993a29e4ad2afcc0a49f54f68b981672b7a7fa Mon Sep 17 00:00:00 2001 From: nicunshu Date: Sat, 26 Jul 2025 17:40:11 +0800 Subject: [PATCH 16/34] mm: gmem: optimize hmemcpy and other 
gmem functions euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- optimize multiple gmem related functions Signed-off-by: nicunshu --- include/linux/gmem.h | 16 +- include/linux/mman.h | 3 +- include/linux/printk.h | 6 - kernel/fork.c | 4 +- mm/gmem.c | 400 ++++++++++++++++++----------------------- mm/memory.c | 17 ++ mm/mempolicy.c | 6 +- mm/mmap.c | 205 +++++++-------------- mm/mprotect.c | 18 +- mm/util.c | 138 +++++++++++++- mm/vm_object.c | 2 +- 11 files changed, 420 insertions(+), 395 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index d37e79a7052d..7beebc67c398 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -278,20 +278,18 @@ static inline bool gm_mapping_pinned(struct gm_mapping *gm_mapping) /* GMEM Device KPI */ extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, struct gm_dev **new_dev); -extern enum gm_ret gm_dev_destroy(struct gm_dev *dev); extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end); -enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, - int behavior); +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, + struct gm_dev *dev, int behavior); vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); /* GMEM address space KPI */ extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end); extern void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid); -extern struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order); extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as); extern enum gm_ret gm_as_destroy(struct gm_as *as); @@ -322,8 +320,6 @@ struct hnode { struct xarray pages; }; -extern struct hnode *hnodes[]; - static inline bool is_hnode(int node) { return (node < MAX_NUMNODES) && !node_isset(node, node_possible_map) && @@ -336,11 +332,6 @@ static inline bool is_hnode_allowed(int node) node_isset(node, current->mems_allowed); } -static inline struct hnode *get_hnode(unsigned int hnid) -{ - return hnodes[hnid]; -} - static inline int get_hnuma_id(struct gm_dev *gm_dev) { return first_node(gm_dev->registered_hnodes); @@ -352,4 +343,7 @@ void free_hnode_id(unsigned int nid); void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); void hnode_deinit(unsigned int hnid, struct gm_dev *dev); +#define gmem_err(fmt, ...) 
\ + ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) + #endif /* _GMEM_H */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 8ddca62d6460..30ec68346f6b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -55,7 +55,8 @@ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ - | MAP_HUGE_1GB) + | MAP_HUGE_1GB \ + | MAP_PEER_SHARED) extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; diff --git a/include/linux/printk.h b/include/linux/printk.h index c4fc04998932..e4878bb58f66 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -761,9 +761,3 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) #endif - -#ifdef CONFIG_GMEM -#define gmem_err(fmt, ...) \ - ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) - -#endif diff --git a/kernel/fork.c b/kernel/fork.c index bce23359c908..f6c45be64ab9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1783,7 +1783,9 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, err = dup_mmap(mm, oldmm); if (err) goto free_pt; - +#ifdef CONFIG_GMEM + mm->gm_as = NULL; +#endif mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; diff --git a/mm/gmem.c b/mm/gmem.c index ce591b9ed8ca..039f4cfe28db 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -53,6 +53,9 @@ static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); static bool enable_gmem; +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + static inline unsigned long pe_mask(unsigned int order) { if (order == 0) @@ -77,15 +80,17 @@ void gmem_stats_counter(enum gmem_stats_item item, int val) percpu_counter_add(&g_gmem_stats[item], val); } -static int gmem_stat_init(void) +static int gmem_stats_init(void) { int i, rc; for (i = 0; i < NR_GMEM_STAT_ITEMS; i++) { rc = percpu_counter_init(&g_gmem_stats[i], 0, GFP_KERNEL); if (rc) { - for (i--; i >= 0; i--) - percpu_counter_destroy(&g_gmem_stats[i]); + int j; + + for (j = i-1; j >= 0; j--) + percpu_counter_destroy(&g_gmem_stats[j]); break; /* break the initialization process */ } @@ -112,7 +117,6 @@ static int gmem_stats_show(struct seq_file *m, void *arg) #endif /* CONFIG_PROC_FS */ static struct workqueue_struct *prefetch_wq; -static struct workqueue_struct *hmemcpy_wq; #define GM_WORK_CONCURRENCY 4 @@ -143,7 +147,7 @@ static int __init gmem_init(void) if (err) goto free_region; - err = gmem_stat_init(); + err = gmem_stats_init(); if (err) goto free_region; @@ -155,17 +159,8 @@ static int __init gmem_init(void) goto free_region; } - hmemcpy_wq = alloc_workqueue("hmemcpy", __WQ_LEGACY | WQ_UNBOUND - | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); - if (!hmemcpy_wq) { - gmem_err("fail to alloc workqueue hmemcpy_wq\n"); - err = -EFAULT; - destroy_workqueue(prefetch_wq); - goto free_region; - } - #ifdef CONFIG_PROC_FS - proc_create_single("gmemstat", 0444, NULL, gmem_stats_show); + proc_create_single("gmemstats", 0444, NULL, gmem_stats_show); #endif static_branch_enable(&gmem_status); @@ -227,18 +222,9 @@ enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, } EXPORT_SYMBOL_GPL(gm_dev_create); -// Destroy a GMEM device and reclaim the resources. 
-enum gm_ret gm_dev_destroy(struct gm_dev *dev) -{ - // TODO: implement it - xa_erase(&gm_dev_id_pool, dev->id); - return GM_RET_SUCCESS; -} -EXPORT_SYMBOL_GPL(gm_dev_destroy); - -/* Handle the page fault triggered by a given device */ -enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, - int behavior) +/* Handle the page fault triggered by a given device with mmap lock*/ +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior) { enum gm_ret ret = GM_RET_SUCCESS; struct gm_mmu *mmu = dev->mmu; @@ -257,20 +243,18 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev }; struct page *page = NULL; - mmap_read_lock(mm); - vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) { - gmem_err("%s failed to find vma by addr %p\n", __func__, (void *)addr); + gmem_err("%s failed to find vma\n", __func__); pr_info("gmem: %s no vma\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; - goto mmap_unlock; + goto out; } obj = vma->vm_obj; if (!obj) { gmem_err("%s no vm_obj\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; - goto mmap_unlock; + goto out; } xa_lock(obj->logical_page_table); @@ -284,7 +268,7 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev if (unlikely(!gm_mapping)) { gmem_err("OOM when creating vm_obj!\n"); ret = GM_RET_NOMEM; - goto mmap_unlock; + goto out; } mutex_lock(&gm_mapping->lock); if (gm_mapping_nomap(gm_mapping)) { @@ -344,11 +328,10 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev gm_mapping->dev = dev; unlock: mutex_unlock(&gm_mapping->lock); -mmap_unlock: - mmap_read_unlock(mm); +out: return ret; } -EXPORT_SYMBOL_GPL(gm_dev_fault); +EXPORT_SYMBOL_GPL(gm_dev_fault_locked); vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order) @@ -393,6 +376,24 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, return ret; } +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +static struct gm_dev *get_gm_dev(unsigned int nid) +{ + struct hnode *hnode; + struct gm_dev *dev = NULL; + + spin_lock(&hnode_lock); + hnode = get_hnode(nid); + if (hnode) + dev = hnode->dev; + spin_unlock(&hnode_lock); + return dev; +} + /* * Register the local physical memory of a gmem device. 
* This implies dynamically creating @@ -409,15 +410,16 @@ enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, uns if (!hnode) goto err; + mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); + if (!mapping) + goto free_hnode; + + spin_lock(&hnode_lock); nid = alloc_hnode_id(); if (nid == MAX_NUMNODES) - goto free_hnode; + goto unlock_hnode; hnode_init(hnode, nid, dev); - mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); - if (!mapping) - goto deinit_hnode; - for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { mapping[i].pfn = addr >> PAGE_SHIFT; mapping[i].flag = 0; @@ -436,11 +438,14 @@ enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, uns } xa_unlock(&hnode->pages); + spin_unlock(&hnode_lock); return GM_RET_SUCCESS; deinit_hnode: hnode_deinit(nid, dev); free_hnode_id(nid); +unlock_hnode: + spin_unlock(&hnode_lock); free_hnode: kfree(hnode); err: @@ -450,40 +455,31 @@ EXPORT_SYMBOL_GPL(gm_dev_register_physmem); void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) { - struct hnode *hnode = get_hnode(nid); - struct gm_mapping *mapping = xa_load(&hnode->pages, 0); + struct hnode *hnode = NULL; + struct gm_mapping *mapping = NULL; + + spin_lock(&hnode_lock); + + if (!node_isset(nid, dev->registered_hnodes)) + goto unlock; + + hnode = get_hnode(nid); + + if (!hnode) + goto unlock; + mapping = xa_load(&hnode->pages, 0); + + if (mapping) + kvfree(mapping); - kvfree(mapping); hnode_deinit(nid, dev); free_hnode_id(nid); kfree(hnode); +unlock: + spin_unlock(&hnode_lock); } EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); -struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order) -{ - struct gm_mapping *mapping; - struct hnode *node = get_hnode(nid); - XA_STATE(xas, &node->pages, 0); - - /* TODO: support order > 0 */ - if (order != 0) - return ERR_PTR(-EINVAL); - - xa_lock(&node->pages); - mapping = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0); - if (!mapping) { - xa_unlock(&node->pages); - return ERR_PTR(-ENOMEM); - } - - xas_clear_mark(&xas, XA_MARK_0); - xa_unlock(&node->pages); - - return mapping; -} -EXPORT_SYMBOL_GPL(gm_mappings_alloc); - /* GMEM Virtual Address Space API */ enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as) @@ -569,32 +565,30 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode } EXPORT_SYMBOL_GPL(gm_as_attach); -DEFINE_SPINLOCK(hnode_lock); -struct hnode *hnodes[MAX_NUMNODES]; - void __init hnuma_init(void) { unsigned int node; - + spin_lock(&hnode_lock); for_each_node(node) node_set(node, hnode_map); + spin_unlock(&hnode_lock); } unsigned int alloc_hnode_id(void) { unsigned int node; - spin_lock(&hnode_lock); node = first_unset_node(hnode_map); node_set(node, hnode_map); - spin_unlock(&hnode_lock); return node; } void free_hnode_id(unsigned int nid) { + spin_lock(&hnode_lock); node_clear(nid, hnode_map); + spin_unlock(&hnode_lock); } void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) @@ -634,7 +628,9 @@ static void prefetch_work_cb(struct work_struct *work) do { /* MADV_WILLNEED: dev will soon access this addr. 
*/ - ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); + mmap_read_lock(d->mm); + ret = gm_dev_fault_locked(d->mm, addr, d->dev, MADV_WILLNEED); + mmap_read_unlock(d->mm); if (ret == GM_RET_PAGE_EXIST) { gmem_err("%s: device has done page fault, ignore prefetch\n", __func__); @@ -670,7 +666,7 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s size = end - start; if (!end && old_start) { - gmem_err("end addr align up 2M causes invalid addr %p\n", (void *)end); + gmem_err("end addr align up 2M causes invalid addr\n"); return -EINVAL; } @@ -681,8 +677,7 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s vma = find_vma(current->mm, start); if (!vma || start < vma->vm_start || end > vma->vm_end) { mmap_read_unlock(current->mm); - gmem_err("failed to find vma by invalid start %p or size 0x%zx.\n", - (void *)start, size); + gmem_err("failed to find vma by invalid start or size.\n"); return GM_RET_FAILURE_UNKNOWN; } else if (!vma_is_peer_shared(vma)) { mmap_read_unlock(current->mm); @@ -795,7 +790,7 @@ static int hmadvise_do_eagerfree(unsigned long addr, size_t size) /* Check to see whether len was rounded up from small -ve to zero */ if (old_start && !start) { - gmem_err("start addr align up 2M causes invalid addr %p", (void *)start); + gmem_err("start addr align up 2M causes invalid addr"); return -EINVAL; } @@ -808,8 +803,7 @@ static int hmadvise_do_eagerfree(unsigned long addr, size_t size) } if (!vma_is_peer_shared(vma)) { - pr_debug("gmem:not peer-shared vma %p-%p, skip dontneed\n", - (void *)vma->vm_start, (void *)vma->vm_end); + pr_debug("gmem:not peer-shared vma, skip dontneed\n"); start = vma->vm_end; continue; } @@ -835,7 +829,7 @@ static bool check_hmadvise_behavior(int behavior) int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) { int error = -EINVAL; - struct hnode *node; + struct gm_dev *dev = NULL; if (hnid == -1) { if (check_hmadvise_behavior(behavior)) { @@ -857,8 +851,8 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) return error; } - node = get_hnode(hnid); - if (!node) { + dev = get_gm_dev(hnid); + if (!dev) { gmem_err("hmadvise: hnode id %d is invalid\n", hnid); return error; } @@ -866,7 +860,7 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) no_hnid: switch (behavior) { case MADV_PREFETCH: - return hmadvise_do_prefetch(node->dev, start, len_in); + return hmadvise_do_prefetch(dev, start, len_in); case MADV_DONTNEED: return hmadvise_do_eagerfree(start, len_in); default: @@ -877,15 +871,6 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) } EXPORT_SYMBOL_GPL(hmadvise_inner); -struct hmemcpy_data { - struct mm_struct *mm; - int hnid; - unsigned long src; - unsigned long dest; - size_t size; - struct work_struct work; -}; - static bool hnid_match_dest(int hnid, struct gm_mapping *dest) { return (hnid < 0) ? 
gm_mapping_cpu(dest) : gm_mapping_device(dest); @@ -897,84 +882,82 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, enum gm_ret ret; int page_size = HPAGE_SIZE; struct vm_area_struct *vma_dest, *vma_src; - struct gm_mapping *gm_mmaping_dest, *gm_mmaping_src; + struct gm_mapping *gm_mapping_dest, *gm_mapping_src; struct gm_dev *dev = NULL; - struct hnode *node; struct gm_memcpy_t gmc = {0}; if (size == 0) return; + mmap_read_lock(mm); vma_dest = find_vma(mm, dest); vma_src = find_vma(mm, src); - gm_mmaping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); - gm_mmaping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + if (!vma_src || vma_src->vm_start > src || !vma_dest || vma_dest->vm_start > dest) { + gmem_err("hmemcpy: the vma find by src/dest is NULL!\n"); + goto unlock_mm; + } - if (!gm_mmaping_src) { - gmem_err("%s: gm_mmaping_src is NULL, src=%p; size=0x%zx\n", - __func__, (void *)src, size); - return; + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + + if (!gm_mapping_src) { + gmem_err("hmemcpy: gm_mapping_src is NULL\n"); + goto unlock_mm; } if (hnid != -1) { - node = get_hnode(hnid); - if (node) - dev = node->dev; + dev = get_gm_dev(hnid); if (!dev) { - gmem_err("%s: hnode's dev is NULL\n", __func__); - return; + gmem_err("hmemcpy: hnode's dev is NULL\n"); + goto unlock_mm; } } // Trigger dest page fault on host or device - if (!gm_mmaping_dest || gm_mapping_nomap(gm_mmaping_dest) - || !hnid_match_dest(hnid, gm_mmaping_dest)) { + if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest) + || !hnid_match_dest(hnid, gm_mapping_dest)) { if (hnid == -1) { - mmap_read_lock(mm); - handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | - FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); - mmap_read_unlock(mm); + ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | + FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); + if (ret) { + gmem_err("%s: failed to execute host page fault, ret:%d\n", + __func__, ret); + goto unlock_mm; + } } else { - ret = gm_dev_fault(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); + ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); if (ret != GM_RET_SUCCESS) { - gmem_err("%s: gm_dev_fault failed\n", __func__); - return; + gmem_err("%s: failed to excecute dev page fault.\n", __func__); + goto unlock_mm; } } } - if (!gm_mmaping_dest) - gm_mmaping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); + if (!gm_mapping_dest) + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); - if (gm_mmaping_dest && gm_mmaping_dest != gm_mmaping_src) - mutex_lock(&gm_mmaping_dest->lock); - mutex_lock(&gm_mmaping_src->lock); + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_lock(&gm_mapping_dest->lock); + mutex_lock(&gm_mapping_src->lock); // Use memcpy when there is no device address, otherwise use peer_memcpy if (hnid == -1) { - if (gm_mapping_cpu(gm_mmaping_src)) { // host to host - memcpy(page_to_virt(gm_mmaping_dest->page) + (dest & (page_size - 1)), - page_to_virt(gm_mmaping_src->page) + (src & (page_size - 1)), - size); - goto unlock; + if (gm_mapping_cpu(gm_mapping_src)) { // host to host + gmem_err("hmemcpy: host to host is unimplemented\n"); + goto unlock_gm_mmaping; } else { // device to host - dev = gm_mmaping_src->dev; + dev = gm_mapping_src->dev; gmc.dma_addr = 
phys_to_dma(dev->dma_dev, - page_to_phys(gm_mmaping_dest->page) + (dest & (page_size - 1))); + page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1))); gmc.src = src; } } else { - if (gm_mapping_cpu(gm_mmaping_src)) { // host to device + if (gm_mapping_cpu(gm_mapping_src)) { // host to device gmc.dest = dest; gmc.dma_addr = phys_to_dma(dev->dma_dev, - page_to_phys(gm_mmaping_src->page) + (src & (page_size - 1))); + page_to_phys(gm_mapping_src->page) + (src & (page_size - 1))); } else { // device to device - if (dev == gm_mmaping_src->dev) { // same device - gmc.dest = dest; - gmc.src = src; - } else { // TODO: different devices - gmem_err("%s: device to device is unimplemented\n", __func__); - goto unlock; - } + gmem_err("hmemcpy: device to device is unimplemented\n"); + goto unlock_gm_mmaping; } } gmc.mm = mm; @@ -982,137 +965,100 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, gmc.size = size; dev->mmu->peer_hmemcpy(&gmc); -unlock: - mutex_unlock(&gm_mmaping_src->lock); - if (gm_mmaping_dest && gm_mmaping_dest != gm_mmaping_src) - mutex_unlock(&gm_mmaping_dest->lock); +unlock_gm_mmaping: + mutex_unlock(&gm_mapping_src->lock); + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_unlock(&gm_mapping_dest->lock); +unlock_mm: + mmap_read_unlock(mm); } /* * Each page needs to be copied in three parts when the address is not aligned. - * | <--a-->| | + * | ml <--0-->|<1><--2-> | * | -------|--------- | * | / /| / / | * | / / | / / | * | / / |/ / | * | ----------|------ | - * | <----b--->| | + * | | | * |<----page x---->|<----page y---->| */ -static void hmemcpy_work_cb(struct work_struct *work) +static void __hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) { - size_t i; - int remain, a, b, page_size = HPAGE_SIZE; - struct hmemcpy_data *d = container_of(work, struct hmemcpy_data, work); - unsigned long src = d->src, dest = d->dest; - - a = min(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1))); - b = max(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1))); - - for (i = page_size; i < d->size; i += page_size) { - if (a != 0) - do_hmemcpy(d->mm, d->hnid, dest, src, a); - if (b - a != 0) - do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a); - if (page_size - b != 0) - do_hmemcpy(d->mm, d->hnid, dest + b, src + b, page_size - b); - src += page_size; - dest += page_size; - } + int i = 0; + // offsets within the huge page for the source and destination addresses + int src_offset = src & (HPAGE_SIZE - 1); + int dst_offset = dest & (HPAGE_SIZE - 1); + // Divide each page into three parts according to the align + int ml[3] = { + HPAGE_SIZE - (src_offset < dst_offset ? dst_offset : src_offset), + src_offset < dst_offset ? (dst_offset - src_offset) : (src_offset - dst_offset), + src_offset < dst_offset ? 
src_offset : dst_offset + }; + struct mm_struct *mm = current->mm; - remain = d->size + page_size - i; - if (remain == 0) - goto out; + if (size == 0) + return; - if (remain < a) { - do_hmemcpy(d->mm, d->hnid, dest, src, remain); - } else if (remain < b) { - do_hmemcpy(d->mm, d->hnid, dest, src, a); - do_hmemcpy(d->mm, d->hnid, dest + a, src + a, remain - a); - } else { - do_hmemcpy(d->mm, d->hnid, dest, src, a); - do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a); - do_hmemcpy(d->mm, d->hnid, dest + b, src + b, remain - b); + while (size >= ml[i]) { + if (ml[i] > 0) { + do_hmemcpy(mm, hnid, dest, src, ml[i]); + src += ml[i]; + dest += ml[i]; + size -= ml[i]; + } + i = (i + 1) % 3; } -out: - kfree(d); + if (size > 0) + do_hmemcpy(mm, hnid, dest, src, size); } int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) { - int page_size = HPAGE_SIZE; - unsigned long per_size, copied = 0; - struct hmemcpy_data *data; struct vm_area_struct *vma_dest, *vma_src; + struct mm_struct *mm = current->mm; if (hnid < 0) { if (hnid != -1) { - gmem_err("hmadvise: invalid hnid %d < 0\n", hnid); + gmem_err("hmemcpy: invalid hnid %d < 0\n", hnid); return -EINVAL; } } else if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { - gmem_err( - "hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", - hnid); + gmem_err("hmemcpy: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return -EINVAL; } - vma_dest = find_vma(current->mm, dest); - vma_src = find_vma(current->mm, src); - - if (!vma_src || vma_src->vm_start > src || !vma_is_peer_shared(vma_src) - || vma_src->vm_end < (src + size)) { - gmem_err("failed to find peer_shared vma by invalid src:%p or size :0x%zx", - (void *)src, size); - return -EINVAL; - } + mmap_read_lock(mm); + vma_dest = find_vma(mm, dest); + vma_src = find_vma(mm, src); - if (!vma_dest || vma_dest->vm_start > dest || !vma_is_peer_shared(vma_dest) - || vma_dest->vm_end < (dest + size)) { - gmem_err("failed to find peer_shared vma by invalid dest:%p or size :0x%zx", - (void *)dest, size); - return -EINVAL; + if ((ULONG_MAX - size < src) || !vma_src || vma_src->vm_start > src || + !vma_is_peer_shared(vma_src) || vma_src->vm_end < (src + size)) { + gmem_err("failed to find peer_shared vma by invalid src or size\n"); + goto unlock; } - if (!(vma_dest->vm_flags & VM_WRITE)) { - gmem_err("dest is not writable.\n"); - return -EINVAL; + if ((ULONG_MAX - size < dest) || !vma_dest || vma_dest->vm_start > dest || + !vma_is_peer_shared(vma_dest) || vma_dest->vm_end < (dest + size)) { + gmem_err("failed to find peer_shared vma by invalid dest or size\n"); + goto unlock; } if (!(vma_dest->vm_flags & VM_WRITE)) { gmem_err("dest is not writable.\n"); - return -EINVAL; + goto unlock; } + mmap_read_unlock(mm); - per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); - - while (copied < size) { - data = kzalloc(sizeof(struct hmemcpy_data), GFP_KERNEL); - if (data == NULL) { - flush_workqueue(hmemcpy_wq); - return GM_RET_NOMEM; - } - INIT_WORK(&data->work, hmemcpy_work_cb); - data->mm = current->mm; - data->hnid = hnid; - data->src = src; - data->dest = dest; - if (per_size == 0) { - data->size = size; - } else { - // Process (1.x * per_size) for the last time - data->size = (size - copied < 2 * per_size) ? 
(size - copied) : per_size; - } - - queue_work(hmemcpy_wq, &data->work); - src += data->size; - dest += data->size; - copied += data->size; - } + __hmemcpy(hnid, dest, src, size); - flush_workqueue(hmemcpy_wq); return 0; + +unlock: + mmap_read_unlock(mm); + return -EINVAL; } EXPORT_SYMBOL_GPL(hmemcpy); diff --git a/mm/memory.c b/mm/memory.c index 568f3e295fdb..0e4097f99f19 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1722,6 +1722,9 @@ static inline void zap_logic_pmd_range(struct vm_area_struct *vma, struct gm_mapping *gm_mapping = NULL; struct page *page = NULL; + if (!vma->vm_obj) + return; + xa_lock(vma->vm_obj->logical_page_table); gm_mapping = vm_object_lookup(vma->vm_obj, addr); @@ -5924,9 +5927,23 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; + if (vma_is_peer_shared(vma)) + return VM_FAULT_OOM; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); +#ifdef CONFIG_GMEM +#define THP_ENABLE_PATH "/sys/kernel/mm/transparent_hugepage/enabled" + + if (vma_is_peer_shared(vma) && pmd_none(*vmf.pmd) && + (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))) { + /* if transparent hugepage is not enabled, return pagefault failed */ + gmem_err("transparent hugepage is not enabled. check %s\n", + THP_ENABLE_PATH); + return VM_FAULT_SIGBUS; + } +#endif + if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index db78ce14658a..879cf8b45f2a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1903,9 +1903,13 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { - +#ifdef CONFIG_GMEM if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) return false; +#else + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return false; +#endif /* * DAX device mappings require predictable access latency, so avoid diff --git a/mm/mmap.c b/mm/mmap.c index 242110526d16..4543ed83ff57 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -765,39 +765,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_GMEM -struct gmem_vma_list { - struct vm_area_struct *vma; - struct list_head list; -}; - -void gmem_reserve_vma(struct vm_area_struct *value, struct list_head *head) -{ - struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); - - if (!node) - return; - - node->vma = value; - list_add_tail(&node->list, head); -} - -void gmem_release_vma(struct mm_struct *mm, struct list_head *head) -{ - struct gmem_vma_list *node, *next; - - list_for_each_entry_safe(node, next, head, list) { - struct vm_area_struct *vma = node->vma; - - if (vma != NULL) - vm_area_free(vma); - - list_del(&node->list); - kfree(node); - } -} -#endif - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates @@ -1373,7 +1340,7 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon if (IS_ERR_VALUE(addr)) return addr; - if (flags & MAP_FIXED_NOREPLACE) { + if ((flags & MAP_FIXED_NOREPLACE) || (gmem_is_enabled() && (flags & MAP_PEER_SHARED))) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } @@ -1493,8 +1460,12 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon vm_flags |= VM_NORESERVE; } #ifdef CONFIG_GMEM - if (gmem_is_enabled() && (flags & 
MAP_PEER_SHARED)) - vm_flags |= VM_PEER_SHARED; + if (flags & MAP_PEER_SHARED) { + if (gmem_is_enabled()) + vm_flags |= VM_PEER_SHARED; + else + return -EINVAL; + } #endif addr = __mmap_region_ext(mm, file, addr, len, vm_flags, pgoff, uf); @@ -2705,8 +2676,7 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar ret = ctx->dev->mmu->peer_va_free(&gmf); if (ret != GM_RET_SUCCESS) - pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", - start, end - start, ret); + pr_debug("gmem: free_vma failed, ret %d\n", ret); } } @@ -2720,6 +2690,38 @@ static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, un munmap_single_vma_in_peer_devices(mm, vma, start, end); } } + +static unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma, *vma_end; + + vma = find_vma_intersection(mm, start, start + len); + vma_end = find_vma(mm, start + len); + if (!vma || !vma_is_peer_shared(vma)) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + } + + /* Prevents partial release of the peer_share page. */ + if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) + len = round_up(len, SZ_2M); + return len; +} + +static void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end, ret; + + ret = gmem_unmap_align(mm, start, len); + + if (!ret || IS_ERR_VALUE(ret)) + return; + + end = start + ret; + munmap_in_peer_devices(mm, start, end); +} #endif /* @@ -2856,10 +2858,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, prev = vma_iter_prev_range(vmi); next = vma_next(vmi); -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - munmap_in_peer_devices(mm, start, end); -#endif + if (next) vma_iter_prev_range(vmi); @@ -2919,19 +2918,13 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *vma; #ifdef CONFIG_GMEM - struct vm_area_struct *vma_end; if (gmem_is_enabled()) { - vma = find_vma_intersection(mm, start, start + len); - vma_end = find_vma(mm, start + len); - if (!vma) - return 0; - if (vma_is_peer_shared(vma)) { - if (!IS_ALIGNED(start, PMD_SIZE)) - return -EINVAL; - } - /* Prevents partial release of the peer_share page. */ - if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) - len = round_up(len, SZ_2M); + unsigned long ret = gmem_unmap_align(mm, start, len); + + if (IS_ERR_VALUE(ret)) + return ret; + else if (ret) + len = ret; } #endif @@ -2969,60 +2962,12 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, { VMA_ITERATOR(vmi, mm, start); - return do_vmi_munmap(&vmi, mm, start, len, uf, false); -} - #ifdef CONFIG_GMEM -static int alloc_va_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, vm_flags_t vm_flags) -{ - struct gm_context *ctx, *tmp; - enum gm_ret ret; - - if (!mm->gm_as) { - ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, PAGE_SIZE, - &mm->gm_as); - if (ret) - return ret; - } - pr_debug("gmem: start mmap, as %p\n", (void *)mm->gm_as); - - if (!vma->vm_obj) - vma->vm_obj = vm_object_create(vma); - if (!vma->vm_obj) - return -ENOMEM; - /* - * TODO: consider the concurrency problem of device - * attaching/detaching from the gm_as. 
- */ - ret = -ENODEV; - list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { - struct gm_fault_t gmf = { - .mm = mm, - .dev = ctx->dev, - .va = addr, - .size = len, - .prot = vm_flags, - }; - - if (!gm_dev_is_peer(ctx->dev)) - continue; - - if (!ctx->dev->mmu->peer_va_alloc_fixed) { - pr_debug("gmem: mmu ops has no alloc_vma\n"); - continue; - } - pr_debug("gmem: call vma_alloc\n"); - ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); - if (ret != GM_RET_SUCCESS) { - pr_debug("gmem: alloc_vma ret %d\n", ret); - return ret; - } - } - - return ret; -} + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); #endif + return do_vmi_munmap(&vmi, mm, start, len, uf, false); +} static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, @@ -3038,12 +2983,7 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); -#ifdef CONFIG_GMEM - unsigned int retry_times = 0; - LIST_HEAD(reserve_list); -retry: -#endif /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -3056,20 +2996,12 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) { -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return -ENOMEM; } } /* Unmap any existing mapping in the area */ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return -ENOMEM; } @@ -3079,10 +3011,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory_mm(mm, charged)) { -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return -ENOMEM; } vm_flags |= VM_ACCOUNT; @@ -3148,24 +3076,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) { - enum gm_ret ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); - - if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { - retry_times++; - addr = get_unmapped_area(file, addr, len, pgoff, 0); - gmem_reserve_vma(vma, &reserve_list); - goto retry; - } else if (ret != GM_RET_SUCCESS) { - pr_debug("gmem: alloc_vma ret %d\n", ret); - error = -ENOMEM; - goto free_vma; - } - gmem_release_vma(mm, &reserve_list); - } -#endif - if (vma_iter_prealloc(&vmi, vma)) { error = -ENOMEM; goto free_vma; @@ -3288,10 +3198,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unacct_error: if (charged) vm_unacct_memory(charged); -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return error; } @@ -3347,6 +3253,11 @@ static int __vm_munmap(unsigned long start, size_t len, bool unlock) if (sp_check_addr(start)) return -EINVAL; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); +#endif + if (mmap_write_lock_killable(mm)) return -EINTR; @@ -3728,6 +3639,10 @@ void exit_mmap(struct mm_struct *mm) __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && mm->gm_as) + gm_as_destroy(mm->gm_as); +#endif } /* Insert 
vm structure into process list sorted by address diff --git a/mm/mprotect.c b/mm/mprotect.c index e65363eb603e..4eac8ad8a718 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -693,7 +693,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; +#ifdef CONFIG_GMEM + struct vm_area_struct *vma, *prev, *vma_end; +#else struct vm_area_struct *vma, *prev; +#endif int error; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && @@ -736,7 +740,19 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; - +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + start = ALIGN_DOWN(start, HPAGE_SIZE); + vma_end = find_vma(current->mm, end); + if (vma_end && vma_end->vm_start < end && vma_is_peer_shared(vma_end)) + end = ALIGN(end, HPAGE_SIZE); + if (end <= start) { + error = -ENOMEM; + goto out; + } + len = end - start; + } +#endif if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff --git a/mm/util.c b/mm/util.c index 7a5eed15c98f..65392c97b1e9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,6 +27,9 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif #include "internal.h" #include "swap.h" @@ -540,6 +543,114 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) } EXPORT_SYMBOL_GPL(account_locked_vm); +#ifdef CONFIG_GMEM +static unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, + unsigned long flag) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct gm_context *ctx, *tmp; + unsigned long prot = VM_NONE; + enum gm_ret ret; + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; + + vma = find_vma(mm, addr); + if (!vma) { + gmem_err("vma for addr %lx is NULL, should not happen\n", addr); + return -EINVAL; + } + + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) { + gmem_err("transparent hugepage is not enabled. check %s\n", + thp_enable_path); + return -EINVAL; + } + + prot |= vma->vm_flags; + + if (!mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, HPAGE_SIZE, &mm->gm_as); + if (ret) { + gmem_err("gm_as_create failed\n"); + return ret; + } + } + + ret = -ENODEV; + // TODO: consider the concurrency problem of device attaching/detaching from the gm_as. 
+ list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + struct gm_fault_t gmf = { + .mm = mm, + .dev = ctx->dev, + .va = addr, + .size = len, + .prot = prot, + }; + + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (ret != GM_RET_SUCCESS) { + gmem_err("device mmap failed\n"); + return ret; + } + } + + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + + return ret; +} + +struct gmem_vma_list { + unsigned long start; + size_t len; + struct list_head list; +}; + +static void gmem_reserve_vma(struct mm_struct *mm, unsigned long start, + size_t len, struct list_head *head) +{ + struct vm_area_struct *vma; + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + vma = find_vma(mm, start); + if (!vma || vma->vm_start >= start + len) { + kfree(node); + return; + } + vm_flags_set(vma, ~VM_PEER_SHARED); + + node->start = start; + node->len = round_up(len, SZ_2M); + list_add_tail(&node->list, head); +} + +static void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + unsigned long start = node->start; + size_t len = node->len; + + if (len) + vm_munmap(start, len); + + list_del(&node->list); + kfree(node); + } +} +#endif + unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) @@ -548,7 +659,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); - +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); +retry: +#endif ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) @@ -559,6 +674,27 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && !IS_ERR_VALUE(ret) && flag & MAP_PEER_SHARED) { + enum gm_ret gm_ret = 0; + + gm_ret = alloc_va_in_peer_devices(ret, len, flag); + /* + * if alloc_va_in_peer_devices failed + * add vma to reserve_list and release after find a proper vma + */ + if (gm_ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + gmem_reserve_vma(mm, ret, len, &reserve_list); + goto retry; + } else if (gm_ret != GM_RET_SUCCESS) { + gmem_err("alloc vma ret %lu\n", ret); + gmem_reserve_vma(mm, ret, len, &reserve_list); + ret = -ENOMEM; + } + gmem_release_vma(mm, &reserve_list); + } +#endif } return ret; } diff --git a/mm/vm_object.c b/mm/vm_object.c index 25af359def56..3c8932c47270 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -236,7 +236,7 @@ void free_gm_mappings(struct vm_area_struct *vma) XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); xa_lock(vma->vm_obj->logical_page_table); - xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end)) { + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end - SZ_2M)) { release_gm_mapping(gm_mapping); xas_store(&xas, NULL); } -- Gitee From c715ff8a40b2afbb96291ce13cd81f3820c85cdd Mon Sep 17 00:00:00 2001 From: nicunshu Date: Mon, 8 Sep 2025 11:38:17 +0800 Subject: [PATCH 17/34] mm:fix hnode and vma bug euleros inclusion 
category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- remove is_hnode_allow which is not used. split or merge vm_object when split or merge vma. Signed-off-by: nicunshu --- arch/arm64/include/asm/rsi_cmds.h | 1 + include/linux/gmem.h | 6 -- include/linux/vm_object.h | 6 +- mm/gmem.c | 4 +- mm/mmap.c | 39 ++++---- mm/vm_object.c | 145 +++++++++++++++++++++++++++++- 6 files changed, 174 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index e6a211001bd3..ccdeffcefbff 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -9,6 +9,7 @@ #include #include +#include "string.h" #define RSI_GRANULE_SHIFT 12 #define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 7beebc67c398..a2becb381cc9 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -326,12 +326,6 @@ static inline bool is_hnode(int node) node_isset(node, hnode_map); } -static inline bool is_hnode_allowed(int node) -{ - return (node < MAX_NUMNODES) && is_hnode(node) && - node_isset(node, current->mems_allowed); -} - static inline int get_hnuma_id(struct gm_dev *gm_dev) { return first_node(gm_dev->registered_hnodes); diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index ca82642eb2df..d37cd0353f85 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -13,6 +13,8 @@ void vm_object_drop_locked(struct vm_area_struct *vma); void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared); void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void vm_object_merge(struct vm_area_struct *vma, unsigned long addr); +void vm_object_split(struct vm_area_struct *old_vma, struct vm_area_struct *new_vma); void dup_peer_shared_vma(struct vm_area_struct *vma); struct gm_mapping *alloc_gm_mapping(void); @@ -28,7 +30,9 @@ static inline void dup_vm_object(struct vm_area_struct *dst, static inline void dup_peer_shared_vma(struct vm_area_struct *vma) {} static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) {} - +static inline void vm_object_merge(struct vm_area_struct *vma, unsigned long addr) {} +static inline void vm_object_split(struct vm_area_struct *old_vma, + struct vm_area_struct *new_vma) {} static inline struct gm_mapping *alloc_gm_mapping(void) { return NULL; } static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) { return NULL; } diff --git a/mm/gmem.c b/mm/gmem.c index 039f4cfe28db..1397a56e42bb 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -846,7 +846,7 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) return error; } - if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + if (!is_hnode(hnid)) { gmem_err("hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return error; } @@ -1026,7 +1026,7 @@ int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) gmem_err("hmemcpy: invalid hnid %d < 0\n", hnid); return -EINVAL; } - } else if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + } else if (!is_hnode(hnid)) { gmem_err("hmemcpy: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return -EINVAL; } diff --git a/mm/mmap.c b/mm/mmap.c index 4543ed83ff57..9da8268345b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -706,6 +706,13 @@ int vma_expand(struct 
vma_iterator *vmi, struct vm_area_struct *vma, /* Only handles expanding */ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + if (vma_is_peer_shared(vma)) { + if (!remove_next) + vm_object_adjust(vma, start, end); + else + vm_object_merge(vma, next->vm_end); + } + /* Note: vma iterator must be pointing to 'start' */ vma_iter_config(vmi, start, end); if (vma_iter_prealloc(vmi, vma)) @@ -757,6 +764,9 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); + if (vma_is_peer_shared(vma)) + vm_object_adjust(vma, start, end); + vma_iter_clear(vmi); vma->vm_start = start; vma->vm_end = end; @@ -1007,6 +1017,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (!next->anon_vma) err = dup_anon_vma(prev, curr, &anon_dup); } + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, next->vm_end); + } } else if (merge_prev) { /* case 2 */ if (curr) { vma_start_write(curr); @@ -1025,6 +1038,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } if (!err) err = dup_anon_vma(prev, curr, &anon_dup); + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, end); + } } } else { /* merge_next */ vma_start_write(next); @@ -1035,6 +1051,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, adjust = next; adj_start = -(prev->vm_end - addr); err = dup_anon_vma(next, prev, &anon_dup); + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, addr); + } } else { /* * Note that cases 3 and 8 are the ONLY ones where prev @@ -1050,6 +1069,8 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, remove = curr; err = dup_anon_vma(next, curr, &anon_dup); } + if (vma_is_peer_shared(curr)) + vm_object_merge(vma, next->vm_end); } } @@ -1087,11 +1108,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(adjust)) - vm_object_adjust(adjust, adjust->vm_start + adj_start, - adjust->vm_end); -#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -2559,17 +2575,8 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) { - if (new_below) { - vm_object_adjust(new, new->vm_start, addr); - vm_object_adjust(vma, addr, vma->vm_end); - } else { - vm_object_adjust(vma, vma->vm_start, addr); - vm_object_adjust(new, addr, new->vm_end); - } - } -#endif + if (vma_is_peer_shared(vma)) + vm_object_split(vma, new); if (new_below) { vma->vm_start = addr; diff --git a/mm/vm_object.c b/mm/vm_object.c index 3c8932c47270..43dc927477bc 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -180,13 +180,152 @@ void dup_peer_shared_vma(struct vm_area_struct *vma) } } +/** + * new_vma is part of old_vma, so old_vma->vm_start <= new_vma->vm_start + * and new_vma->vm_end < old_vma->vm_end + */ +void vm_object_split(struct vm_area_struct *old_vma, struct vm_area_struct *new_vma) +{ + unsigned long index; + struct gm_mapping *page; + unsigned long transferred_pages = 0; + + XA_STATE(xas, old_vma->vm_obj->logical_page_table, + linear_page_index(old_vma, new_vma->vm_start)); + + xa_lock(old_vma->vm_obj->logical_page_table); + xa_lock(new_vma->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(old_vma, new_vma->vm_end - 
SZ_2M)) { + index = xas.xa_index - old_vma->vm_pgoff + new_vma->vm_pgoff - + ((new_vma->vm_start - old_vma->vm_start) >> PAGE_SHIFT); + __xa_store(new_vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + transferred_pages++; + } + + atomic_sub(transferred_pages, &old_vma->vm_obj->nr_pages); + atomic_add(transferred_pages, &new_vma->vm_obj->nr_pages); + xa_unlock(new_vma->vm_obj->logical_page_table); + xa_unlock(old_vma->vm_obj->logical_page_table); +} + +void vm_object_merge(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long index; + struct gm_mapping *page; + struct vm_area_struct *next, *n_next; + unsigned long moved_pages = 0; + + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); + next = vma_next(&vmi); + next = vma_next(&vmi); + if (!next) + return; + + if (addr < vma->vm_end) { + /* case 4: move logical mapping in [end, vma->vm_end) from vma to next */ + XA_STATE(xas, vma->vm_obj->logical_page_table, + linear_page_index(vma, addr)); + + xa_lock(vma->vm_obj->logical_page_table); + xa_lock(next->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(vma, vma->vm_end - SZ_2M)) { + index = xas.xa_index - vma->vm_pgoff + next->vm_pgoff - + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(next->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + atomic_sub(moved_pages, &vma->vm_obj->nr_pages); + atomic_add(moved_pages, &next->vm_obj->nr_pages); + xa_unlock(next->vm_obj->logical_page_table); + xa_unlock(vma->vm_obj->logical_page_table); + } else { + n_next = vma_next(&vmi); + + if (addr == next->vm_end) { + /* case 1, 7, 8: copy all logical mappings from next to vma */ + XA_STATE(xas, next->vm_obj->logical_page_table, + linear_page_index(next, next->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, page, linear_page_index(next, next->vm_end - SZ_2M)) { + index = xas.xa_index - next->vm_pgoff + vma->vm_pgoff + + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, + index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + xa_unlock(vma->vm_obj->logical_page_table); + } else if (next->vm_start < addr && addr < next->vm_end) { + /* case 5: move logical mapping in [next->vm_start, end) from next to vma */ + XA_STATE(xas, next->vm_obj->logical_page_table, + linear_page_index(next, next->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + xa_lock(next->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(next, addr - SZ_2M)) { + index = xas.xa_index - next->vm_pgoff + vma->vm_pgoff + + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, + index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + atomic_sub(moved_pages, &next->vm_obj->nr_pages); + xa_unlock(next->vm_obj->logical_page_table); + xa_unlock(vma->vm_obj->logical_page_table); + } else if (n_next && addr == n_next->vm_end) { + /* case 6: copy all logical mappings from next and n_next to vma */ + XA_STATE(xas_next, next->vm_obj->logical_page_table, + linear_page_index(next, next->vm_start)); + XA_STATE(xas_n_next, n_next->vm_obj->logical_page_table, + linear_page_index(n_next, n_next->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + rcu_read_lock(); + + xas_for_each(&xas_next, page, + 
linear_page_index(next, next->vm_end - SZ_2M)) {
+			index = xas_next.xa_index - next->vm_pgoff + vma->vm_pgoff +
+				((next->vm_start - vma->vm_start) >> PAGE_SHIFT);
+			__xa_store(vma->vm_obj->logical_page_table,
+					index, page, GFP_KERNEL);
+			xas_store(&xas_next, NULL);
+			moved_pages++;
+		}
+
+		xas_for_each(&xas_n_next, page,
+				linear_page_index(n_next, n_next->vm_end - SZ_2M)) {
+			index = xas_n_next.xa_index - n_next->vm_pgoff + vma->vm_pgoff +
+				((n_next->vm_start - vma->vm_start) >> PAGE_SHIFT);
+			__xa_store(vma->vm_obj->logical_page_table,
+					index, page, GFP_KERNEL);
+			xas_store(&xas_n_next, NULL);
+			moved_pages++;
+		}
+
+		rcu_read_unlock();
+		atomic_add(moved_pages, &vma->vm_obj->nr_pages);
+		xa_unlock(vma->vm_obj->logical_page_table);
+		}
+	}
+	/* case 2, 3: do nothing */
+}
+
 void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
 	/* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */
 	unsigned long removed_pages = 0;
 	struct gm_mapping *mapping;
-	XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start));
+	XA_STATE(xas, vma->vm_obj->logical_page_table,
+		 linear_page_index(vma, vma->vm_start));
 
 	xas_lock(&xas);
 	if (vma->vm_start < start) {
@@ -233,7 +372,9 @@ void vm_object_mapping_create(struct vm_object *obj, unsigned long start)
 void free_gm_mappings(struct vm_area_struct *vma)
 {
 	struct gm_mapping *gm_mapping;
-	XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start));
+
+	XA_STATE(xas, vma->vm_obj->logical_page_table,
+		 linear_page_index(vma, vma->vm_start));
 
 	xa_lock(vma->vm_obj->logical_page_table);
 	xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end - SZ_2M)) {
--
Gitee

From cf8fda4b0facd2af89cef850e52f0b0c6411ea61 Mon Sep 17 00:00:00 2001
From: Bin Wang
Date: Wed, 17 Sep 2025 09:31:43 +0800
Subject: [PATCH 18/34] gmem: support allocating overlimit pages in device.

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN

---------------------------------------------

gmem now supports swapping pages in the device out to the host. This
feature allows the device to release some of its oldest pages and so
allocate more memory than its configured limit.
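
For reference, a device driver is expected to hand physical memory to
gmem through the new gm_page API (alloc_gm_page_struct() and
gm_add_pages()). The sketch below shows one possible shape of the new
->import_phys_mem() callback; my_dev_alloc_huge() is a made-up
placeholder for the driver's own 2MB allocator, the mm argument is not
used here, and error unwinding is trimmed for brevity:

static enum gm_ret my_dev_import_phys_mem(struct mm_struct *mm, int hnid,
					  unsigned long page_cnt)
{
	LIST_HEAD(pages);
	unsigned long i;

	for (i = 0; i < page_cnt; i++) {
		/* Wrap one 2MB device granule in a gm_page descriptor. */
		struct gm_page *gm_page = alloc_gm_page_struct();

		if (!gm_page)
			break;

		/* Hypothetical device allocator filling PFN and DMA address. */
		if (my_dev_alloc_huge(&gm_page->dev_pfn, &gm_page->dev_dma_addr))
			break;

		gm_page->hnid = hnid;
		list_add_tail(&gm_page->gm_page_list, &pages);
	}

	if (list_empty(&pages))
		return GM_RET_NOMEM;

	/* gm_add_pages() moves the new pages onto the hnode freelist. */
	gm_add_pages(hnid, &pages);
	return GM_RET_SUCCESS;
}

gm_alloc_page() invokes this callback with NUM_IMPORT_PAGES when the
freelist is empty and the hnode is still below its max_memsize limit;
if the import fails, it falls back to swapping active pages to the
host.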
Signed-off-by: Bin Wang --- include/linux/gmem.h | 114 +++++++++-- mm/Makefile | 2 +- mm/gmem.c | 231 ++++++--------------- mm/gmem_phys.c | 466 +++++++++++++++++++++++++++++++++++++++++++ mm/gmem_stat.c | 164 +++++++++++++++ mm/mmap.c | 20 +- 6 files changed, 810 insertions(+), 187 deletions(-) create mode 100644 mm/gmem_phys.c create mode 100644 mm/gmem_stat.c diff --git a/include/linux/gmem.h b/include/linux/gmem.h index a2becb381cc9..15e341f38953 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -80,6 +80,7 @@ enum gm_mmu_mode { struct gm_fault_t { struct mm_struct *mm; struct gm_dev *dev; + unsigned long pfn; unsigned long va; unsigned long size; unsigned long prot; @@ -88,13 +89,23 @@ struct gm_fault_t { int behavior; }; +enum gm_memcpy_kind { + GM_MEMCPY_INIT, + GM_MEMCPY_H2H, + GM_MEMCPY_H2D, + GM_MEMCPY_D2H, + GM_MEMCPY_D2D, + GM_MEMCPY_KIND_INVALID, +}; + struct gm_memcpy_t { struct mm_struct *mm; struct gm_dev *dev; - unsigned long src; - unsigned long dest; - dma_addr_t dma_addr; + dma_addr_t src; + dma_addr_t dest; + size_t size; + enum gm_memcpy_kind kind; }; /** @@ -134,6 +145,8 @@ struct gm_mmu { */ enum gm_ret (*peer_unmap)(struct gm_fault_t *gmf); + enum gm_ret (*import_phys_mem)(struct mm_struct *mm, int hnid, unsigned long page_cnt); + /* Create or destroy a device's physical page table. */ enum gm_ret (*pmap_create)(struct gm_dev *dev, void **pmap); enum gm_ret (*pmap_destroy)(void *pmap); @@ -225,11 +238,11 @@ struct gm_mapping { unsigned int flag; union { - struct page *page; /* CPU node */ - struct gm_dev *dev; /* hetero-node */ - unsigned long pfn; + struct page *page; /* CPU node */ + struct gm_page *gm_page; /* hetero-node */ }; + struct gm_dev *dev; struct mutex lock; }; @@ -280,16 +293,12 @@ extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned lo struct gm_dev **new_dev); extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); -extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, - unsigned long end); +extern int gm_dev_register_hnode(struct gm_dev *dev); enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior); vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); /* GMEM address space KPI */ -extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, - unsigned long end); -extern void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid); extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as); extern enum gm_ret gm_as_destroy(struct gm_as *as); @@ -314,12 +323,42 @@ extern void gmem_stats_counter_show(void); /* h-NUMA topology */ struct hnode { unsigned int id; - struct gm_dev *dev; - struct xarray pages; + struct task_struct *swapd_task; + + struct list_head freelist; + struct list_head activelist; + spinlock_t freelist_lock; + spinlock_t activelist_lock; + atomic_t nr_free_pages; + atomic_t nr_active_pages; + + unsigned long max_memsize; + + bool import_failed; }; +static inline void hnode_active_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_active_pages); +} + +static inline void hnode_active_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_active_pages); +} + +static inline void hnode_free_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_free_pages); +} + +static inline 
void hnode_free_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_free_pages); +} + static inline bool is_hnode(int node) { return (node < MAX_NUMNODES) && !node_isset(node, node_possible_map) && @@ -334,9 +373,58 @@ static inline int get_hnuma_id(struct gm_dev *gm_dev) void __init hnuma_init(void); unsigned int alloc_hnode_id(void); void free_hnode_id(unsigned int nid); +struct hnode *get_hnode(unsigned int hnid); +struct gm_dev *get_gm_dev(unsigned int nid); void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); void hnode_deinit(unsigned int hnid, struct gm_dev *dev); +struct gm_page { + struct list_head gm_page_list; + + unsigned long flags; + unsigned long dev_pfn; + unsigned long dev_dma_addr; + unsigned int hnid; + + /* + * The same functionality as rmap, we need know which process + * maps to this gm_page with which virtual address. + * */ + unsigned long va; + struct mm_struct *mm; + + atomic_t refcount; +}; + +#define NUM_IMPORT_PAGES 16 + +int __init gm_page_cachep_init(void); +void gm_page_cachep_destroy(void); +struct gm_page *alloc_gm_page_struct(void); +void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); +void mark_gm_page_active(struct gm_page *gm_page); +int gm_add_pages(unsigned int hnid, struct list_head *pages); +void gm_free_page(struct gm_page *gm_page); +struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); + +static inline void get_gm_page(struct gm_page *gm_page) +{ + atomic_inc(&gm_page->refcount); +} + +static inline void put_gm_page(struct gm_page *gm_page) +{ + if (atomic_dec_and_test(&gm_page->refcount)) + gm_free_page(gm_page); +} + +int hnode_init_sysfs(unsigned int hnid); +int __init gm_init_sysfs(void); +void gm_deinit_sysfs(void); + #define gmem_err(fmt, ...) 
\ ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) diff --git a/mm/Makefile b/mm/Makefile index cedd58296019..70d7fb204b57 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o -mmu-$(CONFIG_GMEM) += gmem.o vm_object.o +mmu-$(CONFIG_GMEM) += gmem.o gmem_phys.o gmem_stat.o vm_object.o ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c index 1397a56e42bb..30a83c9daced 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -53,9 +53,6 @@ static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); static bool enable_gmem; -DEFINE_SPINLOCK(hnode_lock); -struct hnode *hnodes[MAX_NUMNODES]; - static inline unsigned long pe_mask(unsigned int order) { if (order == 0) @@ -143,10 +140,18 @@ static int __init gmem_init(void) if (!gm_region_cache) goto free_ctx; - err = vm_object_init(); + err = gm_page_cachep_init(); if (err) goto free_region; + err = gm_init_sysfs(); + if (err) + goto free_gm_page; + + err = vm_object_init(); + if (err) + goto free_gm_sysfs; + err = gmem_stats_init(); if (err) goto free_region; @@ -167,6 +172,10 @@ static int __init gmem_init(void) return 0; +free_gm_sysfs: + gm_deinit_sysfs(); +free_gm_page: + gm_page_cachep_destroy(); free_region: kmem_cache_destroy(gm_region_cache); free_ctx: @@ -228,10 +237,12 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc { enum gm_ret ret = GM_RET_SUCCESS; struct gm_mmu *mmu = dev->mmu; + struct hnode *hnode; struct device *dma_dev = dev->dma_dev; struct vm_area_struct *vma; struct vm_object *obj; struct gm_mapping *gm_mapping; + struct gm_page *gm_page; unsigned long size = HPAGE_SIZE; struct gm_fault_t gmf = { .mm = mm, @@ -243,16 +254,22 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc }; struct page *page = NULL; + hnode = get_hnode(get_hnuma_id(dev)); + if (!hnode) { + gmem_err("gmem device should correspond to a hnuma node"); + ret = -EINVAL; + goto out; + } + vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) { - gmem_err("%s failed to find vma\n", __func__); - pr_info("gmem: %s no vma\n", __func__); + gmem_err("%s failed to find vma", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto out; } obj = vma->vm_obj; if (!obj) { - gmem_err("%s no vm_obj\n", __func__); + gmem_err("%s no vm_obj", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto out; } @@ -266,7 +283,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc xa_unlock(obj->logical_page_table); if (unlikely(!gm_mapping)) { - gmem_err("OOM when creating vm_obj!\n"); + gmem_err("OOM when creating vm_obj!"); ret = GM_RET_NOMEM; goto out; } @@ -274,8 +291,9 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc if (gm_mapping_nomap(gm_mapping)) { goto peer_map; } else if (gm_mapping_device(gm_mapping)) { - if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { - goto peer_map; + if (behavior == MADV_WILLNEED) { + mark_gm_page_active(gm_mapping->gm_page); + goto unlock; } else { ret = 0; goto unlock; @@ -283,7 +301,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc } else if (gm_mapping_cpu(gm_mapping)) { page = gm_mapping->page; if (!page) { - gmem_err("host gm_mapping page is NULL. Set nomap\n"); + gmem_err("host gm_mapping page is NULL. 
Set nomap"); gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); goto unlock; } @@ -293,12 +311,21 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) - gmem_err("dma map failed\n"); + gmem_err("dma map failed"); gmf.copy = true; } peer_map: + gm_page = gm_alloc_page(mm, hnode); + if (!gm_page) { + gmem_err("Alloc gm_page for device fault failed."); + ret = -ENOMEM; + goto unlock; + } + + gmf.pfn = gm_page->dev_pfn; + ret = mmu->peer_map(&gmf); if (ret != GM_RET_SUCCESS) { if (ret == GM_RET_MIGRATING) { @@ -310,7 +337,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); ret = GM_RET_SUCCESS; } else { - gmem_err("peer map failed\n"); + gmem_err("peer map failed"); if (page) { gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); put_page(page); @@ -321,11 +348,16 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc if (page) { dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); - put_page(page); + folio_put(page_folio(page)); } gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); gm_mapping->dev = dev; + gm_page->va = addr; + gm_page->mm = mm; + gm_mapping->gm_page = gm_page; + hnode_activelist_add(hnode, gm_page); + hnode_active_pages_inc(hnode); unlock: mutex_unlock(&gm_mapping->lock); out: @@ -343,6 +375,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; struct gm_dev *dev; + struct hnode *hnode; struct device *dma_dev; struct gm_fault_t gmf = { .mm = vma->vm_mm, @@ -359,6 +392,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, dev = gm_mapping->dev; gmf.dev = dev; + gmf.pfn = gm_mapping->gm_page->dev_pfn; dma_dev = dev->dma_dev; gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); @@ -373,113 +407,13 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, } dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + hnode = get_hnode(gm_mapping->gm_page->hnid); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); return ret; } -static inline struct hnode *get_hnode(unsigned int hnid) -{ - return hnodes[hnid]; -} - -static struct gm_dev *get_gm_dev(unsigned int nid) -{ - struct hnode *hnode; - struct gm_dev *dev = NULL; - - spin_lock(&hnode_lock); - hnode = get_hnode(nid); - if (hnode) - dev = hnode->dev; - spin_unlock(&hnode_lock); - return dev; -} - -/* - * Register the local physical memory of a gmem device. - * This implies dynamically creating - * the struct page data structures. 
- */ -enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) -{ - struct gm_mapping *mapping; - unsigned long addr = PAGE_ALIGN(begin); - unsigned int nid; - int i, page_num = (end - addr) >> PAGE_SHIFT; - struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); - - if (!hnode) - goto err; - - mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); - if (!mapping) - goto free_hnode; - - spin_lock(&hnode_lock); - nid = alloc_hnode_id(); - if (nid == MAX_NUMNODES) - goto unlock_hnode; - hnode_init(hnode, nid, dev); - - for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { - mapping[i].pfn = addr >> PAGE_SHIFT; - mapping[i].flag = 0; - } - - xa_lock(&hnode->pages); - for (i = 0; i < page_num; i++) { - if (xa_err(__xa_store(&hnode->pages, i, mapping + i, - GFP_KERNEL))) { - /* Probably nomem */ - kvfree(mapping); - xa_unlock(&hnode->pages); - goto deinit_hnode; - } - __xa_set_mark(&hnode->pages, i, XA_MARK_0); - } - xa_unlock(&hnode->pages); - - spin_unlock(&hnode_lock); - return GM_RET_SUCCESS; - -deinit_hnode: - hnode_deinit(nid, dev); - free_hnode_id(nid); -unlock_hnode: - spin_unlock(&hnode_lock); -free_hnode: - kfree(hnode); -err: - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(gm_dev_register_physmem); - -void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) -{ - struct hnode *hnode = NULL; - struct gm_mapping *mapping = NULL; - - spin_lock(&hnode_lock); - - if (!node_isset(nid, dev->registered_hnodes)) - goto unlock; - - hnode = get_hnode(nid); - - if (!hnode) - goto unlock; - mapping = xa_load(&hnode->pages, 0); - - if (mapping) - kvfree(mapping); - - hnode_deinit(nid, dev); - free_hnode_id(nid); - kfree(hnode); -unlock: - spin_unlock(&hnode_lock); -} -EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); - /* GMEM Virtual Address Space API */ enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as) @@ -565,50 +499,6 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode } EXPORT_SYMBOL_GPL(gm_as_attach); -void __init hnuma_init(void) -{ - unsigned int node; - spin_lock(&hnode_lock); - for_each_node(node) - node_set(node, hnode_map); - spin_unlock(&hnode_lock); -} - -unsigned int alloc_hnode_id(void) -{ - unsigned int node; - - node = first_unset_node(hnode_map); - node_set(node, hnode_map); - - return node; -} - -void free_hnode_id(unsigned int nid) -{ - spin_lock(&hnode_lock); - node_clear(nid, hnode_map); - spin_unlock(&hnode_lock); -} - -void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) -{ - hnodes[hnid] = hnode; - hnodes[hnid]->id = hnid; - hnodes[hnid]->dev = dev; - node_set(hnid, dev->registered_hnodes); - xa_init(&hnodes[hnid]->pages); -} - -void hnode_deinit(unsigned int hnid, struct gm_dev *dev) -{ - hnodes[hnid]->id = 0; - hnodes[hnid]->dev = NULL; - node_clear(hnid, dev->registered_hnodes); - xa_destroy(&hnodes[hnid]->pages); - hnodes[hnid] = NULL; -} - struct prefetch_data { struct mm_struct *mm; struct gm_dev *dev; @@ -725,6 +615,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, }; struct gm_mapping *gm_mapping; struct vm_object *obj; + struct hnode *hnode; int ret; obj = vma->vm_obj; @@ -756,6 +647,10 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, mutex_unlock(&gm_mapping->lock); continue; } + hnode = get_hnode(gm_mapping->gm_page->hnid); + hnode_activelist_del(hnode, gm_mapping->gm_page); + 
hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); } gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); mutex_unlock(&gm_mapping->lock); @@ -946,15 +841,18 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, goto unlock_gm_mmaping; } else { // device to host dev = gm_mapping_src->dev; - gmc.dma_addr = phys_to_dma(dev->dma_dev, + gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1))); - gmc.src = src; + gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1)); + gmc.kind = GM_MEMCPY_D2H; } } else { if (gm_mapping_cpu(gm_mapping_src)) { // host to device - gmc.dest = dest; - gmc.dma_addr = phys_to_dma(dev->dma_dev, + gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_src->page) + (src & (page_size - 1))); + gmc.kind = GM_MEMCPY_H2D; } else { // device to device gmem_err("hmemcpy: device to device is unimplemented\n"); goto unlock_gm_mmaping; @@ -1062,3 +960,4 @@ int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) return -EINVAL; } EXPORT_SYMBOL_GPL(hmemcpy); + diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c new file mode 100644 index 000000000000..1ff4407d5aff --- /dev/null +++ b/mm/gmem_phys.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GMEM physical memory management. + * + * Copyright (C) 2025- Huawei, Inc. + * Author: Bin Wang + * + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define NUM_SWAP_PAGES 16 +#define MAX_SWAP_RETRY_TIMES 10 + +static struct kmem_cache *gm_page_cachep; + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + for_each_node(node) + node_set(node, hnode_map); + spin_unlock(&hnode_lock); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) +{ + hnode->id = hnid; + hnode->dev = dev; + INIT_LIST_HEAD(&hnode->freelist); + INIT_LIST_HEAD(&hnode->activelist); + spin_lock_init(&hnode->freelist_lock); + spin_lock_init(&hnode->activelist_lock); + atomic_set(&hnode->nr_free_pages, 0); + atomic_set(&hnode->nr_active_pages, 0); + hnode->import_failed = false; + hnode->max_memsize = 0; + + node_set(hnid, dev->registered_hnodes); + hnodes[hnid] = hnode; +} + +void hnode_deinit(unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + hnodes[hnid] = NULL; +} + +struct hnode *get_hnode(unsigned int hnid) +{ + if (!hnodes[hnid]) + gmem_err("h-NUMA node for hnode id %u is NULL.", hnid); + return hnodes[hnid]; +} + +struct gm_dev *get_gm_dev(unsigned int nid) +{ + struct hnode *hnode; + struct gm_dev *dev = NULL; + + spin_lock(&hnode_lock); + hnode = get_hnode(nid); + if (hnode) + dev = hnode->dev; + spin_unlock(&hnode_lock); + return dev; +} + +static void init_swapd(struct hnode *hnode); + +int gm_dev_register_hnode(struct gm_dev *dev) +{ + unsigned int hnid; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + int ret; + + if (!hnode) + return -ENOMEM; + + spin_lock(&hnode_lock); + hnid = alloc_hnode_id(); + spin_unlock(&hnode_lock); + + if (hnid == MAX_NUMNODES) + goto 
free_hnode; + + ret = hnode_init_sysfs(hnid); + if (ret) + goto free_hnode; + + hnode_init(hnode, hnid, dev); + init_swapd(hnode); + + return GM_RET_SUCCESS; + +free_hnode: + kfree(hnode); + return -EBUSY; +} +EXPORT_SYMBOL_GPL(gm_dev_register_hnode); + +int __init gm_page_cachep_init(void) +{ + gm_page_cachep = KMEM_CACHE(gm_page, 0); + if (!gm_page_cachep) + return -EINVAL; + return 0; +} + +void gm_page_cachep_destroy(void) +{ + kmem_cache_destroy(gm_page_cachep); +} + +struct gm_page *alloc_gm_page_struct(void) +{ + struct gm_page *gm_page = kmem_cache_zalloc(gm_page_cachep, GFP_KERNEL); + + if (!gm_page) + return NULL; + atomic_set(&gm_page->refcount, 0); + return gm_page; +} +EXPORT_SYMBOL(alloc_gm_page_struct); + +void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->freelist_lock); + list_add(&gm_page->gm_page_list, &hnode->freelist); + spin_unlock(&hnode->freelist_lock); +} + +void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_add_tail(&gm_page->gm_page_list, &hnode->activelist); + spin_unlock(&hnode->activelist_lock); +} + +void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_del(&gm_page->gm_page_list); + spin_unlock(&hnode->activelist_lock); +} + +void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_move_tail(&gm_page->gm_page_list, &hnode->activelist); + spin_unlock(&hnode->activelist_lock); +} + +void mark_gm_page_active(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + hnode_activelist_del_and_add(hnode, gm_page); +} + +int gm_add_pages(unsigned int hnid, struct list_head *pages) +{ + struct hnode *hnode; + struct gm_page *gm_page, *n; + + hnode = get_hnode(hnid); + if (!hnode) + return -EINVAL; + + list_for_each_entry_safe(gm_page, n, pages, gm_page_list) { + list_del(&gm_page->gm_page_list); + hnode_freelist_add(hnode, gm_page); + hnode_free_pages_inc(hnode); + } + + return 0; +} +EXPORT_SYMBOL(gm_add_pages); + +void gm_free_page(struct gm_page *gm_page) +{ + struct hnode *hnode; + + hnode = get_hnode(gm_page->hnid); + if (!hnode) + return; + hnode_freelist_add(hnode, gm_page); + hnode_free_pages_inc(hnode); +} + +static int gm_evict_page_locked(struct gm_page *gm_page) +{ + struct gm_dev *gm_dev; + struct gm_mapping *gm_mapping; + struct vm_area_struct *vma; + struct mm_struct *mm = gm_page->mm; + struct page *page; + struct device *dma_dev; + unsigned long va = gm_page->va; + struct folio *folio = NULL; + struct gm_fault_t gmf = { + .mm = mm, + .va = va, + .size = HPAGE_SIZE, + .copy = true + }; + int ret = 0; + + gm_dev = get_gm_dev(gm_page->hnid); + if (!gm_dev) + return -EINVAL; + + vma = find_vma(mm, va); + if (!vma || !vma->vm_obj) { + gmem_err("%s: cannot find vma or vma->vm_obj is null for va %lx", __func__, va); + return -EINVAL; + } + + gm_mapping = vm_object_lookup(vma->vm_obj, va); + if (!gm_mapping) { + gmem_err("%s: no gm_mapping for va %lx", __func__, va); + return -EINVAL; + } + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + gmem_err("%s: evicting gm_page conflicts with unmap.", __func__); + ret = 0; + goto gm_mapping_unlock; + } + + folio = vma_alloc_folio(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, va, true); + if (!folio) { + gmem_err("%s: allocate host page failed.", __func__); + ret = -ENOMEM; + goto gm_mapping_unlock; + } + 
page = &folio->page; + + gmf.dev = gm_dev; + gmf.pfn = gm_page->dev_pfn; + dma_dev = gm_dev->dma_dev; + gmf.dma_addr = dma_map_page(dma_dev, page, 0, HPAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + gmem_err("%s: dma map failed.", __func__); + ret = -EINVAL; + goto gm_mapping_unlock; + } + + ret = gm_dev->mmu->peer_unmap(&gmf); + if (ret) + gmem_err("%s: peer_unmap failed.", __func__); + + dma_unmap_page(dma_dev, gmf.dma_addr, HPAGE_SIZE, DMA_BIDIRECTIONAL); + gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping->page = page; + put_gm_page(gm_page); +gm_mapping_unlock: + mutex_unlock(&gm_mapping->lock); + return ret; +} + +static int gm_evict_page(struct gm_page *gm_page) +{ + struct mm_struct *mm = gm_page->mm; + int ret; + + mmap_read_lock(mm); + ret = gm_evict_page_locked(gm_page); + mmap_read_unlock(mm); + return ret; +} + +static void gm_do_swap(struct hnode *hnode) +{ + struct list_head swap_list; + struct gm_page *gm_page, *n; + unsigned int nr_swap_pages = 0; + int ret; + + INIT_LIST_HEAD(&swap_list); + + spin_lock(&hnode->activelist_lock); + list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) { + /* Move gm_page to temporary list. */ + get_gm_page(gm_page); + list_move(&gm_page->gm_page_list, &swap_list); + nr_swap_pages++; + if (nr_swap_pages >= NUM_SWAP_PAGES) + break; + } + spin_unlock(&hnode->activelist_lock); + + list_for_each_entry_safe(gm_page, n, &swap_list, gm_page_list) { + list_del(&gm_page->gm_page_list); + ret = gm_evict_page_locked(gm_page); + if (ret) { + gmem_err("%s: evict gm_page %lx failed, va %lx", __func__, + (unsigned long)gm_page, gm_page->va); + if (ret == -ENOMEM) { + /* + * Failed to allocate host page, so return gm_page + * to activelist. + */ + hnode_activelist_add(hnode, gm_page); + } else { + /* + * Conflicts with process exit, so return gm_page + * to freelist to avoid memory leak. + */ + atomic_set(&gm_page->refcount, 0); + hnode_freelist_add(hnode, gm_page); + hnode_active_pages_dec(hnode); + hnode_free_pages_inc(hnode); + } + put_gm_page(gm_page); + continue; + } + + hnode_active_pages_dec(hnode); + put_gm_page(gm_page); + } +}; + +static inline bool need_wake_up_swapd(struct hnode *hnode) +{ + return false; +} + +static int swapd_func(void *data) +{ + struct hnode *hnode = (struct hnode *)data; + + while (!kthread_should_stop()) { + if (!need_wake_up_swapd(hnode)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + + gm_do_swap(hnode); + } + + return 0; +}; + +static void init_swapd(struct hnode *hnode) +{ + hnode->swapd_task = kthread_run(swapd_func, NULL, "gm_swapd/%u", hnode->id); + if (IS_ERR(hnode->swapd_task)) { + gmem_err("%s: create swapd task failed", __func__); + hnode->swapd_task = NULL; + } +} + +static void wake_up_swapd(struct hnode *hnode) +{ + if (likely(hnode->swapd_task)) + wake_up_process(hnode->swapd_task); +} + +static bool can_import(struct hnode *hnode) +{ + unsigned long nr_pages; + unsigned long used_mem; + + nr_pages = atomic_read(&hnode->nr_free_pages) + atomic_read(&hnode->nr_active_pages); + used_mem = nr_pages * HPAGE_SIZE; + + /* GMEM usable memory is unlimited if max_memsize is zero. */ + if (!hnode->max_memsize) + return true; + return used_mem < hnode->max_memsize; +} + +static struct gm_page *get_gm_page_from_freelist(struct hnode *hnode) +{ + struct gm_page *gm_page; + + spin_lock(&hnode->freelist_lock); + gm_page = list_first_entry_or_null(&hnode->freelist, struct gm_page, gm_page_list); + /* Delete from freelist. 
*/ + if (gm_page) { + list_del(&gm_page->gm_page_list); + hnode_free_pages_dec(hnode); + get_gm_page(gm_page); + /* TODO: wakeup swapd if needed. */ + if (need_wake_up_swapd(hnode)) + wake_up_swapd(hnode); + } + spin_unlock(&hnode->freelist_lock); + + return gm_page; +} + +/* + * gm_alloc_page - Allocate a gm_page. + * + * Allocate a gm_page from hnode freelist. If failed to allocate gm_page, try + * to import memory from device. And if failed to import memory, try to swap + * several gm_pages to host and allocate gm_page again. + */ +struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode) +{ + struct gm_page *gm_page; + struct gm_dev *gm_dev; + int retry_times = 0; + int ret = 0; + + if (hnode->dev) + gm_dev = hnode->dev; + else + return NULL; + +retry: + gm_page = get_gm_page_from_freelist(hnode); + if (!gm_page && can_import(hnode) && !hnode->import_failed) { + /* Import pages from device. */ + ret = gm_dev->mmu->import_phys_mem(mm, hnode->id, NUM_IMPORT_PAGES); + if (!ret) + goto retry; + hnode->import_failed = true; + } + + /* Try to swap pages. */ + if (!gm_page) { + if (retry_times > MAX_SWAP_RETRY_TIMES) + return NULL; + gm_do_swap(hnode); + retry_times++; + goto retry; + } + + return gm_page; +} + diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c new file mode 100644 index 000000000000..c8b4e200ea90 --- /dev/null +++ b/mm/gmem_stat.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GMEM statistics. + * + * Copyright (C) 2025- Huawei, Inc. + * Author: Bin Wang + * + */ + +#include +#include +#include + +static struct kobject *gm_kobj; + +struct hnode_kobject { + struct kobject kobj; + unsigned int hnid; +}; + +#define HNODE_NAME_LEN 32 + +static struct hnode *get_hnode_kobj(struct kobject *kobj) +{ + struct hnode *hnode; + struct hnode_kobject *hnode_kobj; + + hnode_kobj = container_of(kobj, struct hnode_kobject, kobj); + hnode = get_hnode(hnode_kobj->hnid); + if (!hnode) { + gmem_err("%s: failed to get hnode from kobject", __func__); + return NULL; + } + + return hnode; +} + + +static ssize_t max_memsize_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%lu\n", hnode->max_memsize); +} + +static ssize_t max_memsize_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + hnode->max_memsize = memparse(buf, NULL) & (~(HPAGE_SIZE - 1)); + return count; +} + +static struct kobj_attribute max_memsize_attr = + __ATTR(max_memsize, 0644, max_memsize_show, max_memsize_store); + +static ssize_t nr_freepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%u\n", atomic_read(&hnode->nr_free_pages)); +} + +static struct kobj_attribute nr_freepages_attr = + __ATTR(nr_freepages, 0444, nr_freepages_show, NULL); + +static ssize_t nr_activepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%u\n", atomic_read(&hnode->nr_active_pages)); +} + +static struct kobj_attribute nr_activepages_attr = + __ATTR(nr_activepages, 0444, nr_activepages_show, NULL); + +static struct attribute *hnode_attrs[] = { + &max_memsize_attr.attr, + &nr_freepages_attr.attr, + 
&nr_activepages_attr.attr,
+	NULL,
+};
+
+static struct attribute_group hnode_attr_group = {
+	.attrs = hnode_attrs,
+};
+
+static void hnode_kobj_release(struct kobject *kobj)
+{
+	struct hnode_kobject *hnode_kobj =
+		container_of(kobj, struct hnode_kobject, kobj);
+	kfree(hnode_kobj);
+}
+
+static const struct kobj_type hnode_kobj_ktype = {
+	.release = hnode_kobj_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+};
+
+int hnode_init_sysfs(unsigned int hnid)
+{
+	int ret;
+	struct hnode_kobject *hnode_kobj;
+
+	hnode_kobj = kzalloc(sizeof(struct hnode_kobject), GFP_KERNEL);
+	if (!hnode_kobj)
+		return -ENOMEM;
+
+	ret = kobject_init_and_add(&hnode_kobj->kobj, &hnode_kobj_ktype,
+			gm_kobj, "hnode%u", hnid);
+	if (ret) {
+		gmem_err("%s: failed to init hnode object", __func__);
+		goto free_hnode_kobj;
+	}
+
+	ret = sysfs_create_group(&hnode_kobj->kobj, &hnode_attr_group);
+	if (ret) {
+		gmem_err("%s: failed to register hnode group", __func__);
+		goto delete_hnode_kobj;
+	}
+
+	hnode_kobj->hnid = hnid;
+	return 0;
+
+delete_hnode_kobj:
+	kobject_put(&hnode_kobj->kobj);
+free_hnode_kobj:
+	kfree(hnode_kobj);
+	return ret;
+}
+EXPORT_SYMBOL(hnode_init_sysfs);
+
+int __init gm_init_sysfs(void)
+{
+	gm_kobj = kobject_create_and_add("gmem", mm_kobj);
+	if (!gm_kobj) {
+		gmem_err("%s: failed to create gmem object", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void gm_deinit_sysfs(void)
+{
+	kobject_put(gm_kobj);
+}
diff --git a/mm/mmap.c b/mm/mmap.c
index 9da8268345b0..588b934f551f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2624,7 +2624,7 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar
 	enum gm_ret ret;
 	struct gm_context *ctx, *tmp;
 	struct gm_mapping *gm_mapping;
-
+	struct hnode *hnode;
 	struct gm_fault_t gmf = {
 		.mm = mm,
 		.copy = false,
@@ -2663,11 +2663,21 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar
 		gmf.size = HPAGE_SIZE;
 		gmf.dev = gm_mapping->dev;
 		ret = gm_mapping->dev->mmu->peer_unmap(&gmf);
-		if (ret != GM_RET_SUCCESS) {
-			gmem_err("%s: call dev peer_unmap error %d\n", __func__, ret);
+		if (ret != GM_RET_SUCCESS)
+			gmem_err("%s: call dev peer_unmap error %d", __func__, ret);
+
+		/*
+		 * Regardless of whether the gm_page is unmapped, we should release it.
+		 */
+		hnode = get_hnode(gm_mapping->gm_page->hnid);
+		if (!hnode) {
 			mutex_unlock(&gm_mapping->lock);
 			continue;
 		}
+		hnode_activelist_del(hnode, gm_mapping->gm_page);
+		hnode_active_pages_dec(hnode);
+		put_gm_page(gm_mapping->gm_page);
+		gm_mapping->gm_page = NULL;
 		mutex_unlock(&gm_mapping->lock);
 	} while (addr += HPAGE_SIZE, addr != end);
 
@@ -2805,10 +2815,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 			locked_vm += vma_pages(next);
 
 		count++;
-#ifdef CONFIG_GMEM
-		if (gmem_is_enabled())
-			munmap_single_vma_in_peer_devices(mm, vma, start, end);
-#endif
 		if (unlikely(uf)) {
 			/*
 			 * If userfaultfd_unmap_prep returns an error the vmas
--
Gitee

From 15e9380d2c4a1054d3f763cb7050e210e2e7e657 Mon Sep 17 00:00:00 2001
From: Bin Wang
Date: Tue, 23 Sep 2025 21:33:47 +0800
Subject: [PATCH 19/34] gmem_phys: Fix the oops issue caused by concurrency in
 the evict and unmap processes.

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN

---------------------------------------------

When an exiting process unmaps a VMA, the swap process may be in a
window where it can still see the rmap (gm_page->mm and gm_page->va).
This causes a use-after-free (UAF) bug. Adding and removing the rmap
should therefore be protected by a lock.
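
In sketch form, the window and the rule enforced by this patch
(condensed from gm_evict_page_locked() and gmem_unmap_vma_pages();
these are fragments, not complete functions):

/* Evict side: read the rmap only under rmap_lock. */
spin_lock(&gm_page->rmap_lock);
if (!gm_page->mm) {
	/* Raced with unmap; the page is already being torn down. */
	spin_unlock(&gm_page->rmap_lock);
	return GM_EVICT_UNMAP;
}
mm = gm_page->mm;
va = gm_page->va;
vma = find_vma(mm, va);		/* still protected by rmap_lock */
...
spin_unlock(&gm_page->rmap_lock);

/* Unmap side: clear the rmap before dropping the reference
 * that may free the gm_page.
 */
gm_page_remove_rmap(gm_page);	/* takes rmap_lock, clears mm and va */
hnode_activelist_del(hnode, gm_page);
hnode_active_pages_dec(hnode);
put_gm_page(gm_page);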
Signed-off-by: Bin Wang --- include/linux/gmem.h | 54 +++++++++------- mm/gmem.c | 44 ++++++-------- mm/gmem_phys.c | 142 +++++++++++++++++++++++++++++-------------- mm/gmem_stat.c | 44 ++++++++++++++ mm/memory.c | 61 +++++++++++++++++++ mm/mmap.c | 5 +- mm/vm_object.c | 2 +- 7 files changed, 257 insertions(+), 95 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 15e341f38953..e95d9a5be3c5 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -224,14 +224,13 @@ struct gm_dev { struct gm_mapping *gm_mapping; }; -#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ -#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ -#define GM_PAGE_DEVICE 0x20 -#define GM_PAGE_NOMAP 0x40 -#define GM_PAGE_PINNED 0x80 -#define GM_PAGE_WILLNEED 0x100 +#define GM_MAPPING_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ +#define GM_MAPPING_DEVICE 0x20 +#define GM_MAPPING_NOMAP 0x40 +#define GM_MAPPING_PINNED 0x80 +#define GM_MAPPING_WILLNEED 0x100 -#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) +#define GM_MAPPING_TYPE_MASK (GM_MAPPING_CPU | GM_MAPPING_DEVICE | GM_MAPPING_NOMAP) /* Records the status of a page-size physical page */ struct gm_mapping { @@ -248,8 +247,8 @@ struct gm_mapping { static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) { - if (flags & GM_PAGE_TYPE_MASK) - gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + if (flags & GM_MAPPING_TYPE_MASK) + gm_mapping->flag &= ~GM_MAPPING_TYPE_MASK; gm_mapping->flag |= flags; } @@ -261,27 +260,17 @@ static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int fla static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) { - return !!(gm_mapping->flag & GM_PAGE_CPU); + return !!(gm_mapping->flag & GM_MAPPING_CPU); } static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) { - return !!(gm_mapping->flag & GM_PAGE_DEVICE); + return !!(gm_mapping->flag & GM_MAPPING_DEVICE); } static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) { - return !!(gm_mapping->flag & GM_PAGE_NOMAP); -} - -static inline bool gm_mapping_willneed(struct gm_mapping *gm_mapping) -{ - return !!(gm_mapping->flag & GM_PAGE_WILLNEED); -} - -static inline bool gm_mapping_pinned(struct gm_mapping *gm_mapping) -{ - return !!(gm_mapping->flag & GM_PAGE_PINNED); + return !!(gm_mapping->flag & GM_MAPPING_NOMAP); } #define test_gm_mapping_mapped_on_node(i) { /* implement this */ } @@ -392,10 +381,29 @@ struct gm_page { * */ unsigned long va; struct mm_struct *mm; + spinlock_t rmap_lock; + unsigned int flag; atomic_t refcount; }; +#define GM_PAGE_EVICTING 0x1 + +static inline void gm_page_flags_set(struct gm_page *gm_page, int flags) +{ + gm_page->flag |= flags; +} + +static inline void gm_page_flags_clear(struct gm_page *gm_page, int flags) +{ + gm_page->flag &= ~flags; +} + +static inline bool gm_page_evicting(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_EVICTING); +} + #define NUM_IMPORT_PAGES 16 int __init gm_page_cachep_init(void); @@ -406,6 +414,8 @@ void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); void mark_gm_page_active(struct gm_page *gm_page); +void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); +void gm_page_remove_rmap(struct gm_page *gm_page); int gm_add_pages(unsigned 
int hnid, struct list_head *pages); void gm_free_page(struct gm_page *gm_page); struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); diff --git a/mm/gmem.c b/mm/gmem.c index 30a83c9daced..2d62265e0795 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -302,7 +302,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc page = gm_mapping->page; if (!page) { gmem_err("host gm_mapping page is NULL. Set nomap"); - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); goto unlock; } get_page(page); @@ -328,21 +328,9 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc ret = mmu->peer_map(&gmf); if (ret != GM_RET_SUCCESS) { - if (ret == GM_RET_MIGRATING) { - /* - * gmem page is migrating due to overcommit. - * update page to willneed and this will stop page evicting - */ - gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); - gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); - ret = GM_RET_SUCCESS; - } else { - gmem_err("peer map failed"); - if (page) { - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); - put_page(page); - } - } + gmem_err("peer map failed"); + if (page) + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); goto unlock; } @@ -351,10 +339,9 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc folio_put(page_folio(page)); } - gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_DEVICE); gm_mapping->dev = dev; - gm_page->va = addr; - gm_page->mm = mm; + gm_page_add_rmap(gm_page, mm, addr); gm_mapping->gm_page = gm_page; hnode_activelist_add(hnode, gm_page); hnode_active_pages_inc(hnode); @@ -408,6 +395,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); hnode = get_hnode(gm_mapping->gm_page->hnid); + gm_page_remove_rmap(gm_mapping->gm_page); hnode_activelist_del(hnode, gm_mapping->gm_page); hnode_active_pages_dec(hnode); put_gm_page(gm_mapping->gm_page); @@ -648,11 +636,12 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, continue; } hnode = get_hnode(gm_mapping->gm_page->hnid); + gm_page_remove_rmap(gm_mapping->gm_page); hnode_activelist_del(hnode, gm_mapping->gm_page); hnode_active_pages_dec(hnode); put_gm_page(gm_mapping->gm_page); } - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); mutex_unlock(&gm_mapping->lock); } @@ -789,7 +778,7 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, vma_src = find_vma(mm, src); if (!vma_src || vma_src->vm_start > src || !vma_dest || vma_dest->vm_start > dest) { - gmem_err("hmemcpy: the vma find by src/dest is NULL!\n"); + gmem_err("hmemcpy: the vma find by src/dest is NULL!"); goto unlock_mm; } @@ -797,14 +786,19 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); if (!gm_mapping_src) { - gmem_err("hmemcpy: gm_mapping_src is NULL\n"); + gmem_err("hmemcpy: gm_mapping_src is NULL"); + goto unlock_mm; + } + + if (gm_mapping_nomap(gm_mapping_src)) { + gmem_err("hmemcpy: src address is not mapping to CPU or device"); goto unlock_mm; } if (hnid != -1) { dev = get_gm_dev(hnid); if (!dev) { - gmem_err("hmemcpy: hnode's dev is NULL\n"); + gmem_err("hmemcpy: hnode's dev is NULL"); goto unlock_mm; } } @@ -816,14 +810,14 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned 
long dest, ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); if (ret) { - gmem_err("%s: failed to execute host page fault, ret:%d\n", + gmem_err("%s: failed to execute host page fault, ret:%d", __func__, ret); goto unlock_mm; } } else { ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); if (ret != GM_RET_SUCCESS) { - gmem_err("%s: failed to excecute dev page fault.\n", __func__); + gmem_err("%s: failed to excecute dev page fault.", __func__); goto unlock_mm; } } diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index 1ff4407d5aff..7a8c27a4d27f 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -147,6 +147,7 @@ struct gm_page *alloc_gm_page_struct(void) if (!gm_page) return NULL; atomic_set(&gm_page->refcount, 0); + spin_lock_init(&gm_page->rmap_lock); return gm_page; } EXPORT_SYMBOL(alloc_gm_page_struct); @@ -168,14 +169,18 @@ void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page) void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page) { spin_lock(&hnode->activelist_lock); - list_del(&gm_page->gm_page_list); + /* If a gm_page is being evicted, it is currently located in the + * temporary linked list. */ + if (!gm_page_evicting(gm_page)) + list_del_init(&gm_page->gm_page_list); spin_unlock(&hnode->activelist_lock); } void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page) { spin_lock(&hnode->activelist_lock); - list_move_tail(&gm_page->gm_page_list, &hnode->activelist); + if (!gm_page_evicting(gm_page)) + list_move_tail(&gm_page->gm_page_list, &hnode->activelist); spin_unlock(&hnode->activelist_lock); } @@ -219,82 +224,133 @@ void gm_free_page(struct gm_page *gm_page) hnode_free_pages_inc(hnode); } -static int gm_evict_page_locked(struct gm_page *gm_page) +void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va) +{ + spin_lock(&gm_page->rmap_lock); + gm_page->mm = mm; + gm_page->va = va; + spin_unlock(&gm_page->rmap_lock); +} + +void gm_page_remove_rmap(struct gm_page *gm_page) +{ + spin_lock(&gm_page->rmap_lock); + gm_page->mm = NULL; + gm_page->va = 0; + spin_unlock(&gm_page->rmap_lock); +} + +enum gm_evict_ret { + GM_EVICT_SUCCESS = 0, + GM_EVICT_UNMAP, + GM_EVICT_FALLBACK, + GM_EVICT_DEVERR, +}; + +enum gm_evict_ret gm_evict_page_locked(struct gm_page *gm_page) { struct gm_dev *gm_dev; struct gm_mapping *gm_mapping; struct vm_area_struct *vma; - struct mm_struct *mm = gm_page->mm; + struct mm_struct *mm; struct page *page; struct device *dma_dev; - unsigned long va = gm_page->va; + unsigned long va; struct folio *folio = NULL; struct gm_fault_t gmf = { - .mm = mm, - .va = va, .size = HPAGE_SIZE, .copy = true }; - int ret = 0; + enum gm_evict_ret ret = GM_EVICT_SUCCESS; + int err; gm_dev = get_gm_dev(gm_page->hnid); if (!gm_dev) - return -EINVAL; + return GM_EVICT_DEVERR; + + spin_lock(&gm_page->rmap_lock); + if (!gm_page->mm) { + /* Evicting gm_page conflicts with unmap.*/ + ret = GM_EVICT_UNMAP; + goto rmap_unlock; + } + mm = gm_page->mm; + va = gm_page->va; vma = find_vma(mm, va); if (!vma || !vma->vm_obj) { gmem_err("%s: cannot find vma or vma->vm_obj is null for va %lx", __func__, va); - return -EINVAL; + ret = GM_EVICT_UNMAP; + goto rmap_unlock; } gm_mapping = vm_object_lookup(vma->vm_obj, va); if (!gm_mapping) { gmem_err("%s: no gm_mapping for va %lx", __func__, va); - return -EINVAL; + ret = GM_EVICT_UNMAP; + goto rmap_unlock; } + spin_unlock(&gm_page->rmap_lock); + mutex_lock(&gm_mapping->lock); if 
(!gm_mapping_device(gm_mapping)) { - gmem_err("%s: evicting gm_page conflicts with unmap.", __func__); - ret = 0; + /* Evicting gm_page conflicts with unmap.*/ + ret = GM_EVICT_UNMAP; + goto gm_mapping_unlock; + } + + if (gm_mapping->gm_page != gm_page) { + /* gm_mapping maps to another gm_page. */ + ret = GM_EVICT_UNMAP; goto gm_mapping_unlock; } folio = vma_alloc_folio(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, va, true); if (!folio) { gmem_err("%s: allocate host page failed.", __func__); - ret = -ENOMEM; + ret = GM_EVICT_FALLBACK; goto gm_mapping_unlock; } page = &folio->page; + gmf.mm = mm; + gmf.va = va; gmf.dev = gm_dev; gmf.pfn = gm_page->dev_pfn; dma_dev = gm_dev->dma_dev; gmf.dma_addr = dma_map_page(dma_dev, page, 0, HPAGE_SIZE, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) { gmem_err("%s: dma map failed.", __func__); - ret = -EINVAL; + ret = GM_EVICT_FALLBACK; goto gm_mapping_unlock; } - ret = gm_dev->mmu->peer_unmap(&gmf); - if (ret) + err = gm_dev->mmu->peer_unmap(&gmf); + if (err) { gmem_err("%s: peer_unmap failed.", __func__); + ret = GM_EVICT_DEVERR; + goto dma_unmap; + } - dma_unmap_page(dma_dev, gmf.dma_addr, HPAGE_SIZE, DMA_BIDIRECTIONAL); - gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + gm_page_remove_rmap(gm_page); gm_mapping->page = page; put_gm_page(gm_page); +dma_unmap: + dma_unmap_page(dma_dev, gmf.dma_addr, HPAGE_SIZE, DMA_BIDIRECTIONAL); gm_mapping_unlock: mutex_unlock(&gm_mapping->lock); return ret; +rmap_unlock: + spin_unlock(&gm_page->rmap_lock); + return ret; } -static int gm_evict_page(struct gm_page *gm_page) +enum gm_evict_ret gm_evict_page(struct gm_page *gm_page) { struct mm_struct *mm = gm_page->mm; - int ret; + enum gm_evict_ret ret; mmap_read_lock(mm); ret = gm_evict_page_locked(gm_page); @@ -315,6 +371,7 @@ static void gm_do_swap(struct hnode *hnode) list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) { /* Move gm_page to temporary list. */ get_gm_page(gm_page); + gm_page_flags_set(gm_page, GM_PAGE_EVICTING); list_move(&gm_page->gm_page_list, &swap_list); nr_swap_pages++; if (nr_swap_pages >= NUM_SWAP_PAGES) @@ -323,33 +380,26 @@ static void gm_do_swap(struct hnode *hnode) spin_unlock(&hnode->activelist_lock); list_for_each_entry_safe(gm_page, n, &swap_list, gm_page_list) { - list_del(&gm_page->gm_page_list); + list_del_init(&gm_page->gm_page_list); ret = gm_evict_page_locked(gm_page); - if (ret) { - gmem_err("%s: evict gm_page %lx failed, va %lx", __func__, - (unsigned long)gm_page, gm_page->va); - if (ret == -ENOMEM) { - /* - * Failed to allocate host page, so return gm_page - * to activelist. - */ - hnode_activelist_add(hnode, gm_page); - } else { - /* - * Conflicts with process exit, so return gm_page - * to freelist to avoid memory leak. - */ - atomic_set(&gm_page->refcount, 0); - hnode_freelist_add(hnode, gm_page); - hnode_active_pages_dec(hnode); - hnode_free_pages_inc(hnode); - } + gm_page_flags_clear(gm_page, GM_PAGE_EVICTING); + if (ret == GM_EVICT_UNMAP) { + /* Evicting gm_page conflicts with unmap.*/ + put_gm_page(gm_page); + } else if (ret == GM_EVICT_FALLBACK) { + /* An error occurred with the host, and gm_page needs + * to be added back to the activelist. */ + hnode_activelist_add(hnode, gm_page); + put_gm_page(gm_page); + } else if (ret == GM_EVICT_DEVERR) { + /* It generally occurs when the process has already + * exited, at which point gm_page needs to be returned + * to the freelist. 
*/ + put_gm_page(gm_page); + } else { + hnode_active_pages_dec(hnode); put_gm_page(gm_page); - continue; } - - hnode_active_pages_dec(hnode); - put_gm_page(gm_page); } }; @@ -411,7 +461,7 @@ static struct gm_page *get_gm_page_from_freelist(struct hnode *hnode) gm_page = list_first_entry_or_null(&hnode->freelist, struct gm_page, gm_page_list); /* Delete from freelist. */ if (gm_page) { - list_del(&gm_page->gm_page_list); + list_del_init(&gm_page->gm_page_list); hnode_free_pages_dec(hnode); get_gm_page(gm_page); /* TODO: wakeup swapd if needed. */ diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c index c8b4e200ea90..032c93d1e1c5 100644 --- a/mm/gmem_stat.c +++ b/mm/gmem_stat.c @@ -91,10 +91,54 @@ static ssize_t nr_activepages_show(struct kobject *kobj, static struct kobj_attribute nr_activepages_attr = __ATTR(nr_activepages, 0444, nr_activepages_show, NULL); +static ssize_t nr_freelist_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int nr_freelist = 0; + struct gm_page *gm_page; + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + spin_lock(&hnode->freelist_lock); + list_for_each_entry(gm_page, &hnode->freelist, gm_page_list) { + nr_freelist++; + } + spin_unlock(&hnode->freelist_lock); + return sprintf(buf, "%u\n", nr_freelist); +} + +static struct kobj_attribute nr_freelist_attr = + __ATTR(nr_freelist, 0444, nr_freelist_show, NULL); + +static ssize_t nr_activelist_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int nr_activelist = 0; + struct gm_page *gm_page; + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + spin_lock(&hnode->activelist_lock); + list_for_each_entry(gm_page, &hnode->activelist, gm_page_list) { + nr_activelist++; + } + spin_unlock(&hnode->activelist_lock); + return sprintf(buf, "%u\n", nr_activelist); +} + +static struct kobj_attribute nr_activelist_attr = + __ATTR(nr_activelist, 0444, nr_activelist_show, NULL); + static struct attribute *hnode_attrs[] = { &max_memsize_attr.attr, &nr_freepages_attr.attr, &nr_activepages_attr.attr, + &nr_freelist_attr.attr, + &nr_activelist_attr.attr, NULL, }; diff --git a/mm/memory.c b/mm/memory.c index 0e4097f99f19..5fc9346749e5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1937,6 +1937,64 @@ static void unmap_single_vma(struct mmu_gather *tlb, } } +#ifdef CONFIG_GMEM + +static void unmap_single_peer_shared_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end, addr; + struct vm_object *obj = vma->vm_obj; + struct gm_mapping *gm_mapping; + struct hnode *hnode; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (!obj) + return; + + if (!mm->gm_as) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + /* + * Regardless of whether the gm_page is unmapped, we should release it. 
+ */ + hnode = get_hnode(gm_mapping->gm_page->hnid); + if (!hnode) { + mutex_unlock(&gm_mapping->lock); + continue; + } + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + gm_mapping->gm_page = NULL; + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); +} + +#endif + /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather @@ -1980,6 +2038,9 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); +#ifdef CONFIG_GMEM + unmap_single_peer_shared_vma(vma->vm_mm, vma, start, end); +#endif vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); diff --git a/mm/mmap.c b/mm/mmap.c index 588b934f551f..771bb8ae2417 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2617,7 +2617,7 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, #ifdef CONFIG_GMEM static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr) + unsigned long start_addr, unsigned long end_addr) { unsigned long start, end, addr; struct vm_object *obj = vma->vm_obj; @@ -2661,6 +2661,7 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar gmf.va = addr; gmf.size = HPAGE_SIZE; + gmf.pfn = gm_mapping->gm_page->dev_pfn; gmf.dev = gm_mapping->dev; ret = gm_mapping->dev->mmu->peer_unmap(&gmf); if (ret != GM_RET_SUCCESS) @@ -2674,9 +2675,11 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar mutex_unlock(&gm_mapping->lock); continue; } + gm_page_remove_rmap(gm_mapping->gm_page); hnode_activelist_del(hnode, gm_mapping->gm_page); hnode_active_pages_dec(hnode); put_gm_page(gm_mapping->gm_page); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); gm_mapping->gm_page = NULL; mutex_unlock(&gm_mapping->lock); } while (addr += HPAGE_SIZE, addr != end); diff --git a/mm/vm_object.c b/mm/vm_object.c index 43dc927477bc..427be06a6adb 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -51,7 +51,7 @@ struct gm_mapping *alloc_gm_mapping(void) if (!gm_mapping) return NULL; - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); mutex_init(&gm_mapping->lock); return gm_mapping; -- Gitee From 2651feb197d01aa9eccd40c7945b46f22c0192d7 Mon Sep 17 00:00:00 2001 From: Bin Wang Date: Thu, 25 Sep 2025 09:14:02 +0800 Subject: [PATCH 20/34] gmem_phys: Fix memory leak issue after peer_map failure. euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Release gm_page when fail to do device mapping. 
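For clarity, the failure path in gm_dev_fault_locked() after this fix is
expected to look roughly like the sketch below (context trimmed to the
relevant branch; the only functional addition is the put_gm_page() call,
which drops the reference taken when the gm_page was allocated so the
page can return to the freelist instead of leaking):

	ret = mmu->peer_map(&gmf);
	if (ret != GM_RET_SUCCESS) {
		gmem_err("peer map failed");
		if (page)
			gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU);
		/* Drop the allocation reference on the unused device page. */
		put_gm_page(gm_page);
		goto unlock;
	}
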
Signed-off-by: Bin Wang --- mm/gmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/gmem.c b/mm/gmem.c index 2d62265e0795..2a7666005f1d 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -331,6 +331,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gmem_err("peer map failed"); if (page) gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + put_gm_page(gm_page); goto unlock; } -- Gitee From bdec4ae7388ec23eae81da372f7219b47ace9153 Mon Sep 17 00:00:00 2001 From: Bin Wang Date: Thu, 25 Sep 2025 09:34:35 +0800 Subject: [PATCH 21/34] gmem_stat: Remove read permission for regular users euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Only root can read active and free pages number. Signed-off-by: Bin Wang --- mm/gmem_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c index 032c93d1e1c5..34dfd5a7ba40 100644 --- a/mm/gmem_stat.c +++ b/mm/gmem_stat.c @@ -61,7 +61,7 @@ static ssize_t max_memsize_store(struct kobject *kobj, } static struct kobj_attribute max_memsize_attr = - __ATTR(max_memsize, 0644, max_memsize_show, max_memsize_store); + __ATTR(max_memsize, 0640, max_memsize_show, max_memsize_store); static ssize_t nr_freepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -75,7 +75,7 @@ static ssize_t nr_freepages_show(struct kobject *kobj, } static struct kobj_attribute nr_freepages_attr = - __ATTR(nr_freepages, 0444, nr_freepages_show, NULL); + __ATTR(nr_freepages, 0440, nr_freepages_show, NULL); static ssize_t nr_activepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -- Gitee From 6e03826f59c08af013594531f89b8296fd13ba3c Mon Sep 17 00:00:00 2001 From: xiuqing Date: Wed, 15 Oct 2025 19:08:55 +0800 Subject: [PATCH 22/34] gmem: support pin and unpin gm_page euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Some memory in device should not be swapped to host, such as message queue memory in device communication. We introduce pin flag in gm_page and skip swapping those to host. 
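The eviction path is expected to honour the new flag roughly as in the
trimmed sketch of the gm_do_swap() scan below: pinned pages are skipped
when building the temporary swap list, so they are never migrated back
to the host while the pin is held.

	spin_lock(&hnode->activelist_lock);
	list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) {
		/* Pinned device memory (e.g. message queues) must stay resident. */
		if (gm_page_pinned(gm_page)) {
			gmem_err("%s: va %lx is pinned!", __func__, gm_page->va);
			continue;
		}
		/* Everything else is a candidate for eviction to the host. */
		get_gm_page(gm_page);
		gm_page_flags_set(gm_page, GM_PAGE_EVICTING);
		list_move(&gm_page->gm_page_list, &swap_list);
		nr_swap_pages++;
		if (nr_swap_pages >= NUM_SWAP_PAGES)
			break;
	}
	spin_unlock(&hnode->activelist_lock);
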
Signed-off-by: xiuqing --- include/linux/gmem.h | 8 +++++ include/uapi/asm-generic/mman-common.h | 1 + mm/gmem.c | 30 +++++++++++++--- mm/gmem_phys.c | 47 ++++++++++++++++++++++++++ mm/huge_memory.c | 6 ++++ 5 files changed, 87 insertions(+), 5 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index e95d9a5be3c5..38320f79c8d3 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -388,6 +388,7 @@ struct gm_page { }; #define GM_PAGE_EVICTING 0x1 +#define GM_PAGE_PINNED 0x2 static inline void gm_page_flags_set(struct gm_page *gm_page, int flags) { @@ -404,6 +405,11 @@ static inline bool gm_page_evicting(struct gm_page *gm_page) return !!(gm_page->flag & GM_PAGE_EVICTING); } +static inline bool gm_page_pinned(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_PINNED); +} + #define NUM_IMPORT_PAGES 16 int __init gm_page_cachep_init(void); @@ -414,6 +420,8 @@ void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); void mark_gm_page_active(struct gm_page *gm_page); +void mark_gm_page_pinned(struct gm_page *gm_page); +void mark_gm_page_unpinned(struct gm_page *gm_page); void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); void gm_page_remove_rmap(struct gm_page *gm_page); int gm_add_pages(unsigned int hnid, struct list_head *pages); diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index d8857c71d4bb..19e22492a85b 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -85,6 +85,7 @@ #define MADV_GMEM_BASE 0x1000 #define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ #define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ +#define MADV_PINNED_REMOVE (MADV_GMEM_BASE+2) /* unpin these pages */ #define MADV_ETMEM_BASE 0x1100 #define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ diff --git a/mm/gmem.c b/mm/gmem.c index 2a7666005f1d..5585dfa3fb23 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -291,10 +291,17 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc if (gm_mapping_nomap(gm_mapping)) { goto peer_map; } else if (gm_mapping_device(gm_mapping)) { - if (behavior == MADV_WILLNEED) { + switch (behavior) { + case MADV_PINNED: + mark_gm_page_pinned(gm_mapping->gm_page); + fallthrough; + case MADV_WILLNEED: mark_gm_page_active(gm_mapping->gm_page); goto unlock; - } else { + case MADV_PINNED_REMOVE: + mark_gm_page_unpinned(gm_mapping->gm_page); + goto unlock; + default: ret = 0; goto unlock; } @@ -344,6 +351,12 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gm_mapping->dev = dev; gm_page_add_rmap(gm_page, mm, addr); gm_mapping->gm_page = gm_page; + + if (behavior == MADV_PINNED) + mark_gm_page_pinned(gm_page); + else if (behavior == MADV_PINNED_REMOVE) + mark_gm_page_unpinned(gm_page); + hnode_activelist_add(hnode, gm_page); hnode_active_pages_inc(hnode); unlock: @@ -494,6 +507,7 @@ struct prefetch_data { unsigned long addr; size_t size; struct work_struct work; + int behavior; int *res; }; @@ -508,7 +522,7 @@ static void prefetch_work_cb(struct work_struct *work) do { /* MADV_WILLNEED: dev will soon access this addr. 
*/ mmap_read_lock(d->mm); - ret = gm_dev_fault_locked(d->mm, addr, d->dev, MADV_WILLNEED); + ret = gm_dev_fault_locked(d->mm, addr, d->dev, d->behavior); mmap_read_unlock(d->mm); if (ret == GM_RET_PAGE_EXIST) { gmem_err("%s: device has done page fault, ignore prefetch\n", @@ -522,7 +536,7 @@ static void prefetch_work_cb(struct work_struct *work) kfree(d); } -static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size) +static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size, int behavior) { unsigned long start, end, per_size; int page_size = HPAGE_SIZE; @@ -578,6 +592,7 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s data->mm = current->mm; data->dev = dev; data->addr = start; + data->behavior = behavior; data->res = &res; if (per_size == 0) data->size = size; @@ -745,7 +760,12 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) no_hnid: switch (behavior) { case MADV_PREFETCH: - return hmadvise_do_prefetch(dev, start, len_in); + behavior = MADV_WILLNEED; + fallthrough; + case MADV_PINNED_REMOVE: + fallthrough; + case MADV_PINNED: + return hmadvise_do_prefetch(dev, start, len_in, behavior); case MADV_DONTNEED: return hmadvise_do_eagerfree(start, len_in); default: diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index 7a8c27a4d27f..50550cd4b2ff 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -194,6 +194,43 @@ void mark_gm_page_active(struct gm_page *gm_page) hnode_activelist_del_and_add(hnode, gm_page); } +void mark_gm_page_pinned(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + spin_lock(&hnode->activelist_lock); + if (gm_page_evicting(gm_page)) { + gmem_err("%s: maybe page has been evicted!", __func__); + goto unlock; + } else if (gm_page_pinned(gm_page)) { + goto unlock; + } + gm_page_flags_set(gm_page, GM_PAGE_PINNED); + +unlock: + spin_unlock(&hnode->activelist_lock); +} + +void mark_gm_page_unpinned(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + spin_lock(&hnode->activelist_lock); + if (!gm_page_pinned(gm_page) || gm_page_evicting(gm_page)) + goto unlock; + + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); + +unlock: + spin_unlock(&hnode->activelist_lock); +} + int gm_add_pages(unsigned int hnid, struct list_head *pages) { struct hnode *hnode; @@ -207,6 +244,7 @@ int gm_add_pages(unsigned int hnid, struct list_head *pages) list_del(&gm_page->gm_page_list); hnode_freelist_add(hnode, gm_page); hnode_free_pages_inc(hnode); + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); } return 0; @@ -369,6 +407,10 @@ static void gm_do_swap(struct hnode *hnode) spin_lock(&hnode->activelist_lock); list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) { + if (gm_page_pinned(gm_page)) { + gmem_err("%s: va %lx is pinned!", __func__, gm_page->va); + continue; + } /* Move gm_page to temporary list. */ get_gm_page(gm_page); gm_page_flags_set(gm_page, GM_PAGE_EVICTING); @@ -461,6 +503,11 @@ static struct gm_page *get_gm_page_from_freelist(struct hnode *hnode) gm_page = list_first_entry_or_null(&hnode->freelist, struct gm_page, gm_page_list); /* Delete from freelist. 
*/ if (gm_page) { + if (gm_page_pinned(gm_page)) { + gmem_err("%s: gm_page %lx from freelist has pinned flag, clear it!", + __func__, (unsigned long)gm_page); + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); + } list_del_init(&gm_page->gm_page_list); hnode_free_pages_dec(hnode); get_gm_page(gm_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 086f5b692973..812241e0868a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1434,6 +1434,12 @@ static vm_fault_t __do_peer_shared_anonymous_page(struct vm_fault *vmf) mutex_lock(&gm_mapping->lock); + if (gm_mapping_device(gm_mapping) && gm_page_pinned(gm_mapping->gm_page)) { + pr_err("page is pinned! addr is %lx\n", gm_mapping->gm_page->va); + ret = VM_FAULT_SIGBUS; + goto release; + } + if (gm_mapping_cpu(gm_mapping)) folio = page_folio(gm_mapping->page); if (!folio) { -- Gitee From 858e2e891e80e2ec77e4bf790e06829b1d40972a Mon Sep 17 00:00:00 2001 From: xiuqing Date: Wed, 15 Oct 2025 20:20:02 +0800 Subject: [PATCH 23/34] gmem: Expand hmemcpy to support copy between gmem and not gmem, besides consider overlimit euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Memory copy is just allowed between gmem vma before. We support memory copy between gmem vma and un-gmem vma now. Signed-off-by: xiuqing --- mm/gmem.c | 212 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 178 insertions(+), 34 deletions(-) diff --git a/mm/gmem.c b/mm/gmem.c index 5585dfa3fb23..382a116ec8df 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -781,6 +781,26 @@ static bool hnid_match_dest(int hnid, struct gm_mapping *dest) return (hnid < 0) ? gm_mapping_cpu(dest) : gm_mapping_device(dest); } +static void cpu_page_copy(struct page *dst_page, unsigned long dst_offset, + struct page *src_page, unsigned long src_offset, size_t size) +{ + unsigned long src, dst; + + src = (unsigned long)page_address(src_page) + src_offset; + dst = (unsigned long)page_address(dst_page) + dst_offset; + if (!src || !dst) { + gmem_err("%s: src (%lx) or dst (%lx) is invalid!", __func__, src, dst); + return; + } + memcpy((void *)dst, (void *)src, size); +} + +enum gmem_copy_dir { + COPY_GMEM_TO_NORM, + COPY_NORM_TO_GMEM, + COPY_GMEM_TO_GMEM, +}; + static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, unsigned long src, size_t size) { @@ -790,6 +810,9 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, struct gm_mapping *gm_mapping_dest, *gm_mapping_src; struct gm_dev *dev = NULL; struct gm_memcpy_t gmc = {0}; + enum gmem_copy_dir dir; + struct page *trans_hpage; + void *trans_addr; if (size == 0) return; @@ -803,24 +826,39 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, goto unlock_mm; } - gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); - gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); - - if (!gm_mapping_src) { - gmem_err("hmemcpy: gm_mapping_src is NULL"); + if (vma_is_peer_shared(vma_src) && vma_is_peer_shared(vma_dest)) { + dir = COPY_GMEM_TO_GMEM; + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + } else if (vma_is_peer_shared(vma_src)) { + dir = COPY_GMEM_TO_NORM; + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + gm_mapping_dest = NULL; + } else if (vma_is_peer_shared(vma_dest)) { + dir = COPY_NORM_TO_GMEM; + 
gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = NULL; + } else { + gmem_err("%s: src %lx and dest %lx both not gmem addr!", __func__, src, dest); goto unlock_mm; } - if (gm_mapping_nomap(gm_mapping_src)) { - gmem_err("hmemcpy: src address is not mapping to CPU or device"); + trans_hpage = alloc_pages(GFP_TRANSHUGE, HPAGE_PMD_ORDER); + if (!trans_hpage) { + gmem_err("%s: alloc trans_hpage failed!", __func__); goto unlock_mm; } + trans_addr = page_to_virt(trans_hpage); + + if (dir != COPY_NORM_TO_GMEM && (!gm_mapping_src || gm_mapping_nomap(gm_mapping_src))) + gmem_err("%s: gm_mapping_src is NULL or still not mapped! addr is %lx", + __func__, src); if (hnid != -1) { dev = get_gm_dev(hnid); if (!dev) { gmem_err("hmemcpy: hnode's dev is NULL"); - goto unlock_mm; + goto free_trans_page; } } @@ -828,49 +866,141 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest) || !hnid_match_dest(hnid, gm_mapping_dest)) { if (hnid == -1) { - ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | - FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); - if (ret) { - gmem_err("%s: failed to execute host page fault, ret:%d", - __func__, ret); - goto unlock_mm; + if (gm_mapping_dest && gm_mapping_device(gm_mapping_dest) + && gm_page_pinned(gm_mapping_dest->gm_page)) { + gmem_err("%s: dest %lx is pinned on device, skip handle_mm_fault", + __func__, dest); + } else { + ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), + FAULT_FLAG_USER | FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, + NULL); + if (ret) { + gmem_err("%s: failed to execute host page fault, ret:%d", + __func__, ret); + goto free_trans_page; + } } } else { ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); if (ret != GM_RET_SUCCESS) { gmem_err("%s: failed to excecute dev page fault.", __func__); - goto unlock_mm; + goto free_trans_page; } } } - if (!gm_mapping_dest) + if (!gm_mapping_dest && dir != COPY_GMEM_TO_NORM) gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) mutex_lock(&gm_mapping_dest->lock); - mutex_lock(&gm_mapping_src->lock); + if (gm_mapping_src) + mutex_lock(&gm_mapping_src->lock); // Use memcpy when there is no device address, otherwise use peer_memcpy - if (hnid == -1) { + if (dir == COPY_GMEM_TO_NORM) { + if (!gm_mapping_src) { + gmem_err("%s: do COPY_GMEM_TO_NORM but gm_mapping_src is NULL!", __func__); + goto unlock_gm_mapping; + } if (gm_mapping_cpu(gm_mapping_src)) { // host to host - gmem_err("hmemcpy: host to host is unimplemented\n"); - goto unlock_gm_mmaping; - } else { // device to host + cpu_page_copy(trans_hpage, + (unsigned long)trans_addr & (page_size - 1), + gm_mapping_src->page, src & (page_size - 1), + size); + goto copy_to_norm_dest; + } else if (gm_mapping_device(gm_mapping_src)) { // device to host dev = gm_mapping_src->dev; - gmc.dest = phys_to_dma(dev->dma_dev, - page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1))); + gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + + ((unsigned long)trans_addr & (page_size - 1))); gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1)); gmc.kind = GM_MEMCPY_D2H; + } else { + gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_NORM"); + goto unlock_gm_mapping; } - } else { - if (gm_mapping_cpu(gm_mapping_src)) { // host to device + } else if (dir == COPY_NORM_TO_GMEM) 
{ + if (!gm_mapping_dest) { + gmem_err("%s: do COPY_NORM_TO_GMEM but gm_mapping_dest is NULL!", __func__); + goto unlock_gm_mapping; + } + if (copy_from_user(trans_addr, (void __user *)src, size) > 0) + gmem_err("copy normal src %lx to trans failed", src); + if (gm_mapping_cpu(gm_mapping_dest)) { // host to host + cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1), + trans_hpage, (unsigned long)trans_addr & (page_size - 1), size); + goto unlock_gm_mapping; + } else if (gm_mapping_device(gm_mapping_dest)) { + if (!dev) { + gmem_err("%s: do COPY_NORM_TO_GMEM but dev is NULL, hnid is %d", + __func__, hnid); + goto unlock_gm_mapping; + } gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + - (dest & (page_size - 1)); - gmc.src = phys_to_dma(dev->dma_dev, - page_to_phys(gm_mapping_src->page) + (src & (page_size - 1))); + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + + ((unsigned long)trans_addr & (page_size - 1))); gmc.kind = GM_MEMCPY_H2D; } else { // device to device - gmem_err("hmemcpy: device to device is unimplemented\n"); - goto unlock_gm_mmaping; + gmem_err("gm_mapping_dest bad status, dir is COPY_NORM_TO_GMEM\n"); + goto unlock_gm_mapping; + } + } else if (dir == COPY_GMEM_TO_GMEM) { + if (gm_mapping_cpu(gm_mapping_src)) { + if (gm_mapping_cpu(gm_mapping_dest)) { + cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1), + gm_mapping_src->page, src & (page_size - 1), size); + goto unlock_gm_mapping; + } else if (gm_mapping_device(gm_mapping_dest)) { + dev = gm_mapping_dest->dev; + gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mapping_src->page) + + (src & (page_size - 1))); + gmc.kind = GM_MEMCPY_H2D; + } else { + gmem_err("gm_mapping_dest bad status, src is on host!"); + goto unlock_gm_mapping; + } + } else if (gm_mapping_device(gm_mapping_src)) { + if (gm_mapping_cpu(gm_mapping_dest)) { + dev = gm_mapping_src->dev; + gmc.dest = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mapping_dest->page) + + (dest & (page_size - 1))); + gmc.src = gm_mapping_src->gm_page->dev_dma_addr + + (src & (page_size - 1)); + gmc.kind = GM_MEMCPY_D2H; + } else if (gm_mapping_device(gm_mapping_dest)) { + dev = gm_mapping_src->dev; + gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + + ((unsigned long)trans_addr & (page_size - 1))); + gmc.src = gm_mapping_src->gm_page->dev_dma_addr + + (src & (page_size - 1)); + gmc.kind = GM_MEMCPY_D2H; + gmc.mm = mm; + gmc.dev = dev; + gmc.size = size; + dev->mmu->peer_hmemcpy(&gmc); + + dev = gm_mapping_dest->dev; + gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + + ((unsigned long)trans_addr & (page_size - 1))); + gmc.kind = GM_MEMCPY_H2D; + gmc.mm = mm; + gmc.dev = dev; + gmc.size = size; + dev->mmu->peer_hmemcpy(&gmc); + + goto unlock_gm_mapping; + } else { + gmem_err("gm_mapping_dest bad status, src is on device!"); + goto unlock_gm_mapping; + } + } else { + gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_GMEM"); + goto unlock_gm_mapping; } } gmc.mm = mm; @@ -878,10 +1008,19 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, gmc.size = size; dev->mmu->peer_hmemcpy(&gmc); -unlock_gm_mmaping: - mutex_unlock(&gm_mapping_src->lock); +copy_to_norm_dest: + if (dir == COPY_GMEM_TO_NORM) { + if (copy_to_user((void __user *)dest, trans_addr, size) > 0) + gmem_err("copy trans to normal dest 
%lx failed!", dest); + } + +unlock_gm_mapping: + if (gm_mapping_src) + mutex_unlock(&gm_mapping_src->lock); if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) mutex_unlock(&gm_mapping_dest->lock); +free_trans_page: + __free_pages(trans_hpage, HPAGE_PMD_ORDER); unlock_mm: mmap_read_unlock(mm); } @@ -949,17 +1088,22 @@ int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) vma_src = find_vma(mm, src); if ((ULONG_MAX - size < src) || !vma_src || vma_src->vm_start > src || - !vma_is_peer_shared(vma_src) || vma_src->vm_end < (src + size)) { + vma_src->vm_end < (src + size)) { gmem_err("failed to find peer_shared vma by invalid src or size\n"); goto unlock; } if ((ULONG_MAX - size < dest) || !vma_dest || vma_dest->vm_start > dest || - !vma_is_peer_shared(vma_dest) || vma_dest->vm_end < (dest + size)) { + vma_dest->vm_end < (dest + size)) { gmem_err("failed to find peer_shared vma by invalid dest or size\n"); goto unlock; } + if (!vma_is_peer_shared(vma_src) && !vma_is_peer_shared(vma_dest)) { + mmap_read_unlock(mm); + return -EAGAIN; + } + if (!(vma_dest->vm_flags & VM_WRITE)) { gmem_err("dest is not writable.\n"); goto unlock; -- Gitee From c015c7e41eb210994370a98d5c2427dc589de6cd Mon Sep 17 00:00:00 2001 From: zhangjian Date: Wed, 15 Oct 2025 22:32:09 +0800 Subject: [PATCH 24/34] gmem_phys: remove unexport N_HETEROGENEOUS node type. euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Remove unexport N_HETEROGENEOUS node type to prevent possible conflict of node_states value in future. Signed-off-by: zhangjian --- drivers/base/node.c | 6 ------ include/linux/gmem.h | 7 +------ include/linux/nodemask.h | 12 ------------ mm/gmem_phys.c | 7 +++++++ mm/page_alloc.c | 3 --- 5 files changed, 8 insertions(+), 27 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 0ece939cc5f6..75f78552da5a 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -935,9 +935,6 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), -#ifdef CONFIG_GMEM - [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), -#endif }; static struct attribute *node_state_attrs[] = { @@ -950,9 +947,6 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, -#ifdef CONFIG_GMEM - &node_state_attr[N_HETEROGENEOUS].attr.attr, -#endif NULL }; diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 38320f79c8d3..0d603976f438 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -348,18 +348,13 @@ static inline void hnode_free_pages_dec(struct hnode *hnode) atomic_dec(&hnode->nr_free_pages); } -static inline bool is_hnode(int node) -{ - return (node < MAX_NUMNODES) && !node_isset(node, node_possible_map) && - node_isset(node, hnode_map); -} - static inline int get_hnuma_id(struct gm_dev *gm_dev) { return first_node(gm_dev->registered_hnodes); } void __init hnuma_init(void); +bool is_hnode(int nid); unsigned int alloc_hnode_id(void); void free_hnode_id(unsigned int nid); struct hnode *get_hnode(unsigned int hnid); diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index f005f3d903ae..8d07116caaf1 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,11 +407,6 @@ enum node_states { N_MEMORY, /* The node has 
memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ -#ifdef CONFIG_GMEM -#ifndef __GENKSYMS__ - N_HETEROGENEOUS, /* The node has heterogeneous memory */ -#endif -#endif NR_NODE_STATES }; @@ -541,13 +536,6 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) -#ifdef CONFIG_GMEM -/* For h-NUMA topology */ -#define hnode_map node_states[N_HETEROGENEOUS] -#define num_hnodes() num_node_state(N_HETEROGENEOUS) -#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) -#endif - /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index 50550cd4b2ff..256085552194 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -22,6 +22,7 @@ static struct kmem_cache *gm_page_cachep; DEFINE_SPINLOCK(hnode_lock); +static nodemask_t hnode_map; struct hnode *hnodes[MAX_NUMNODES]; void __init hnuma_init(void) @@ -29,11 +30,17 @@ void __init hnuma_init(void) unsigned int node; spin_lock(&hnode_lock); + nodes_clear(hnode_map); for_each_node(node) node_set(node, hnode_map); spin_unlock(&hnode_lock); } +bool is_hnode(int nid) +{ + return node_isset(nid, hnode_map); +} + unsigned int alloc_hnode_id(void) { unsigned int node; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 32b3921949cc..ce0203f660e8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -195,9 +195,6 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, -#ifdef CONFIG_GMEM - [N_HETEROGENEOUS] = NODE_MASK_NONE, -#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM -- Gitee From 20e38cc9b12ff3c9e747e1b868f7d466449f197f Mon Sep 17 00:00:00 2001 From: xiuqing Date: Thu, 16 Oct 2025 04:02:41 +0800 Subject: [PATCH 25/34] gmem: fix memory alloc failure. euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Process may set_device twice, which will call gm_as_attach twice and set same device in two different context. This caused two different page table in one process for same device. Since we don't actually support multi-context in one address space now, address space can be simplely cleaned up for double set_device. 
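Concretely, the cleanup added to gm_as_attach() amounts to the sketch
below: before linking the new context, any previously attached context
is unlinked from the address space and freed, so a second set_device
call replaces the old context instead of leaving two page tables for the
same device in one process (given the single-context limitation noted
above):

	if (!list_empty(&as->gm_ctx_list)) {
		struct list_head *old_node;
		struct gm_context *old_ctx;

		/* Detach and free the context left by the previous set_device. */
		old_node = as->gm_ctx_list.prev;
		list_del_init(old_node);
		old_ctx = list_entry(old_node, struct gm_context, gm_as_link);
		kfree(old_ctx);
	}
	list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link);
	list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list);
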
Signed-off-by: xiuqing --- mm/gmem.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/gmem.c b/mm/gmem.c index 382a116ec8df..be813ea02947 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -477,6 +477,17 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode INIT_LIST_HEAD(&ctx->gm_dev_link); INIT_LIST_HEAD(&ctx->gm_as_link); + + if (!list_empty(&as->gm_ctx_list)) { + struct list_head *old_node; + struct gm_context *old_ctx; + + old_node = as->gm_ctx_list.prev; + list_del_init(old_node); + old_ctx = list_entry(old_node, struct gm_context, gm_as_link); + kfree(old_ctx); + } + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); -- Gitee From 9adcfe1b9cd33d3441d241a873730ccb0a324ad5 Mon Sep 17 00:00:00 2001 From: zhangjian Date: Thu, 16 Oct 2025 21:13:44 +0800 Subject: [PATCH 26/34] mm: fix bug when CONFIG_GMEM is false euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- When CONFIG_GMEM=n, mmap.c will not know enum gm_as_alloc. Compile error will raise in gmem.h. Fix it by wrap gmem.h in CONFIG_GMEM Signed-off-by: zhangjian --- include/linux/vm_object.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index d37cd0353f85..e7922ddd1846 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -3,9 +3,9 @@ #define _VM_OBJECT_H #include -#include #ifdef CONFIG_GMEM +#include /* vm_object KPI */ int __init vm_object_init(void); struct vm_object *vm_object_create(struct vm_area_struct *vma); -- Gitee From e6805f2e4fa0309c50749bc52ace9d7fbc7b007d Mon Sep 17 00:00:00 2001 From: zhangjian Date: Mon, 20 Oct 2025 08:09:13 +0800 Subject: [PATCH 27/34] gmem: remove some useless code, move some interface into gmem-internal.h euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- remove some useless code, move some interface into gmem-internal.h Signed-off-by: xiuqing --- include/linux/gmem.h | 180 ------------------------- include/linux/mm.h | 6 - include/linux/vm_object.h | 4 +- include/uapi/asm-generic/mman-common.h | 6 +- kernel/fork.c | 12 -- mm/gmem-internal.h | 165 +++++++++++++++++++++++ mm/gmem.c | 21 ++- mm/gmem_phys.c | 5 +- mm/gmem_stat.c | 2 + mm/huge_memory.c | 2 + mm/memory.c | 2 + mm/mm_init.c | 1 + mm/mmap.c | 48 ++++--- mm/util.c | 3 + mm/vm_object.c | 2 + 15 files changed, 219 insertions(+), 240 deletions(-) create mode 100644 mm/gmem-internal.h diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 0d603976f438..e28b9e461b8c 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -32,29 +32,6 @@ enum gm_ret { GM_RET_UNIMPLEMENTED, }; -/* - * Defines a contiguous range of virtual addresses inside a struct gm_as - * As an analogy, this is conceptually similar as virtual_address_struct - */ -struct gm_region { - unsigned long start_va; - unsigned long end_va; - struct rb_node node; - struct gm_as *as; /* The address space that it belongs to */ - - /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ - struct list_head mapping_set_link; - - void (*callback_op)(void *args); - void *cb_args; -}; - -/* This holds a list of regions that must not be concurrently manipulated. 
*/ -struct gm_mapping_set { - unsigned int region_cnt; - struct list_head gm_region_list; -}; - /** * enum gm_mmu_mode - defines the method to share a physical page table. * @@ -177,8 +154,6 @@ struct gm_mmu { #define GM_DEV_CAP_REPLAYABLE 0x00000001 #define GM_DEV_CAP_PEER 0x00000010 -#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) - struct gm_context { struct gm_as *as; struct gm_dev *dev; @@ -224,14 +199,6 @@ struct gm_dev { struct gm_mapping *gm_mapping; }; -#define GM_MAPPING_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ -#define GM_MAPPING_DEVICE 0x20 -#define GM_MAPPING_NOMAP 0x40 -#define GM_MAPPING_PINNED 0x80 -#define GM_MAPPING_WILLNEED 0x100 - -#define GM_MAPPING_TYPE_MASK (GM_MAPPING_CPU | GM_MAPPING_DEVICE | GM_MAPPING_NOMAP) - /* Records the status of a page-size physical page */ struct gm_mapping { unsigned int flag; @@ -245,34 +212,6 @@ struct gm_mapping { struct mutex lock; }; -static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) -{ - if (flags & GM_MAPPING_TYPE_MASK) - gm_mapping->flag &= ~GM_MAPPING_TYPE_MASK; - - gm_mapping->flag |= flags; -} - -static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int flags) -{ - gm_mapping->flag &= ~flags; -} - -static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) -{ - return !!(gm_mapping->flag & GM_MAPPING_CPU); -} - -static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) -{ - return !!(gm_mapping->flag & GM_MAPPING_DEVICE); -} - -static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) -{ - return !!(gm_mapping->flag & GM_MAPPING_NOMAP); -} - #define test_gm_mapping_mapped_on_node(i) { /* implement this */ } #define set_gm_mapping_mapped_on_node(i) { /* implement this */ } #define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } @@ -293,75 +232,10 @@ extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_ extern enum gm_ret gm_as_destroy(struct gm_as *as); extern enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, bool activate, struct gm_context **out_ctx); -extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned long size, - unsigned long align, unsigned long no_cross, unsigned long max_va, - struct gm_region **new_region); extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); extern int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size); -enum gmem_stats_item { - NR_PAGE_MIGRATING_H2D, - NR_PAGE_MIGRATING_D2H, - NR_GMEM_STAT_ITEMS -}; - -extern void gmem_stats_counter(enum gmem_stats_item item, int val); -extern void gmem_stats_counter_show(void); - -/* h-NUMA topology */ -struct hnode { - unsigned int id; - struct gm_dev *dev; - - struct task_struct *swapd_task; - - struct list_head freelist; - struct list_head activelist; - spinlock_t freelist_lock; - spinlock_t activelist_lock; - atomic_t nr_free_pages; - atomic_t nr_active_pages; - - unsigned long max_memsize; - - bool import_failed; -}; - -static inline void hnode_active_pages_inc(struct hnode *hnode) -{ - atomic_inc(&hnode->nr_active_pages); -} - -static inline void hnode_active_pages_dec(struct hnode *hnode) -{ - atomic_dec(&hnode->nr_active_pages); -} - -static inline void hnode_free_pages_inc(struct hnode *hnode) -{ - atomic_inc(&hnode->nr_free_pages); -} - -static inline void hnode_free_pages_dec(struct hnode *hnode) -{ - atomic_dec(&hnode->nr_free_pages); -} - -static inline int 
get_hnuma_id(struct gm_dev *gm_dev) -{ - return first_node(gm_dev->registered_hnodes); -} - -void __init hnuma_init(void); -bool is_hnode(int nid); -unsigned int alloc_hnode_id(void); -void free_hnode_id(unsigned int nid); -struct hnode *get_hnode(unsigned int hnid); -struct gm_dev *get_gm_dev(unsigned int nid); -void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); -void hnode_deinit(unsigned int hnid, struct gm_dev *dev); - struct gm_page { struct list_head gm_page_list; @@ -382,61 +256,7 @@ struct gm_page { atomic_t refcount; }; -#define GM_PAGE_EVICTING 0x1 -#define GM_PAGE_PINNED 0x2 - -static inline void gm_page_flags_set(struct gm_page *gm_page, int flags) -{ - gm_page->flag |= flags; -} - -static inline void gm_page_flags_clear(struct gm_page *gm_page, int flags) -{ - gm_page->flag &= ~flags; -} - -static inline bool gm_page_evicting(struct gm_page *gm_page) -{ - return !!(gm_page->flag & GM_PAGE_EVICTING); -} - -static inline bool gm_page_pinned(struct gm_page *gm_page) -{ - return !!(gm_page->flag & GM_PAGE_PINNED); -} - -#define NUM_IMPORT_PAGES 16 - -int __init gm_page_cachep_init(void); -void gm_page_cachep_destroy(void); struct gm_page *alloc_gm_page_struct(void); -void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page); -void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); -void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); -void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); -void mark_gm_page_active(struct gm_page *gm_page); -void mark_gm_page_pinned(struct gm_page *gm_page); -void mark_gm_page_unpinned(struct gm_page *gm_page); -void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); -void gm_page_remove_rmap(struct gm_page *gm_page); -int gm_add_pages(unsigned int hnid, struct list_head *pages); -void gm_free_page(struct gm_page *gm_page); -struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); - -static inline void get_gm_page(struct gm_page *gm_page) -{ - atomic_inc(&gm_page->refcount); -} - -static inline void put_gm_page(struct gm_page *gm_page) -{ - if (atomic_dec_and_test(&gm_page->refcount)) - gm_free_page(gm_page); -} - -int hnode_init_sysfs(unsigned int hnid); -int __init gm_init_sysfs(void); -void gm_deinit_sysfs(void); #define gmem_err(fmt, ...) 
\ ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 30222ae6daa5..72a1e67997a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3410,12 +3410,6 @@ unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); -#ifdef CONFIG_GMEM -extern unsigned long get_unmapped_area_aligned(struct file *file, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags, unsigned long align); -#endif - extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index e7922ddd1846..a3b7a6e9f978 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -2,10 +2,8 @@ #ifndef _VM_OBJECT_H #define _VM_OBJECT_H -#include - #ifdef CONFIG_GMEM -#include +#include /* vm_object KPI */ int __init vm_object_init(void); struct vm_object *vm_object_create(struct vm_area_struct *vma); diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 19e22492a85b..438cde70ee93 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,7 +33,7 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ -#define MAP_PEER_SHARED 0x1000000 +#define MAP_PEER_SHARED 0x1000000 /* GMEM scene, for heterogeneous memory */ /* * Flags for mlock @@ -82,10 +82,10 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ /* for hmadvise */ -#define MADV_GMEM_BASE 0x1000 +#define MADV_GMEM_BASE 0x1000 /* base of some madvise for heterogeneous memory only GMEM scene */ #define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ #define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ -#define MADV_PINNED_REMOVE (MADV_GMEM_BASE+2) /* unpin these pages */ +#define MADV_UNPINNED (MADV_GMEM_BASE+2) /* unpin these pages */ #define MADV_ETMEM_BASE 0x1100 #define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ diff --git a/kernel/fork.c b/kernel/fork.c index f6c45be64ab9..e01a6291b38e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,10 +100,6 @@ #include #include -#ifdef CONFIG_GMEM -#include -#endif - #ifdef CONFIG_QOS_SCHED_SMART_GRID #include #endif @@ -119,10 +115,6 @@ #include #endif -#ifdef CONFIG_GMEM -#include -#endif - #include #define CREATE_TRACE_POINTS @@ -535,9 +527,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); -#ifdef CONFIG_GMEM dup_peer_shared_vma(new); -#endif return new; } @@ -564,10 +554,8 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) void vm_area_free(struct vm_area_struct *vma) { -#ifdef CONFIG_GMEM if (vma_is_peer_shared(vma)) vm_object_drop_locked(vma); -#endif #ifdef CONFIG_PER_VMA_LOCK call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); #else diff --git a/mm/gmem-internal.h b/mm/gmem-internal.h new file mode 100644 index 000000000000..b93e2c8e1d5e --- /dev/null +++ b/mm/gmem-internal.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _GMEM_INTERNAL_H +#define _GMEM_INTERNAL_H + +#include + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + struct gm_dev *dev; + + struct task_struct 
*swapd_task; + + struct list_head freelist; + struct list_head activelist; + spinlock_t freelist_lock; + spinlock_t activelist_lock; + atomic_t nr_free_pages; + atomic_t nr_active_pages; + + unsigned long max_memsize; + + bool import_failed; +}; + +static inline void hnode_active_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_active_pages); +} + +static inline void hnode_active_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_active_pages); +} + +static inline void hnode_free_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_free_pages); +} + +static inline void hnode_free_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_free_pages); +} + +static inline int get_hnuma_id(struct gm_dev *gm_dev) +{ + return first_node(gm_dev->registered_hnodes); +} + +#define GM_MAPPING_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ +#define GM_MAPPING_DEVICE 0x20 +#define GM_MAPPING_NOMAP 0x40 +#define GM_MAPPING_WILLNEED 0x80 + +#define GM_MAPPING_TYPE_MASK (GM_MAPPING_CPU | GM_MAPPING_DEVICE | GM_MAPPING_NOMAP) + + +static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) +{ + if (flags & GM_MAPPING_TYPE_MASK) + gm_mapping->flag &= ~GM_MAPPING_TYPE_MASK; + + gm_mapping->flag |= flags; +} + +static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int flags) +{ + gm_mapping->flag &= ~flags; +} + +static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_MAPPING_CPU); +} + +static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_MAPPING_DEVICE); +} + +static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_MAPPING_NOMAP); +} + +enum gmem_stats_item { + NR_PAGE_MIGRATING_H2D, + NR_PAGE_MIGRATING_D2H, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_stats_counter(enum gmem_stats_item item, int val); +extern void gmem_stats_counter_show(void); + +void __init hnuma_init(void); +bool is_hnode(int nid); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +struct hnode *get_hnode(unsigned int hnid); +struct gm_dev *get_gm_dev(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); +void hnode_deinit(unsigned int hnid, struct gm_dev *dev); + +#define GM_PAGE_EVICTING 0x1 +#define GM_PAGE_PINNED 0x2 + +static inline void gm_page_flags_set(struct gm_page *gm_page, int flags) +{ + gm_page->flag |= flags; +} + +static inline void gm_page_flags_clear(struct gm_page *gm_page, int flags) +{ + gm_page->flag &= ~flags; +} + +static inline bool gm_page_evicting(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_EVICTING); +} + +static inline bool gm_page_pinned(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_PINNED); +} + +#define NUM_IMPORT_PAGES 16 + +int __init gm_page_cachep_init(void); +void gm_page_cachep_destroy(void); + +void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); + +void mark_gm_page_active(struct gm_page *gm_page); +void mark_gm_page_pinned(struct gm_page *gm_page); +void mark_gm_page_unpinned(struct gm_page *gm_page); + +void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); +void gm_page_remove_rmap(struct 
gm_page *gm_page); +int gm_add_pages(unsigned int hnid, struct list_head *pages); +void gm_free_page(struct gm_page *gm_page); +struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); + +static inline void get_gm_page(struct gm_page *gm_page) +{ + atomic_inc(&gm_page->refcount); +} + +static inline void put_gm_page(struct gm_page *gm_page) +{ + if (atomic_dec_and_test(&gm_page->refcount)) + gm_free_page(gm_page); +} + +int hnode_init_sysfs(unsigned int hnid); +int __init gm_init_sysfs(void); +void gm_deinit_sysfs(void); + +#endif /* _GMEM_INTERNAL_H */ diff --git a/mm/gmem.c b/mm/gmem.c index be813ea02947..9d760d5a22f5 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -42,13 +42,14 @@ #include #include +#include "gmem-internal.h" + DEFINE_STATIC_KEY_FALSE(gmem_status); EXPORT_SYMBOL_GPL(gmem_status); static struct kmem_cache *gm_as_cache; static struct kmem_cache *gm_dev_cache; static struct kmem_cache *gm_ctx_cache; -static struct kmem_cache *gm_region_cache; static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); static bool enable_gmem; @@ -136,13 +137,9 @@ static int __init gmem_init(void) if (!gm_ctx_cache) goto free_dev; - gm_region_cache = KMEM_CACHE(gm_region, 0); - if (!gm_region_cache) - goto free_ctx; - err = gm_page_cachep_init(); if (err) - goto free_region; + goto free_ctx; err = gm_init_sysfs(); if (err) @@ -154,14 +151,14 @@ static int __init gmem_init(void) err = gmem_stats_init(); if (err) - goto free_region; + goto free_ctx; prefetch_wq = alloc_workqueue("prefetch", __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); if (!prefetch_wq) { gmem_err("fail to alloc workqueue prefetch_wq\n"); err = -EFAULT; - goto free_region; + goto free_ctx; } #ifdef CONFIG_PROC_FS @@ -176,8 +173,6 @@ static int __init gmem_init(void) gm_deinit_sysfs(); free_gm_page: gm_page_cachep_destroy(); -free_region: - kmem_cache_destroy(gm_region_cache); free_ctx: kmem_cache_destroy(gm_ctx_cache); free_dev: @@ -298,7 +293,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc case MADV_WILLNEED: mark_gm_page_active(gm_mapping->gm_page); goto unlock; - case MADV_PINNED_REMOVE: + case MADV_UNPINNED: mark_gm_page_unpinned(gm_mapping->gm_page); goto unlock; default: @@ -354,7 +349,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc if (behavior == MADV_PINNED) mark_gm_page_pinned(gm_page); - else if (behavior == MADV_PINNED_REMOVE) + else if (behavior == MADV_UNPINNED) mark_gm_page_unpinned(gm_page); hnode_activelist_add(hnode, gm_page); @@ -773,7 +768,7 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) case MADV_PREFETCH: behavior = MADV_WILLNEED; fallthrough; - case MADV_PINNED_REMOVE: + case MADV_UNPINNED: fallthrough; case MADV_PINNED: return hmadvise_do_prefetch(dev, start, len_in, behavior); diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index 256085552194..3107ad181a12 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -16,6 +16,8 @@ #include #include +#include "gmem-internal.h" + #define NUM_SWAP_PAGES 16 #define MAX_SWAP_RETRY_TIMES 10 @@ -38,7 +40,8 @@ void __init hnuma_init(void) bool is_hnode(int nid) { - return node_isset(nid, hnode_map); + return (nid < MAX_NUMNODES) && !node_isset(nid, node_possible_map) && + node_isset(nid, hnode_map); } unsigned int alloc_hnode_id(void) diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c index 34dfd5a7ba40..6a6cbdce4f47 100644 --- a/mm/gmem_stat.c +++ b/mm/gmem_stat.c @@ -11,6 +11,8 @@ #include #include +#include "gmem-internal.h" + static 
struct kobject *gm_kobj; struct hnode_kobject { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 812241e0868a..a582cab33939 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -40,6 +40,8 @@ #include #ifdef CONFIG_GMEM #include +#include +#include "gmem-internal.h" #endif #include diff --git a/mm/memory.c b/mm/memory.c index 5fc9346749e5..3846bf813d89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -79,6 +79,8 @@ #include #ifdef CONFIG_GMEM #include +#include +#include "gmem-internal.h" #endif #include diff --git a/mm/mm_init.c b/mm/mm_init.c index 1a3d3b6e52c9..e7f0af3bf2b7 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -32,6 +32,7 @@ #include "shuffle.h" #ifdef CONFIG_GMEM #include +#include "gmem-internal.h" #endif #include diff --git a/mm/mmap.c b/mm/mmap.c index 771bb8ae2417..e82fb3ca7122 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -49,6 +49,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#include "gmem-internal.h" +#endif #include @@ -1293,6 +1297,28 @@ static unsigned long __mmap_region_ext(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); + +#ifdef CONFIG_GMEM +static unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +#endif /* * The caller must write-lock current->mm->mmap_lock. */ @@ -1997,28 +2023,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); -#ifdef CONFIG_GMEM -unsigned long -get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, unsigned long align) -{ - if (len > TASK_SIZE) - return -ENOMEM; - - addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); - if (IS_ERR_VALUE(addr)) - return addr; - - addr = round_up(addr, align); - if (addr > TASK_SIZE - len) - return -ENOMEM; - if (!IS_ALIGNED(addr, PMD_SIZE)) - return -EINVAL; - - return addr; -} -EXPORT_SYMBOL(get_unmapped_area_aligned); -#endif /** * find_vma_intersection() - Look up the first VMA which intersects the interval diff --git a/mm/util.c b/mm/util.c index 65392c97b1e9..60e956d716cd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -29,6 +29,9 @@ #include #ifdef CONFIG_GMEM #include +#include + +#include "gmem-internal.h" #endif #include "internal.h" diff --git a/mm/vm_object.c b/mm/vm_object.c index 427be06a6adb..d34863d00b8b 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -35,6 +35,8 @@ #include #include +#include "gmem-internal.h" + /* * Sine VM_OBJECT maintains the logical page table under each VMA, and each VMA * points to a VM_OBJECT. 
Ultimately VM_OBJECTs must be maintained as long as VMA
-- 
Gitee


From bb85a14f1008bc326ef3f179e25fa059a0295baf Mon Sep 17 00:00:00 2001
From: Ni Cunshu 
Date: Mon, 27 Oct 2025 19:30:54 +0800
Subject: [PATCH 28/34] gmem: remove unexported functions

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN

---------------------------------------------

Remove some unexported functions from the common mm .c files and move
the gmem-specific helpers into mm/gmem_util.c.

Signed-off-by: Ni Cunshu 
---
 include/linux/gmem.h |   4 -
 kernel/fork.c        |   1 +
 mm/Makefile          |   2 +-
 mm/gmem-internal.h   |  11 ++
 mm/gmem.c            | 214 ++++------------------
 mm/gmem_phys.c       |   1 -
 mm/gmem_stat.c       |   8 +-
 mm/gmem_util.c       | 423 +++++++++++++++++++++++++++++++++++++++++++
 mm/huge_memory.c     | 115 ------------
 mm/internal.h        |   1 -
 mm/memory.c          | 163 ++++++++---------
 mm/mempolicy.c       |   5 -
 mm/mm_init.c         |   4 -
 mm/mmap.c            | 181 ++----------------
 mm/mprotect.c        |   9 +-
 mm/util.c            | 109 -----------
 16 files changed, 570 insertions(+), 681 deletions(-)
 create mode 100644 mm/gmem_util.c

diff --git a/include/linux/gmem.h b/include/linux/gmem.h
index e28b9e461b8c..735a27ef7f38 100644
--- a/include/linux/gmem.h
+++ b/include/linux/gmem.h
@@ -9,10 +9,6 @@
 #ifndef _GMEM_H
 #define _GMEM_H
 
-#include 
-
-struct hnode;
-
 /*
  * enum gm_ret - The return value of GMEM KPI that can be used to tell
  * the core VM or peripheral driver whether the GMEM KPI was
diff --git a/kernel/fork.c b/kernel/fork.c
index e01a6291b38e..b81bd534bbb7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -99,6 +99,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 #include 
diff --git a/mm/Makefile b/mm/Makefile
index 70d7fb204b57..c990c7f8f6ae 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
 			   msync.o page_vma_mapped.o pagewalk.o \
 			   pgtable-generic.o rmap.o vmalloc.o
-mmu-$(CONFIG_GMEM)	+= gmem.o gmem_phys.o gmem_stat.o vm_object.o
+mmu-$(CONFIG_GMEM)	+= gmem.o gmem_phys.o gmem_stat.o gmem_util.o vm_object.o
 
 ifdef CONFIG_CROSS_MEMORY_ATTACH
 mmu-$(CONFIG_MMU)	+= process_vm_access.o
diff --git a/mm/gmem-internal.h b/mm/gmem-internal.h
index b93e2c8e1d5e..21f2c365e36c 100644
--- a/mm/gmem-internal.h
+++ b/mm/gmem-internal.h
@@ -3,6 +3,7 @@
 #define _GMEM_INTERNAL_H
 
 #include 
+#include 
 
 #define gm_dev_is_peer(dev)	(((dev)->capability & GM_DEV_CAP_PEER) != 0)
 
@@ -162,4 +163,14 @@ int hnode_init_sysfs(unsigned int hnid);
 int __init gm_init_sysfs(void);
 void gm_deinit_sysfs(void);
 
+vm_fault_t do_peer_shared_anonymous_page(struct vm_fault *vmf);
+unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len,
+		unsigned long flag);
+void gmem_reserve_vma(struct mm_struct *mm, unsigned long start,
+		size_t len, struct list_head *head);
+void gmem_release_vma(struct mm_struct *mm, struct list_head *head);
+unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t len);
+void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len);
+bool gm_mmap_check_flags(unsigned long flags);
+
 #endif /* _GMEM_INTERNAL_H */
diff --git a/mm/gmem.c b/mm/gmem.c
index 9d760d5a22f5..48466d21b370 100644
--- a/mm/gmem.c
+++ b/mm/gmem.c
@@ -125,6 +125,8 @@ static int __init gmem_init(void)
 	if (!enable_gmem)
 		return 0;
 
+	hnuma_init();
+
 	gm_as_cache = KMEM_CACHE(gm_as, 0);
 	if (!gm_as_cache)
 		goto out;
@@ -787,26 +789,6 @@ static bool hnid_match_dest(int hnid, struct gm_mapping *dest)
 	return (hnid < 0) ? 
gm_mapping_cpu(dest) : gm_mapping_device(dest); } -static void cpu_page_copy(struct page *dst_page, unsigned long dst_offset, - struct page *src_page, unsigned long src_offset, size_t size) -{ - unsigned long src, dst; - - src = (unsigned long)page_address(src_page) + src_offset; - dst = (unsigned long)page_address(dst_page) + dst_offset; - if (!src || !dst) { - gmem_err("%s: src (%lx) or dst (%lx) is invalid!", __func__, src, dst); - return; - } - memcpy((void *)dst, (void *)src, size); -} - -enum gmem_copy_dir { - COPY_GMEM_TO_NORM, - COPY_NORM_TO_GMEM, - COPY_GMEM_TO_GMEM, -}; - static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, unsigned long src, size_t size) { @@ -816,9 +798,6 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, struct gm_mapping *gm_mapping_dest, *gm_mapping_src; struct gm_dev *dev = NULL; struct gm_memcpy_t gmc = {0}; - enum gmem_copy_dir dir; - struct page *trans_hpage; - void *trans_addr; if (size == 0) return; @@ -832,39 +811,24 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, goto unlock_mm; } - if (vma_is_peer_shared(vma_src) && vma_is_peer_shared(vma_dest)) { - dir = COPY_GMEM_TO_GMEM; - gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); - gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); - } else if (vma_is_peer_shared(vma_src)) { - dir = COPY_GMEM_TO_NORM; - gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); - gm_mapping_dest = NULL; - } else if (vma_is_peer_shared(vma_dest)) { - dir = COPY_NORM_TO_GMEM; - gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); - gm_mapping_src = NULL; - } else { - gmem_err("%s: src %lx and dest %lx both not gmem addr!", __func__, src, dest); + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + + if (!gm_mapping_src) { + gmem_err("hmemcpy: gm_mapping_src is NULL"); goto unlock_mm; } - trans_hpage = alloc_pages(GFP_TRANSHUGE, HPAGE_PMD_ORDER); - if (!trans_hpage) { - gmem_err("%s: alloc trans_hpage failed!", __func__); + if (gm_mapping_nomap(gm_mapping_src)) { + gmem_err("hmemcpy: src address is not mapping to CPU or device"); goto unlock_mm; } - trans_addr = page_to_virt(trans_hpage); - - if (dir != COPY_NORM_TO_GMEM && (!gm_mapping_src || gm_mapping_nomap(gm_mapping_src))) - gmem_err("%s: gm_mapping_src is NULL or still not mapped! 
addr is %lx", - __func__, src); if (hnid != -1) { dev = get_gm_dev(hnid); if (!dev) { gmem_err("hmemcpy: hnode's dev is NULL"); - goto free_trans_page; + goto unlock_mm; } } @@ -872,141 +836,49 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest) || !hnid_match_dest(hnid, gm_mapping_dest)) { if (hnid == -1) { - if (gm_mapping_dest && gm_mapping_device(gm_mapping_dest) - && gm_page_pinned(gm_mapping_dest->gm_page)) { - gmem_err("%s: dest %lx is pinned on device, skip handle_mm_fault", - __func__, dest); - } else { - ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), - FAULT_FLAG_USER | FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, - NULL); - if (ret) { - gmem_err("%s: failed to execute host page fault, ret:%d", - __func__, ret); - goto free_trans_page; - } + ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | + FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); + if (ret) { + gmem_err("%s: failed to execute host page fault, ret:%d", + __func__, ret); + goto unlock_mm; } } else { ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); if (ret != GM_RET_SUCCESS) { gmem_err("%s: failed to excecute dev page fault.", __func__); - goto free_trans_page; + goto unlock_mm; } } } - if (!gm_mapping_dest && dir != COPY_GMEM_TO_NORM) + if (!gm_mapping_dest) gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) mutex_lock(&gm_mapping_dest->lock); - if (gm_mapping_src) - mutex_lock(&gm_mapping_src->lock); + mutex_lock(&gm_mapping_src->lock); // Use memcpy when there is no device address, otherwise use peer_memcpy - if (dir == COPY_GMEM_TO_NORM) { - if (!gm_mapping_src) { - gmem_err("%s: do COPY_GMEM_TO_NORM but gm_mapping_src is NULL!", __func__); - goto unlock_gm_mapping; - } + if (hnid == -1) { if (gm_mapping_cpu(gm_mapping_src)) { // host to host - cpu_page_copy(trans_hpage, - (unsigned long)trans_addr & (page_size - 1), - gm_mapping_src->page, src & (page_size - 1), - size); - goto copy_to_norm_dest; - } else if (gm_mapping_device(gm_mapping_src)) { // device to host + gmem_err("hmemcpy: host to host is unimplemented\n"); + goto unlock_gm_mmaping; + } else { // device to host dev = gm_mapping_src->dev; - gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + - ((unsigned long)trans_addr & (page_size - 1))); + gmc.dest = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1))); gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1)); gmc.kind = GM_MEMCPY_D2H; - } else { - gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_NORM"); - goto unlock_gm_mapping; } - } else if (dir == COPY_NORM_TO_GMEM) { - if (!gm_mapping_dest) { - gmem_err("%s: do COPY_NORM_TO_GMEM but gm_mapping_dest is NULL!", __func__); - goto unlock_gm_mapping; - } - if (copy_from_user(trans_addr, (void __user *)src, size) > 0) - gmem_err("copy normal src %lx to trans failed", src); - if (gm_mapping_cpu(gm_mapping_dest)) { // host to host - cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1), - trans_hpage, (unsigned long)trans_addr & (page_size - 1), size); - goto unlock_gm_mapping; - } else if (gm_mapping_device(gm_mapping_dest)) { - if (!dev) { - gmem_err("%s: do COPY_NORM_TO_GMEM but dev is NULL, hnid is %d", - __func__, hnid); - goto unlock_gm_mapping; - } + } else { + if (gm_mapping_cpu(gm_mapping_src)) { // host to device gmc.dest = 
gm_mapping_dest->gm_page->dev_dma_addr + - (dest & (page_size - 1)); - gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + - ((unsigned long)trans_addr & (page_size - 1))); + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mapping_src->page) + (src & (page_size - 1))); gmc.kind = GM_MEMCPY_H2D; } else { // device to device - gmem_err("gm_mapping_dest bad status, dir is COPY_NORM_TO_GMEM\n"); - goto unlock_gm_mapping; - } - } else if (dir == COPY_GMEM_TO_GMEM) { - if (gm_mapping_cpu(gm_mapping_src)) { - if (gm_mapping_cpu(gm_mapping_dest)) { - cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1), - gm_mapping_src->page, src & (page_size - 1), size); - goto unlock_gm_mapping; - } else if (gm_mapping_device(gm_mapping_dest)) { - dev = gm_mapping_dest->dev; - gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + - (dest & (page_size - 1)); - gmc.src = phys_to_dma(dev->dma_dev, - page_to_phys(gm_mapping_src->page) + - (src & (page_size - 1))); - gmc.kind = GM_MEMCPY_H2D; - } else { - gmem_err("gm_mapping_dest bad status, src is on host!"); - goto unlock_gm_mapping; - } - } else if (gm_mapping_device(gm_mapping_src)) { - if (gm_mapping_cpu(gm_mapping_dest)) { - dev = gm_mapping_src->dev; - gmc.dest = phys_to_dma(dev->dma_dev, - page_to_phys(gm_mapping_dest->page) + - (dest & (page_size - 1))); - gmc.src = gm_mapping_src->gm_page->dev_dma_addr + - (src & (page_size - 1)); - gmc.kind = GM_MEMCPY_D2H; - } else if (gm_mapping_device(gm_mapping_dest)) { - dev = gm_mapping_src->dev; - gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + - ((unsigned long)trans_addr & (page_size - 1))); - gmc.src = gm_mapping_src->gm_page->dev_dma_addr + - (src & (page_size - 1)); - gmc.kind = GM_MEMCPY_D2H; - gmc.mm = mm; - gmc.dev = dev; - gmc.size = size; - dev->mmu->peer_hmemcpy(&gmc); - - dev = gm_mapping_dest->dev; - gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + - (dest & (page_size - 1)); - gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + - ((unsigned long)trans_addr & (page_size - 1))); - gmc.kind = GM_MEMCPY_H2D; - gmc.mm = mm; - gmc.dev = dev; - gmc.size = size; - dev->mmu->peer_hmemcpy(&gmc); - - goto unlock_gm_mapping; - } else { - gmem_err("gm_mapping_dest bad status, src is on device!"); - goto unlock_gm_mapping; - } - } else { - gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_GMEM"); - goto unlock_gm_mapping; + gmem_err("hmemcpy: device to device is unimplemented\n"); + goto unlock_gm_mmaping; } } gmc.mm = mm; @@ -1014,19 +886,10 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, gmc.size = size; dev->mmu->peer_hmemcpy(&gmc); -copy_to_norm_dest: - if (dir == COPY_GMEM_TO_NORM) { - if (copy_to_user((void __user *)dest, trans_addr, size) > 0) - gmem_err("copy trans to normal dest %lx failed!", dest); - } - -unlock_gm_mapping: - if (gm_mapping_src) - mutex_unlock(&gm_mapping_src->lock); +unlock_gm_mmaping: + mutex_unlock(&gm_mapping_src->lock); if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) mutex_unlock(&gm_mapping_dest->lock); -free_trans_page: - __free_pages(trans_hpage, HPAGE_PMD_ORDER); unlock_mm: mmap_read_unlock(mm); } @@ -1094,22 +957,17 @@ int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) vma_src = find_vma(mm, src); if ((ULONG_MAX - size < src) || !vma_src || vma_src->vm_start > src || - vma_src->vm_end < (src + size)) { + !vma_is_peer_shared(vma_src) || vma_src->vm_end < (src + size)) { gmem_err("failed to find peer_shared vma by invalid src or 
size\n"); goto unlock; } if ((ULONG_MAX - size < dest) || !vma_dest || vma_dest->vm_start > dest || - vma_dest->vm_end < (dest + size)) { + !vma_is_peer_shared(vma_dest) || vma_dest->vm_end < (dest + size)) { gmem_err("failed to find peer_shared vma by invalid dest or size\n"); goto unlock; } - if (!vma_is_peer_shared(vma_src) && !vma_is_peer_shared(vma_dest)) { - mmap_read_unlock(mm); - return -EAGAIN; - } - if (!(vma_dest->vm_flags & VM_WRITE)) { gmem_err("dest is not writable.\n"); goto unlock; diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index 3107ad181a12..a3e3a4823daf 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -13,7 +13,6 @@ #include #include -#include #include #include "gmem-internal.h" diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c index 6a6cbdce4f47..8a7e7f7eadb8 100644 --- a/mm/gmem_stat.c +++ b/mm/gmem_stat.c @@ -7,9 +7,9 @@ * */ -#include +#include #include -#include +#include #include "gmem-internal.h" @@ -29,10 +29,8 @@ static struct hnode *get_hnode_kobj(struct kobject *kobj) hnode_kobj = container_of(kobj, struct hnode_kobject, kobj); hnode = get_hnode(hnode_kobj->hnid); - if (!hnode) { + if (!hnode) gmem_err("%s: failed to get hnode from kobject", __func__); - return NULL; - } return hnode; } diff --git a/mm/gmem_util.c b/mm/gmem_util.c new file mode 100644 index 000000000000..dc647df481e8 --- /dev/null +++ b/mm/gmem_util.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Ni Cunshu, Wang bin + * + */ + +#include +#include +#include +#include +#include + +#include "internal.h" +#include "gmem-internal.h" + +static struct folio *__vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, gfp_t gfp) +{ + const int order = HPAGE_PMD_ORDER; + struct folio *folio; + + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); + + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + return NULL; + } + folio_throttle_swaprate(folio, gfp); + + clear_huge_page(&folio->page, addr, HPAGE_PMD_NR); + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. 
+ */ + __folio_mark_uptodate(folio); + return folio; +} + +static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + pmd_t entry; + + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_pmd_at(vma->vm_mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, haddr, pmd); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + add_reliable_folio_counter(folio, vma->vm_mm, HPAGE_PMD_NR); + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +} + +static struct gm_mapping *vma_prepare_gm_mapping(struct vm_area_struct *vma, + unsigned long haddr) +{ + struct gm_mapping *gm_mapping; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + + return gm_mapping; +} + +vm_fault_t do_peer_shared_anonymous_page(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio = NULL; + bool is_new_folio = false; + pgtable_t pgtable = NULL; + struct gm_mapping *gm_mapping; + vm_fault_t ret = 0; + + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) + return VM_FAULT_FALLBACK; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; + khugepaged_enter_vma(vma, vma->vm_flags); + + gm_mapping = vma_prepare_gm_mapping(vma, haddr); + if (!gm_mapping) + return VM_FAULT_OOM; + + mutex_lock(&gm_mapping->lock); + + if (gm_mapping_device(gm_mapping) && gm_page_pinned(gm_mapping->gm_page)) { + pr_err("page is pinned! addr is %lx\n", gm_mapping->gm_page->va); + ret = VM_FAULT_SIGBUS; + goto release; + } + + if (gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + folio = __vma_alloc_anon_folio_pmd(vma, haddr, GFP_TRANSHUGE); + is_new_folio = true; + } + + if (unlikely(!folio)) { + ret = VM_FAULT_FALLBACK; + goto release; + } + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + + /** + * if page is mapped in device, release device mapping and + * deliver the page content to host. 
+ */ + if (gm_mapping_device(gm_mapping)) { + vmf->page = &folio->page; + ret = gm_host_fault_locked(vmf, PMD_ORDER); + if (ret) + goto release; + } + + /* map page in pgtable */ + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + + BUG_ON(!pmd_none(*vmf->pmd)); + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(vmf->ptl); + + /* finally setup cpu mapping */ + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + gm_mapping->page = &folio->page; + mutex_unlock(&gm_mapping->lock); + + return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + if (is_new_folio) + folio_put(folio); + mutex_unlock(&gm_mapping->lock); + return ret; +} + +unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, + unsigned long flag) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct gm_context *ctx, *tmp; + unsigned long prot = VM_NONE; + enum gm_ret ret; + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; + + vma = find_vma(mm, addr); + if (!vma) { + gmem_err("vma for addr %lx is NULL, should not happen\n", addr); + return -EINVAL; + } + + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) { + gmem_err("transparent hugepage is not enabled. check %s\n", + thp_enable_path); + return -EINVAL; + } + + prot |= vma->vm_flags; + + if (!mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, HPAGE_SIZE, &mm->gm_as); + if (ret) { + gmem_err("gm_as_create failed\n"); + return ret; + } + } + + ret = -ENODEV; + // TODO: consider the concurrency problem of device attaching/detaching from the gm_as. 
+ list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + struct gm_fault_t gmf = { + .mm = mm, + .dev = ctx->dev, + .va = addr, + .size = len, + .prot = prot, + }; + + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (ret != GM_RET_SUCCESS) { + gmem_err("device mmap failed\n"); + return ret; + } + } + + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + + return ret; +} + +struct gmem_vma_list { + unsigned long start; + size_t len; + struct list_head list; +}; + +void gmem_reserve_vma(struct mm_struct *mm, unsigned long start, + size_t len, struct list_head *head) +{ + struct vm_area_struct *vma; + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + vma = find_vma(mm, start); + if (!vma || vma->vm_start >= start + len) { + kfree(node); + return; + } + vm_flags_set(vma, ~VM_PEER_SHARED); + + node->start = start; + node->len = round_up(len, SZ_2M); + list_add_tail(&node->list, head); +} + +void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + unsigned long start = node->start; + size_t len = node->len; + + if (len) + vm_munmap(start, len); + + list_del(&node->list); + kfree(node); + } +} + +static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end, addr; + struct vm_object *obj = vma->vm_obj; + enum gm_ret ret; + struct gm_context *ctx, *tmp; + struct gm_mapping *gm_mapping; + struct hnode *hnode; + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (!obj) + return; + + if (!mm->gm_as) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = addr; + gmf.size = HPAGE_SIZE; + gmf.pfn = gm_mapping->gm_page->dev_pfn; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) + gmem_err("%s: call dev peer_unmap error %d", __func__, ret); + + /* + * Regardless of whether the gm_page is unmapped, we should release it. 
+ */ + hnode = get_hnode(gm_mapping->gm_page->hnid); + if (!hnode) { + mutex_unlock(&gm_mapping->lock); + continue; + } + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); + gm_mapping->gm_page = NULL; + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); + + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + if (!ctx->dev->mmu->peer_va_free) + continue; + + gmf.va = start; + gmf.size = end - start; + gmf.dev = ctx->dev; + + ret = ctx->dev->mmu->peer_va_free(&gmf); + if (ret != GM_RET_SUCCESS) + pr_debug("gmem: free_vma failed, ret %d\n", ret); + } +} + +static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + if (vma_is_peer_shared(vma)) + munmap_single_vma_in_peer_devices(mm, vma, start, end); + } +} + +unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma, *vma_end; + + vma = find_vma_intersection(mm, start, start + len); + vma_end = find_vma(mm, start + len); + if (!vma || !vma_is_peer_shared(vma)) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + } + + /* Prevents partial release of the peer_share page. */ + if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) + len = round_up(len, SZ_2M); + return len; +} + +void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end, ret; + + ret = gmem_unmap_align(mm, start, len); + + if (!ret || IS_ERR_VALUE(ret)) + return; + + end = start + ret; + munmap_in_peer_devices(mm, start, end); +} + +bool gm_mmap_check_flags(unsigned long flags) +{ + if (gmem_is_enabled()) { + if ((flags & MAP_SHARED) && (flags & MAP_PEER_SHARED)) { + gmem_err(" MAP_PEER_SHARED and MAP_SHARE cannot be used together.\n"); + return false; + } else if ((flags & MAP_HUGETLB) && (flags & MAP_PEER_SHARED)) { + gmem_err(" MAP_PEER_SHARED and MAP_HUGETLB cannot be used together.\n"); + return false; + } else if (!(flags & MAP_ANONYMOUS) && (flags & MAP_PEER_SHARED)) { + gmem_err(" MAP_PEER_SHARED cannot map file page.\n"); + return false; + } + } + return true; +} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a582cab33939..da46c38b7595 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,11 +38,6 @@ #include #include #include -#ifdef CONFIG_GMEM -#include -#include -#include "gmem-internal.h" -#endif #include @@ -1350,12 +1345,6 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, const int order = HPAGE_PMD_ORDER; struct folio *folio; -#ifdef CONFIG_GMEM - /* always try to compact hugepage for peer shared vma */ - if (vma_is_peer_shared(vma)) - gfp = GFP_TRANSHUGE; -#endif - folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); if (unlikely(!folio)) { @@ -1403,105 +1392,6 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } -#ifdef CONFIG_GMEM - -struct gm_mapping *vma_prepare_gm_mapping(struct vm_area_struct *vma, unsigned long haddr) -{ - struct gm_mapping *gm_mapping; - - xa_lock(vma->vm_obj->logical_page_table); - gm_mapping = vm_object_lookup(vma->vm_obj, haddr); - if 
(!gm_mapping) { - vm_object_mapping_create(vma->vm_obj, haddr); - gm_mapping = vm_object_lookup(vma->vm_obj, haddr); - } - xa_unlock(vma->vm_obj->logical_page_table); - - return gm_mapping; -} - -static vm_fault_t __do_peer_shared_anonymous_page(struct vm_fault *vmf) -{ - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct vm_area_struct *vma = vmf->vma; - struct folio *folio = NULL; - bool is_new_folio = false; - pgtable_t pgtable = NULL; - struct gm_mapping *gm_mapping; - vm_fault_t ret = 0; - - gm_mapping = vma_prepare_gm_mapping(vma, haddr); - if (!gm_mapping) - return VM_FAULT_OOM; - - mutex_lock(&gm_mapping->lock); - - if (gm_mapping_device(gm_mapping) && gm_page_pinned(gm_mapping->gm_page)) { - pr_err("page is pinned! addr is %lx\n", gm_mapping->gm_page->va); - ret = VM_FAULT_SIGBUS; - goto release; - } - - if (gm_mapping_cpu(gm_mapping)) - folio = page_folio(gm_mapping->page); - if (!folio) { - folio = vma_alloc_anon_folio_pmd(vma, haddr); - is_new_folio = true; - } - - if (unlikely(!folio)) { - ret = VM_FAULT_FALLBACK; - goto release; - } - - pgtable = pte_alloc_one(vma->vm_mm); - if (unlikely(!pgtable)) { - ret = VM_FAULT_OOM; - goto release; - } - - /** - * if page is mapped in device, release device mapping and - * deliver the page content to host. - */ - if (gm_mapping_device(gm_mapping)) { - vmf->page = &folio->page; - ret = gm_host_fault_locked(vmf, PMD_ORDER); - if (ret) - goto release; - } - - /* map page in pgtable */ - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - - BUG_ON(!pmd_none(*vmf->pmd)); - ret = check_stable_address_space(vma->vm_mm); - if (ret) - goto unlock_release; - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); - mm_inc_nr_ptes(vma->vm_mm); - spin_unlock(vmf->ptl); - - /* finally setup cpu mapping */ - gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); - gm_mapping->page = &folio->page; - mutex_unlock(&gm_mapping->lock); - - return 0; -unlock_release: - spin_unlock(vmf->ptl); -release: - if (pgtable) - pte_free(vma->vm_mm, pgtable); - if (is_new_folio) - folio_put(folio); - mutex_unlock(&gm_mapping->lock); - return ret; -} - -#endif - static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) { unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -1617,11 +1507,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; khugepaged_enter_vma(vma, vma->vm_flags); -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) - return __do_peer_shared_anonymous_page(vmf); -#endif - if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { diff --git a/mm/internal.h b/mm/internal.h index 3a127c3e2325..01a58570869a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1546,5 +1546,4 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, { } #endif /* CONFIG_SHRINKER_DEBUG */ - #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index 3846bf813d89..68218bd364cb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,11 +77,6 @@ #include #include #include -#ifdef CONFIG_GMEM -#include -#include -#include "gmem-internal.h" -#endif #include #include @@ -95,6 +90,11 @@ #include #include +#include +#ifdef CONFIG_GMEM +#include "gmem-internal.h" +#endif + #include "pgalloc-track.h" #include "internal.h" #include "swap.h" @@ -1717,13 +1717,16 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } #ifdef CONFIG_GMEM -static inline void zap_logic_pmd_range(struct vm_area_struct *vma, - 
unsigned long addr, - unsigned long end) +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, bool verify_pmd, pmd_t *pmd) { struct gm_mapping *gm_mapping = NULL; struct page *page = NULL; + if (!vma_is_peer_shared(vma)) + return; + if (!pmd_none_or_clear_bad(pmd) && !pmd_trans_huge(*pmd)) + return; if (!vma->vm_obj) return; @@ -1745,19 +1748,75 @@ static inline void zap_logic_pud_range(struct vm_area_struct *vma, unsigned long end) { unsigned long next; - + if (!vma_is_peer_shared(vma)) + return; do { next = pmd_addr_end(addr, end); - zap_logic_pmd_range(vma, addr, next); + zap_logic_pmd_range(vma, addr, next, false, NULL); } while (addr = next, addr != end); } + +static void unmap_single_peer_shared_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end, addr; + struct vm_object *obj = vma->vm_obj; + struct gm_mapping *gm_mapping; + struct hnode *hnode; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (!obj) + return; + + if (!mm->gm_as) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + /* + * Regardless of whether the gm_page is unmapped, we should release it. + */ + hnode = get_hnode(gm_mapping->gm_page->hnid); + if (!hnode) { + mutex_unlock(&gm_mapping->lock); + continue; + } + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + gm_mapping->gm_page = NULL; + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); +} #else -static inline void zap_logic_pmd_range(struct vm_area_struct *vma, - unsigned long addr, - unsigned long end) {} +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, bool verify_pmd, pmd_t *pmd) {} static inline void zap_logic_pud_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) {} +static inline void unmap_single_peer_shared_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) {} #endif static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, @@ -1790,7 +1849,6 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } -#ifdef CONFIG_GMEM /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is @@ -1798,11 +1856,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. 
*/ - if (vma_is_peer_shared(vma)) { - if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) - zap_logic_pmd_range(vma, addr, next); - } -#endif + zap_logic_pmd_range(vma, addr, next, true, pmd); + if (pmd_none(*pmd)) { addr = next; continue; @@ -1835,8 +1890,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, /* fall through */ } if (pud_none_or_clear_bad(pud)) { - if (vma_is_peer_shared(vma)) - zap_logic_pud_range(vma, addr, next); + zap_logic_pud_range(vma, addr, next); continue; } next = zap_pmd_range(tlb, vma, pud, addr, next, details); @@ -1859,8 +1913,7 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { - if (vma_is_peer_shared(vma)) - zap_logic_pud_range(vma, addr, next); + zap_logic_pud_range(vma, addr, next); continue; } next = zap_pud_range(tlb, vma, p4d, addr, next, details); @@ -1883,10 +1936,7 @@ void unmap_page_range(struct mmu_gather *tlb, do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) - zap_logic_pud_range(vma, addr, next); -#endif + zap_logic_pud_range(vma, addr, next); continue; } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); @@ -1939,63 +1989,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, } } -#ifdef CONFIG_GMEM - -static void unmap_single_peer_shared_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr) -{ - unsigned long start, end, addr; - struct vm_object *obj = vma->vm_obj; - struct gm_mapping *gm_mapping; - struct hnode *hnode; - - start = max(vma->vm_start, start_addr); - if (start >= vma->vm_end) - return; - addr = start; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) - return; - - if (!obj) - return; - - if (!mm->gm_as) - return; - - do { - xa_lock(obj->logical_page_table); - gm_mapping = vm_object_lookup(obj, addr); - if (!gm_mapping) { - xa_unlock(obj->logical_page_table); - continue; - } - xa_unlock(obj->logical_page_table); - - mutex_lock(&gm_mapping->lock); - if (!gm_mapping_device(gm_mapping)) { - mutex_unlock(&gm_mapping->lock); - continue; - } - - /* - * Regardless of whether the gm_page is unmapped, we should release it. 
- */ - hnode = get_hnode(gm_mapping->gm_page->hnid); - if (!hnode) { - mutex_unlock(&gm_mapping->lock); - continue; - } - gm_page_remove_rmap(gm_mapping->gm_page); - hnode_activelist_del(hnode, gm_mapping->gm_page); - hnode_active_pages_dec(hnode); - put_gm_page(gm_mapping->gm_page); - gm_mapping->gm_page = NULL; - mutex_unlock(&gm_mapping->lock); - } while (addr += HPAGE_SIZE, addr != end); -} - -#endif /** * unmap_vmas - unmap a range of memory covered by a list of vma's @@ -5749,6 +5742,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + if (vma_is_peer_shared(vma)) + return do_peer_shared_anonymous_page(vmf); if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 879cf8b45f2a..4d88305c2c10 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1903,13 +1903,8 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { -#ifdef CONFIG_GMEM if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) return false; -#else - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - return false; -#endif /* * DAX device mappings require predictable access latency, so avoid diff --git a/mm/mm_init.c b/mm/mm_init.c index e7f0af3bf2b7..f2d805dde378 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -31,7 +31,6 @@ #include "slab.h" #include "shuffle.h" #ifdef CONFIG_GMEM -#include #include "gmem-internal.h" #endif @@ -2801,9 +2800,6 @@ static void __init mem_init_print_info(void) */ void __init mm_core_init(void) { -#ifdef CONFIG_GMEM - hnuma_init(); -#endif /* Initializations relying on SMP setup */ build_all_zonelists(NULL); page_alloc_init_cpuhp(); diff --git a/mm/mmap.c b/mm/mmap.c index e82fb3ca7122..1c94bf803146 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -50,7 +50,6 @@ #include #ifdef CONFIG_GMEM -#include #include "gmem-internal.h" #endif @@ -652,9 +651,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, */ if (src->anon_vma && !dst->anon_vma) { int ret; -#ifdef CONFIG_GMEM dup_vm_object(dst, src, true); -#endif vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); @@ -1301,8 +1298,10 @@ static unsigned long __mmap_region_ext(struct mm_struct *mm, #ifdef CONFIG_GMEM static unsigned long get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, unsigned long align) + unsigned long pgoff, unsigned long flags) { + unsigned long align = HPAGE_SIZE; + len = round_up(len, align); if (len > TASK_SIZE) return -ENOMEM; @@ -1368,17 +1367,7 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. 
*/ -#ifdef CONFIG_GMEM - if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { - len = round_up(len, PMD_SIZE); - addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, - PMD_SIZE); - } else { - addr = get_unmapped_area(file, addr, len, pgoff, flags); - } -#else addr = get_unmapped_area(file, addr, len, pgoff, flags); -#endif if (IS_ERR_VALUE(addr)) return addr; @@ -1517,7 +1506,6 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon *populate = len; return addr; } -EXPORT_SYMBOL(__do_mmap_mm); unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1537,25 +1525,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long retval; #ifdef CONFIG_GMEM - if (gmem_is_enabled() && (flags & MAP_SHARED) && (flags & MAP_PEER_SHARED)) { - retval = -EINVAL; - gmem_err(" MAP_PEER_SHARED and MAP_SHARE cannot be used together.\n"); - goto out_fput; - } - if (gmem_is_enabled() && (flags & MAP_HUGETLB) && (flags & MAP_PEER_SHARED)) { + if (!gm_mmap_check_flags(flags)) { retval = -EINVAL; - gmem_err(" MAP_PEER_SHARED and MAP_HUGETLB cannot be used together.\n"); goto out_fput; } #endif if (!(flags & MAP_ANONYMOUS)) { -#ifdef CONFIG_GMEM - if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { - retval = -EINVAL; - gmem_err(" MAP_PEER_SHARED cannot map file page.\n"); - goto out_fput; - } -#endif audit_mmap_fd(fd, flags); file = fget(fd); if (!file) @@ -1992,6 +1967,10 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; +#ifdef CONFIG_GMEM + } else if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + get_area = get_unmapped_area_aligned; +#endif } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, @@ -2023,7 +2002,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); - /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
@@ -2562,9 +2540,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; -#ifdef COFNIG_GMEM + dup_vm_object(new, vma, false); -#endif + if (new->vm_file) get_file(new->vm_file); @@ -2619,135 +2597,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } -#ifdef CONFIG_GMEM -static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr) -{ - unsigned long start, end, addr; - struct vm_object *obj = vma->vm_obj; - enum gm_ret ret; - struct gm_context *ctx, *tmp; - struct gm_mapping *gm_mapping; - struct hnode *hnode; - struct gm_fault_t gmf = { - .mm = mm, - .copy = false, - }; - - start = max(vma->vm_start, start_addr); - if (start >= vma->vm_end) - return; - addr = start; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) - return; - - if (!obj) - return; - - if (!mm->gm_as) - return; - - do { - xa_lock(obj->logical_page_table); - gm_mapping = vm_object_lookup(obj, addr); - if (!gm_mapping) { - xa_unlock(obj->logical_page_table); - continue; - } - xa_unlock(obj->logical_page_table); - - mutex_lock(&gm_mapping->lock); - if (!gm_mapping_device(gm_mapping)) { - mutex_unlock(&gm_mapping->lock); - continue; - } - - gmf.va = addr; - gmf.size = HPAGE_SIZE; - gmf.pfn = gm_mapping->gm_page->dev_pfn; - gmf.dev = gm_mapping->dev; - ret = gm_mapping->dev->mmu->peer_unmap(&gmf); - if (ret != GM_RET_SUCCESS) - gmem_err("%s: call dev peer_unmap error %d", __func__, ret); - - /* - * Regardless of whether the gm_page is unmapped, we should release it. - */ - hnode = get_hnode(gm_mapping->gm_page->hnid); - if (!hnode) { - mutex_unlock(&gm_mapping->lock); - continue; - } - gm_page_remove_rmap(gm_mapping->gm_page); - hnode_activelist_del(hnode, gm_mapping->gm_page); - hnode_active_pages_dec(hnode); - put_gm_page(gm_mapping->gm_page); - gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); - gm_mapping->gm_page = NULL; - mutex_unlock(&gm_mapping->lock); - } while (addr += HPAGE_SIZE, addr != end); - - list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { - if (!gm_dev_is_peer(ctx->dev)) - continue; - if (!ctx->dev->mmu->peer_va_free) - continue; - - gmf.va = start; - gmf.size = end - start; - gmf.dev = ctx->dev; - - ret = ctx->dev->mmu->peer_va_free(&gmf); - if (ret != GM_RET_SUCCESS) - pr_debug("gmem: free_vma failed, ret %d\n", ret); - } -} - -static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end) -{ - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, start); - for_each_vma_range(vmi, vma, end) { - if (vma_is_peer_shared(vma)) - munmap_single_vma_in_peer_devices(mm, vma, start, end); - } -} - -static unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t len) -{ - struct vm_area_struct *vma, *vma_end; - - vma = find_vma_intersection(mm, start, start + len); - vma_end = find_vma(mm, start + len); - if (!vma || !vma_is_peer_shared(vma)) - return 0; - if (vma_is_peer_shared(vma)) { - if (!IS_ALIGNED(start, PMD_SIZE)) - return -EINVAL; - } - - /* Prevents partial release of the peer_share page. 
*/ - if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) - len = round_up(len, SZ_2M); - return len; -} - -static void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len) -{ - unsigned long end, ret; - - ret = gmem_unmap_align(mm, start, len); - - if (!ret || IS_ERR_VALUE(ret)) - return; - - end = start + ret; - munmap_in_peer_devices(mm, start, end); -} -#endif - /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2878,7 +2727,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, prev = vma_iter_prev_range(vmi); next = vma_next(vmi); - if (next) vma_iter_prev_range(vmi); @@ -3015,24 +2863,21 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) { + (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; - } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) return -ENOMEM; - } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) { + if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; - } vm_flags |= VM_ACCOUNT; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 4eac8ad8a718..eed9fe390c70 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -693,11 +693,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; -#ifdef CONFIG_GMEM - struct vm_area_struct *vma, *prev, *vma_end; -#else struct vm_area_struct *vma, *prev; -#endif int error; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && @@ -740,8 +736,9 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; -#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + struct vm_area_struct *vma_end; start = ALIGN_DOWN(start, HPAGE_SIZE); vma_end = find_vma(current->mm, end); if (vma_end && vma_end->vm_start < end && vma_is_peer_shared(vma_end)) @@ -752,7 +749,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } len = end - start; } -#endif + if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff --git a/mm/util.c b/mm/util.c index 60e956d716cd..103d853f837c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -29,7 +29,6 @@ #include #ifdef CONFIG_GMEM #include -#include #include "gmem-internal.h" #endif @@ -546,114 +545,6 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) } EXPORT_SYMBOL_GPL(account_locked_vm); -#ifdef CONFIG_GMEM -static unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, - unsigned long flag) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct gm_context *ctx, *tmp; - unsigned long prot = VM_NONE; - enum gm_ret ret; - char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; - - vma = find_vma(mm, addr); - if (!vma) { - gmem_err("vma for addr %lx is NULL, should not happen\n", addr); - return -EINVAL; - } - - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) { - gmem_err("transparent hugepage is not enabled. 
check %s\n", - thp_enable_path); - return -EINVAL; - } - - prot |= vma->vm_flags; - - if (!mm->gm_as) { - ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, HPAGE_SIZE, &mm->gm_as); - if (ret) { - gmem_err("gm_as_create failed\n"); - return ret; - } - } - - ret = -ENODEV; - // TODO: consider the concurrency problem of device attaching/detaching from the gm_as. - list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { - struct gm_fault_t gmf = { - .mm = mm, - .dev = ctx->dev, - .va = addr, - .size = len, - .prot = prot, - }; - - if (!gm_dev_is_peer(ctx->dev)) - continue; - - if (!ctx->dev->mmu->peer_va_alloc_fixed) { - pr_debug("gmem: mmu ops has no alloc_vma\n"); - continue; - } - - ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); - if (ret != GM_RET_SUCCESS) { - gmem_err("device mmap failed\n"); - return ret; - } - } - - if (!vma->vm_obj) - vma->vm_obj = vm_object_create(vma); - if (!vma->vm_obj) - return -ENOMEM; - - return ret; -} - -struct gmem_vma_list { - unsigned long start; - size_t len; - struct list_head list; -}; - -static void gmem_reserve_vma(struct mm_struct *mm, unsigned long start, - size_t len, struct list_head *head) -{ - struct vm_area_struct *vma; - struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); - - vma = find_vma(mm, start); - if (!vma || vma->vm_start >= start + len) { - kfree(node); - return; - } - vm_flags_set(vma, ~VM_PEER_SHARED); - - node->start = start; - node->len = round_up(len, SZ_2M); - list_add_tail(&node->list, head); -} - -static void gmem_release_vma(struct mm_struct *mm, struct list_head *head) -{ - struct gmem_vma_list *node, *next; - - list_for_each_entry_safe(node, next, head, list) { - unsigned long start = node->start; - size_t len = node->len; - - if (len) - vm_munmap(start, len); - - list_del(&node->list); - kfree(node); - } -} -#endif - unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) -- Gitee From dbf1d1032ae31d56ddc61d32453347136b168cb3 Mon Sep 17 00:00:00 2001 From: zhangjian Date: Tue, 21 Oct 2025 09:12:08 +0800 Subject: [PATCH 29/34] gmem: refactor some code euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- refactor some code Signed-off-by: zhangjian --- arch/arm64/include/asm/rsi_cmds.h | 1 - include/linux/gmem.h | 5 +++- include/linux/mm.h | 12 +++++---- mm/gmem-internal.h | 9 ++++--- mm/gmem.c | 2 +- mm/gmem_util.c | 44 +++++++++++++++++++++++++++---- mm/internal.h | 1 + mm/memory.c | 9 +++---- mm/mmap.c | 6 +++-- mm/util.c | 28 +++----------------- 10 files changed, 69 insertions(+), 48 deletions(-) diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index ccdeffcefbff..e6a211001bd3 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -9,7 +9,6 @@ #include #include -#include "string.h" #define RSI_GRANULE_SHIFT 12 #define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 735a27ef7f38..745c07c81526 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -150,6 +150,8 @@ struct gm_mmu { #define GM_DEV_CAP_REPLAYABLE 0x00000001 #define GM_DEV_CAP_PEER 0x00000010 +#define NUM_IMPORT_PAGES 16 /* number of physical pages imported each time */ + struct gm_context { struct gm_as *as; struct gm_dev *dev; @@ -251,7 +253,8 @@ struct gm_page { unsigned int flag; atomic_t 
refcount; }; - +/* For driver to add device pages */ +int gm_add_pages(unsigned int hnid, struct list_head *pages); struct gm_page *alloc_gm_page_struct(void); #define gmem_err(fmt, ...) \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 72a1e67997a8..aadfa00f5aed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -322,6 +322,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#ifdef CONFIG_GMEM +# define VM_PEER_SHARED_BIT 56 /* movable memory between host and device */ +#define VM_PEER_SHARED BIT(VM_PEER_SHARED_BIT) +#else +#define VM_PEER_SHARED VM_NONE +#endif + #ifdef CONFIG_USERSWAP # define VM_USWAP_BIT 61 #define VM_USWAP BIT(VM_USWAP_BIT) @@ -343,11 +350,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) -#ifdef CONFIG_GMEM -#define VM_PEER_SHARED BIT(56) -#else -#define VM_PEER_SHARED VM_NONE -#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS diff --git a/mm/gmem-internal.h b/mm/gmem-internal.h index 21f2c365e36c..3ba381ff72b8 100644 --- a/mm/gmem-internal.h +++ b/mm/gmem-internal.h @@ -128,8 +128,6 @@ static inline bool gm_page_pinned(struct gm_page *gm_page) return !!(gm_page->flag & GM_PAGE_PINNED); } -#define NUM_IMPORT_PAGES 16 - int __init gm_page_cachep_init(void); void gm_page_cachep_destroy(void); @@ -144,7 +142,6 @@ void mark_gm_page_unpinned(struct gm_page *gm_page); void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); void gm_page_remove_rmap(struct gm_page *gm_page); -int gm_add_pages(unsigned int hnid, struct list_head *pages); void gm_free_page(struct gm_page *gm_page); struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); @@ -165,7 +162,7 @@ void gm_deinit_sysfs(void); vm_fault_t do_peer_shared_anonymous_page(struct vm_fault *vmf); unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, - unsigned long flag); + unsigned long flag); void gmem_reserve_vma(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *head); void gmem_release_vma(struct mm_struct *mm, struct list_head *head); @@ -173,4 +170,8 @@ unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len); bool gm_mmap_check_flags(unsigned long flags); +unsigned long gm_vm_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff); + #endif /* _GMEM_INTERNAL_H */ diff --git a/mm/gmem.c b/mm/gmem.c index 48466d21b370..3d004b891522 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -52,7 +52,7 @@ static struct kmem_cache *gm_dev_cache; static struct kmem_cache *gm_ctx_cache; static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); -static bool enable_gmem; +static bool enable_gmem __ro_after_init; static inline unsigned long pe_mask(unsigned int order) { diff --git a/mm/gmem_util.c b/mm/gmem_util.c index dc647df481e8..8486690d3074 100644 --- a/mm/gmem_util.c +++ b/mm/gmem_util.c @@ -7,11 +7,12 @@ * */ -#include -#include +#include #include #include +#include #include +#include #include "internal.h" #include "gmem-internal.h" @@ -180,7 +181,6 @@ unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, struct gm_context *ctx, *tmp; unsigned long prot = VM_NONE; enum 
gm_ret ret; - char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; vma = find_vma(mm, addr); if (!vma) { @@ -189,8 +189,7 @@ unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, } if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) { - gmem_err("transparent hugepage is not enabled. check %s\n", - thp_enable_path); + gmem_err("transparent hugepage is not enabled\n"); return -EINVAL; } @@ -421,3 +420,38 @@ bool gm_mmap_check_flags(unsigned long flags) } return true; } + +unsigned long gm_vm_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff) +{ + struct mm_struct *mm = current->mm; + LIST_HEAD(reserve_list); + unsigned int retry_times = 0; + unsigned long ret; + enum gm_ret gm_ret; + +retry: + ret = vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); + + if (!IS_ERR_VALUE(ret)) { + + gm_ret = alloc_va_in_peer_devices(ret, len, flag); + /** + * if alloc_va_in_peer_devices failed + * add vma to reserve_list and release after find a proper vma + */ + if (gm_ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + gmem_reserve_vma(mm, ret, len, &reserve_list); + goto retry; + } else if (gm_ret != GM_RET_SUCCESS) { + gmem_err("alloc vma ret %lu\n", ret); + gmem_reserve_vma(mm, ret, len, &reserve_list); + ret = -ENOMEM; + } + gmem_release_vma(mm, &reserve_list); + } + + return ret; +} \ No newline at end of file diff --git a/mm/internal.h b/mm/internal.h index 01a58570869a..3a127c3e2325 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1546,4 +1546,5 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, { } #endif /* CONFIG_SHRINKER_DEBUG */ + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index 68218bd364cb..f3f5dd73d392 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1725,7 +1725,7 @@ static inline void zap_logic_pmd_range(struct vm_area_struct *vma, unsigned long if (!vma_is_peer_shared(vma)) return; - if (!pmd_none_or_clear_bad(pmd) && !pmd_trans_huge(*pmd)) + if (verify_pmd && !pmd_none_or_clear_bad(pmd) && !pmd_trans_huge(*pmd)) return; if (!vma->vm_obj) return; @@ -5742,8 +5742,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_GMEM if (vma_is_peer_shared(vma)) return do_peer_shared_anonymous_page(vmf); +#endif if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) @@ -5991,13 +5993,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); #ifdef CONFIG_GMEM -#define THP_ENABLE_PATH "/sys/kernel/mm/transparent_hugepage/enabled" - if (vma_is_peer_shared(vma) && pmd_none(*vmf.pmd) && (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))) { /* if transparent hugepage is not enabled, return pagefault failed */ - gmem_err("transparent hugepage is not enabled. 
check %s\n", - THP_ENABLE_PATH); + gmem_err("transparent hugepage is not enabled\n"); return VM_FAULT_SIGBUS; } #endif diff --git a/mm/mmap.c b/mm/mmap.c index 1c94bf803146..58a9b9375899 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1492,10 +1492,12 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon } #ifdef CONFIG_GMEM if (flags & MAP_PEER_SHARED) { - if (gmem_is_enabled()) + if (gmem_is_enabled()) { vm_flags |= VM_PEER_SHARED; - else + len = round_up(len, HPAGE_SIZE); + } else { return -EINVAL; + } } #endif diff --git a/mm/util.c b/mm/util.c index 103d853f837c..6c7a7d8b1eed 100644 --- a/mm/util.c +++ b/mm/util.c @@ -553,11 +553,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); + #ifdef CONFIG_GMEM - unsigned int retry_times = 0; - LIST_HEAD(reserve_list); -retry: + if (gmem_is_enabled() && flag & MAP_PEER_SHARED) + return gm_vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); #endif + ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) @@ -568,27 +569,6 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); -#ifdef CONFIG_GMEM - if (gmem_is_enabled() && !IS_ERR_VALUE(ret) && flag & MAP_PEER_SHARED) { - enum gm_ret gm_ret = 0; - - gm_ret = alloc_va_in_peer_devices(ret, len, flag); - /* - * if alloc_va_in_peer_devices failed - * add vma to reserve_list and release after find a proper vma - */ - if (gm_ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { - retry_times++; - gmem_reserve_vma(mm, ret, len, &reserve_list); - goto retry; - } else if (gm_ret != GM_RET_SUCCESS) { - gmem_err("alloc vma ret %lu\n", ret); - gmem_reserve_vma(mm, ret, len, &reserve_list); - ret = -ENOMEM; - } - gmem_release_vma(mm, &reserve_list); - } -#endif } return ret; } -- Gitee From 79dacf26535e937ac3cfd297fb505bb06918ecd0 Mon Sep 17 00:00:00 2001 From: wangbin Date: Wed, 29 Oct 2025 20:11:15 +0800 Subject: [PATCH 30/34] reorganize some code --- include/linux/gmem.h | 104 ++++++++++++++------------------------ include/linux/mm_types.h | 68 ------------------------- include/linux/vm_object.h | 36 +++++++++++++ mm/gmem-internal.h | 18 +++++++ mm/gmem.c | 8 +-- mm/gmem_util.c | 1 + mm/huge_memory.c | 1 - mm/memory.c | 8 +-- mm/mm_init.c | 3 -- mm/mmap.c | 8 --- mm/util.c | 5 -- 11 files changed, 95 insertions(+), 165 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 745c07c81526..a6f346fbb45e 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -9,6 +9,7 @@ #ifndef _GMEM_H #define _GMEM_H +#ifdef CONFIG_GMEM /* * enum gm_ret - The return value of GMEM KPI that can be used to tell * the core VM or peripheral driver whether the GMEM KPI was @@ -28,24 +29,6 @@ enum gm_ret { GM_RET_UNIMPLEMENTED, }; -/** - * enum gm_mmu_mode - defines the method to share a physical page table. - * - * @GM_MMU_MODE_SHARE: Literally share a physical page table with another - * attached device's MMU. Nothing is guaranteed about the allocated address. - * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds - * exclusive mapping entries, so that device memory accesses can trigger fault-driven - * migration for automatic data locality optimizations. 
- * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical - * mapping entries whenever a physical mapping is installed inside the address space, so - * that it may minimize the page faults to be triggered by this device. - */ -enum gm_mmu_mode { - GM_MMU_MODE_SHARE, - GM_MMU_MODE_COHERENT_EXCLUSIVE, - GM_MMU_MODE_REPLICATE, -}; - /* * This is the parameter list of peer_map/unmap mmu operations. * if device should copy data to/from host, set copy and dma_addr @@ -63,11 +46,8 @@ struct gm_fault_t { }; enum gm_memcpy_kind { - GM_MEMCPY_INIT, - GM_MEMCPY_H2H, GM_MEMCPY_H2D, GM_MEMCPY_D2H, - GM_MEMCPY_D2D, GM_MEMCPY_KIND_INVALID, }; @@ -91,18 +71,6 @@ struct gm_memcpy_t { * the hardware MMU. */ struct gm_mmu { - /* - * Each bit indicates a supported page size for page-based TLB. - * Currently we do not consider range TLBs. - */ - unsigned long pgsize_bitmap; - - /* - * cookie identifies the type of the MMU. If two gm_mmu shares the same cookie, - * then it means their page table formats are compatible. - * In that case, they can share the same void *pmap as the input arg. - */ - unsigned long cookie; /* Synchronize VMA in a peer OS to interact with the host OS */ enum gm_ret (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); @@ -120,23 +88,6 @@ struct gm_mmu { enum gm_ret (*import_phys_mem)(struct mm_struct *mm, int hnid, unsigned long page_cnt); - /* Create or destroy a device's physical page table. */ - enum gm_ret (*pmap_create)(struct gm_dev *dev, void **pmap); - enum gm_ret (*pmap_destroy)(void *pmap); - - /* Create or destroy a physical mapping of a created physical page table */ - enum gm_ret (*pmap_enter)(void *pmap, unsigned long va, unsigned long size, - unsigned long pa, unsigned long prot); - enum gm_ret (*pmap_release)(void *pmap, unsigned long va, unsigned long size); - - /* Change the protection of a virtual page */ - enum gm_ret (*pmap_protect)(void *pmap, unsigned long va, unsigned long size, - unsigned long new_prot); - - /* Invalidation functions of the MMU TLB */ - enum gm_ret (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); - enum gm_ret (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); - // copy one area of memory from device to host or from host to device enum gm_ret (*peer_hmemcpy)(struct gm_memcpy_t *gmc); }; @@ -155,7 +106,6 @@ struct gm_mmu { struct gm_context { struct gm_as *as; struct gm_dev *dev; - void *pmap; /* * consider a better container to maintain multiple ctx inside a device or multiple ctx * inside a va space. @@ -166,7 +116,6 @@ struct gm_context { /* A va space may have multiple gm_context */ struct list_head gm_as_link; }; -#define get_gm_context(head) (list_entry((head)->prev, struct gm_context, ctx_link)) struct gm_dev { int id; @@ -179,7 +128,6 @@ struct gm_dev { struct gm_mmu *mmu; void *dev_data; /* - * TODO: Use a better container of struct gm_context to support time-sliced context switch. * A collection of device contexts. If the device does not support time-sliced context * switch, then the size of the collection should never be greater than one. * We need to think about what operators should the container be optimized for. @@ -210,29 +158,51 @@ struct gm_mapping { struct mutex lock; }; -#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } -#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } -#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. 
+ * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. */ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of struct gm_as */ + struct rb_root rbroot; /*root of gm_region_t */ + enum gm_as_alloc policy; + unsigned long start_va; + unsigned long end_va; + /* defines the VA unit size if an object cache is applied */ + unsigned long cache_quantum; + /* tracks device contexts attached to this va space, using gm_as_link */ + struct list_head gm_ctx_list; +}; /* GMEM Device KPI */ -extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, +enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, struct gm_dev **new_dev); -extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); -extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); -extern int gm_dev_register_hnode(struct gm_dev *dev); +int gm_dev_register_hnode(struct gm_dev *dev); enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior); -vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); /* GMEM address space KPI */ -extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, +enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as); -extern enum gm_ret gm_as_destroy(struct gm_as *as); -extern enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, +enum gm_ret gm_as_destroy(struct gm_as *as); +enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, bool activate, struct gm_context **out_ctx); -extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); -extern int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size); +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); +int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size); struct gm_page { struct list_head gm_page_list; @@ -257,6 +227,8 @@ struct gm_page { int gm_add_pages(unsigned int hnid, struct list_head *pages); struct gm_page *alloc_gm_page_struct(void); +#endif /* CONFIG_GMEM */ + #define gmem_err(fmt, ...) \ ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f012f7c7c4d4..5d6ee378d7d4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -612,74 +612,6 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ -#ifdef CONFIG_GMEM -/* - * Defines a centralized logical mapping table that reflects the mapping information - * regardless of the underlying arch-specific MMUs. - * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well - * as the filemap address_space struct from Linux page cache. - * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that - * the coordiantion between page tables must happen with CPU page table involved. 
That - * is to say, a generalized process unit must involve in a UVA-programming model, otherwise - * there is no point to support UVA programming. - * However, a VMA only needs to maintain logical mappings if the process has been - * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless - * we later build a reservation system on top of the logical mapping tables to support - * reservation-based superpages and rangeTLBs). - * A GM_REGION does not need to maintain logical mappings. In the case that a device wants - * to support its private address space with local physical memory, GMEM should forward address - * space management to the core VM, using VMAs, instead of using GM_REGIONs. - */ -struct vm_object { - spinlock_t lock; - struct vm_area_struct *vma; - - /* - * The logical_page_table is a container that holds the mapping - * information between a VA and a struct page. - */ - struct xarray *logical_page_table; - atomic_t nr_pages; - - /* - * a vm object might be referred by multiple VMAs to share - * memory. - */ - atomic_t ref_count; -}; - -#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ - -/** - * enum gm_as_alloc - defines different allocation policy for virtual addresses. - * - * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. - * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. - * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. - * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, - * beginning where the previous search ended. - */ -enum gm_as_alloc { - GM_AS_ALLOC_DEFAULT = 0, - GM_AS_ALLOC_FIRSTFIT, - GM_AS_ALLOC_BESTFIT, - GM_AS_ALLOC_NEXTFIT, -}; - -/* Defines an address space. */ -struct gm_as { - spinlock_t rbtree_lock; /* spinlock of struct gm_as */ - struct rb_root rbroot; /*root of gm_region_t */ - enum gm_as_alloc policy; - unsigned long start_va; - unsigned long end_va; - /* defines the VA unit size if an object cache is applied */ - unsigned long cache_quantum; - /* tracks device contexts attached to this va space, using gm_as_link */ - struct list_head gm_ctx_list; -}; -#endif - struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. */ diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index a3b7a6e9f978..8e40bc700640 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -4,6 +4,42 @@ #ifdef CONFIG_GMEM #include + +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. 
In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. + */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; + /* vm_object KPI */ int __init vm_object_init(void); struct vm_object *vm_object_create(struct vm_area_struct *vma); diff --git a/mm/gmem-internal.h b/mm/gmem-internal.h index 3ba381ff72b8..f0d4b3077cbd 100644 --- a/mm/gmem-internal.h +++ b/mm/gmem-internal.h @@ -4,6 +4,7 @@ #include #include +#ifdef CONFIG_GMEM #define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) @@ -174,4 +175,21 @@ unsigned long gm_vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +#else + +static inline vm_fault_t do_peer_shared_anonymous_page(struct vm_fault *vmf) { return 0; } +static inline unsigned long gmem_unmap_align(struct mm_struct *mm, + unsigned long start, size_t len) { return 0; } + +static inline void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len) { return; } +static inline unsigned long gm_vm_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff) { return 0; } + +#endif /* CONFIG_GMEM */ + #endif /* _GMEM_INTERNAL_H */ diff --git a/mm/gmem.c b/mm/gmem.c index 3d004b891522..57a93902cb50 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -452,7 +452,7 @@ enum gm_ret gm_as_destroy(struct gm_as *as) } EXPORT_SYMBOL_GPL(gm_as_destroy); -enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, +enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, bool activate, struct gm_context **out_ctx) { struct gm_context *ctx; @@ -465,12 +465,6 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode ctx->as = as; ctx->dev = dev; - ctx->pmap = NULL; - ret = dev->mmu->pmap_create(dev, &ctx->pmap); - if (ret) { - kmem_cache_free(gm_ctx_cache, ctx); - return ret; - } INIT_LIST_HEAD(&ctx->gm_dev_link); INIT_LIST_HEAD(&ctx->gm_as_link); diff --git a/mm/gmem_util.c b/mm/gmem_util.c index 8486690d3074..349810f08d94 100644 --- a/mm/gmem_util.c +++ b/mm/gmem_util.c @@ -432,6 +432,7 @@ unsigned long gm_vm_mmap_pgoff(struct file *file, unsigned long addr, enum gm_ret gm_ret; retry: + flag &= ~MAP_PEER_SHARED; ret = vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); if (!IS_ERR_VALUE(ret)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index da46c38b7595..a28dda799978 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,7 +38,6 @@ #include #include #include - #include #include diff --git a/mm/memory.c b/mm/memory.c index f3f5dd73d392..adf727bbd8df 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,7 +77,6 @@ #include #include #include - #include #include @@ -91,9 +90,7 @@ #include #include -#ifdef CONFIG_GMEM #include "gmem-internal.h" -#endif #include "pgalloc-track.h" #include "internal.h" @@ -5742,10 +5739,9 @@ static vm_fault_t do_numa_page(struct 
vm_fault *vmf) static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; -#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) return do_peer_shared_anonymous_page(vmf); -#endif if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) @@ -5992,14 +5988,12 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); -#ifdef CONFIG_GMEM if (vma_is_peer_shared(vma) && pmd_none(*vmf.pmd) && (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))) { /* if transparent hugepage is not enabled, return pagefault failed */ gmem_err("transparent hugepage is not enabled\n"); return VM_FAULT_SIGBUS; } -#endif if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && diff --git a/mm/mm_init.c b/mm/mm_init.c index f2d805dde378..6677aaa5972d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,9 +30,6 @@ #include "internal.h" #include "slab.h" #include "shuffle.h" -#ifdef CONFIG_GMEM -#include "gmem-internal.h" -#endif #include diff --git a/mm/mmap.c b/mm/mmap.c index 58a9b9375899..71e54eb33655 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -49,9 +49,7 @@ #include #include -#ifdef CONFIG_GMEM #include "gmem-internal.h" -#endif #include @@ -2787,7 +2785,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; -#ifdef CONFIG_GMEM if (gmem_is_enabled()) { unsigned long ret = gmem_unmap_align(mm, start, len); @@ -2796,7 +2793,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, else if (ret) len = ret; } -#endif if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2832,10 +2828,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, { VMA_ITERATOR(vmi, mm, start); -#ifdef CONFIG_GMEM if (gmem_is_enabled()) gmem_unmap_region(mm, start, len); -#endif return do_vmi_munmap(&vmi, mm, start, len, uf, false); } @@ -3120,10 +3114,8 @@ static int __vm_munmap(unsigned long start, size_t len, bool unlock) if (sp_check_addr(start)) return -EINVAL; -#ifdef CONFIG_GMEM if (gmem_is_enabled()) gmem_unmap_region(mm, start, len); -#endif if (mmap_write_lock_killable(mm)) return -EINTR; diff --git a/mm/util.c b/mm/util.c index 6c7a7d8b1eed..cc29469130ee 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,11 +27,8 @@ #include #include -#ifdef CONFIG_GMEM -#include #include "gmem-internal.h" -#endif #include "internal.h" #include "swap.h" @@ -554,10 +551,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long populate; LIST_HEAD(uf); -#ifdef CONFIG_GMEM if (gmem_is_enabled() && flag & MAP_PEER_SHARED) return gm_vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); -#endif ret = security_mmap_file(file, prot, flag); if (!ret) { -- Gitee From f8caf3a7a10789ca5412c392cffcdd25dca7648e Mon Sep 17 00:00:00 2001 From: xiuqing Date: Thu, 30 Oct 2025 12:12:13 +0800 Subject: [PATCH 31/34] remove capability in gm_dev --- include/linux/gmem.h | 16 +--------------- mm/gmem-internal.h | 2 -- mm/gmem.c | 4 +--- mm/gmem_util.c | 5 ----- 4 files changed, 2 insertions(+), 25 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index a6f346fbb45e..42a0d780b6c0 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -92,15 +92,6 @@ struct gm_mmu { enum gm_ret (*peer_hmemcpy)(struct gm_memcpy_t *gmc); }; -/** - * unsigned long defines a composable flag to describe the capabilities of a device. 
- * - * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. - * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS - */ -#define GM_DEV_CAP_REPLAYABLE 0x00000001 -#define GM_DEV_CAP_PEER 0x00000010 - #define NUM_IMPORT_PAGES 16 /* number of physical pages imported each time */ struct gm_context { @@ -120,11 +111,6 @@ struct gm_context { struct gm_dev { int id; - /* identifies the device capability - * For example, whether the device supports page faults or whether it has its - * own OS that manages the VA and PA resources. - */ - unsigned long capability; struct gm_mmu *mmu; void *dev_data; /* @@ -188,7 +174,7 @@ struct gm_as { }; /* GMEM Device KPI */ -enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, +enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, struct gm_dev **new_dev); int gm_dev_register_hnode(struct gm_dev *dev); enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, diff --git a/mm/gmem-internal.h b/mm/gmem-internal.h index f0d4b3077cbd..2614d4221c08 100644 --- a/mm/gmem-internal.h +++ b/mm/gmem-internal.h @@ -6,8 +6,6 @@ #include #ifdef CONFIG_GMEM -#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) - /* h-NUMA topology */ struct hnode { unsigned int id; diff --git a/mm/gmem.c b/mm/gmem.c index 57a93902cb50..b6750d297aa0 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -199,7 +199,7 @@ __setup("gmem=", setup_gmem); * The returned device pointer will be passed by new_dev. * A unique id will be assigned to the GMEM device, using Linux's xarray. */ -enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, +enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, struct gm_dev **new_dev) { struct gm_dev *dev; @@ -217,7 +217,6 @@ enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, return GM_RET_NOMEM; } - dev->capability = cap; dev->mmu = mmu; dev->dev_data = dev_data; dev->current_ctx = NULL; @@ -457,7 +456,6 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, { struct gm_context *ctx; int nid; - int ret; ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); if (!ctx) diff --git a/mm/gmem_util.c b/mm/gmem_util.c index 349810f08d94..fce865876dc2 100644 --- a/mm/gmem_util.c +++ b/mm/gmem_util.c @@ -214,9 +214,6 @@ unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, .prot = prot, }; - if (!gm_dev_is_peer(ctx->dev)) - continue; - if (!ctx->dev->mmu->peer_va_alloc_fixed) { pr_debug("gmem: mmu ops has no alloc_vma\n"); continue; @@ -346,8 +343,6 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar } while (addr += HPAGE_SIZE, addr != end); list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { - if (!gm_dev_is_peer(ctx->dev)) - continue; if (!ctx->dev->mmu->peer_va_free) continue; -- Gitee From 435d72d016881a9d7f29ba35abe801823579d9c7 Mon Sep 17 00:00:00 2001 From: xiuqing Date: Thu, 30 Oct 2025 15:23:32 +0800 Subject: [PATCH 32/34] recheck return value, whose type is gm_ret --- include/linux/gmem.h | 8 ++++---- mm/gmem-internal.h | 2 +- mm/gmem.c | 32 ++++++++++++++++---------------- mm/gmem_phys.c | 6 +++--- mm/gmem_util.c | 23 +++++++++++++---------- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 42a0d780b6c0..f0aba700a878 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -174,17 +174,17 @@ struct gm_as { }; /* GMEM 
Device KPI */ -enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, +int gm_dev_create(struct gm_mmu *mmu, void *dev_data, struct gm_dev **new_dev); int gm_dev_register_hnode(struct gm_dev *dev); enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior); /* GMEM address space KPI */ -enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, +int gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as); -enum gm_ret gm_as_destroy(struct gm_as *as); -enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, +int gm_as_destroy(struct gm_as *as); +int gm_as_attach(struct gm_as *as, struct gm_dev *dev, bool activate, struct gm_context **out_ctx); int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); diff --git a/mm/gmem-internal.h b/mm/gmem-internal.h index 2614d4221c08..7f1f58f30562 100644 --- a/mm/gmem-internal.h +++ b/mm/gmem-internal.h @@ -160,7 +160,7 @@ int __init gm_init_sysfs(void); void gm_deinit_sysfs(void); vm_fault_t do_peer_shared_anonymous_page(struct vm_fault *vmf); -unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, +int alloc_va_in_peer_devices(unsigned long addr, unsigned long len, unsigned long flag); void gmem_reserve_vma(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *head); diff --git a/mm/gmem.c b/mm/gmem.c index b6750d297aa0..cf058d944e63 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -199,22 +199,22 @@ __setup("gmem=", setup_gmem); * The returned device pointer will be passed by new_dev. * A unique id will be assigned to the GMEM device, using Linux's xarray. */ -enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, +int gm_dev_create(struct gm_mmu *mmu, void *dev_data, struct gm_dev **new_dev) { struct gm_dev *dev; if (!gmem_is_enabled()) - return GM_RET_FAILURE_UNKNOWN; + return -EINVAL; dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); if (!dev) - return GM_RET_NOMEM; + return -ENOMEM; if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, GFP_KERNEL)) { kmem_cache_free(gm_dev_cache, dev); - return GM_RET_NOMEM; + return -EAGAIN; } dev->mmu = mmu; @@ -223,7 +223,7 @@ enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, INIT_LIST_HEAD(&dev->gm_ctx_list); *new_dev = dev; nodes_clear(dev->registered_hnodes); - return GM_RET_SUCCESS; + return 0; } EXPORT_SYMBOL_GPL(gm_dev_create); @@ -413,7 +413,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, } /* GMEM Virtual Address Space API */ -enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, +int gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as) { struct gm_as *as; @@ -434,11 +434,11 @@ enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_allo INIT_LIST_HEAD(&as->gm_ctx_list); *new_as = as; - return GM_RET_SUCCESS; + return 0; } EXPORT_SYMBOL_GPL(gm_as_create); -enum gm_ret gm_as_destroy(struct gm_as *as) +int gm_as_destroy(struct gm_as *as) { struct gm_context *ctx, *tmp_ctx; @@ -447,11 +447,11 @@ enum gm_ret gm_as_destroy(struct gm_as *as) kmem_cache_free(gm_as_cache, as); - return GM_RET_SUCCESS; + return 0; } EXPORT_SYMBOL_GPL(gm_as_destroy); -enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, +int gm_as_attach(struct gm_as *as, struct gm_dev *dev, bool activate, struct gm_context **out_ctx) { struct 
gm_context *ctx; @@ -459,7 +459,7 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); if (!ctx) - return GM_RET_NOMEM; + return -ENOMEM; ctx->as = as; ctx->dev = dev; @@ -497,7 +497,7 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, */ for_each_node_mask(nid, dev->registered_hnodes) node_set(nid, current->mems_allowed); - return GM_RET_SUCCESS; + return 0; } EXPORT_SYMBOL_GPL(gm_as_attach); @@ -620,7 +620,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, struct gm_mapping *gm_mapping; struct vm_object *obj; struct hnode *hnode; - int ret; + enum gm_ret gm_ret; obj = vma->vm_obj; if (!obj) { @@ -645,9 +645,9 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, } else { gmf.va = start; gmf.dev = gm_mapping->dev; - ret = gm_mapping->dev->mmu->peer_unmap(&gmf); - if (ret) { - gmem_err("peer_unmap failed. ret %d\n", ret); + gm_ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (gm_ret) { + gmem_err("peer_unmap failed. ret %d\n", gm_ret); mutex_unlock(&gm_mapping->lock); continue; } diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index a3e3a4823daf..ffb822b0f566 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -309,7 +309,7 @@ enum gm_evict_ret gm_evict_page_locked(struct gm_page *gm_page) .copy = true }; enum gm_evict_ret ret = GM_EVICT_SUCCESS; - int err; + enum gm_ret gm_ret; gm_dev = get_gm_dev(gm_page->hnid); if (!gm_dev) @@ -373,8 +373,8 @@ enum gm_evict_ret gm_evict_page_locked(struct gm_page *gm_page) goto gm_mapping_unlock; } - err = gm_dev->mmu->peer_unmap(&gmf); - if (err) { + gm_ret = gm_dev->mmu->peer_unmap(&gmf); + if (gm_ret != GM_RET_SUCCESS) { gmem_err("%s: peer_unmap failed.", __func__); ret = GM_EVICT_DEVERR; goto dma_unmap; diff --git a/mm/gmem_util.c b/mm/gmem_util.c index fce865876dc2..1d0250b3dafe 100644 --- a/mm/gmem_util.c +++ b/mm/gmem_util.c @@ -173,14 +173,15 @@ vm_fault_t do_peer_shared_anonymous_page(struct vm_fault *vmf) return ret; } -unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, +int alloc_va_in_peer_devices(unsigned long addr, unsigned long len, unsigned long flag) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct gm_context *ctx, *tmp; unsigned long prot = VM_NONE; - enum gm_ret ret; + enum gm_ret gm_ret; + int ret; vma = find_vma(mm, addr); if (!vma) { @@ -219,9 +220,11 @@ unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, continue; } - ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); - if (ret != GM_RET_SUCCESS) { + gm_ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (gm_ret != GM_RET_SUCCESS) { gmem_err("device mmap failed\n"); + if (gm_ret == GM_RET_NOMEM) + ret = -ENOMEM; return ret; } } @@ -424,25 +427,25 @@ unsigned long gm_vm_mmap_pgoff(struct file *file, unsigned long addr, LIST_HEAD(reserve_list); unsigned int retry_times = 0; unsigned long ret; - enum gm_ret gm_ret; + int error = 0; retry: flag &= ~MAP_PEER_SHARED; - ret = vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); + error = vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); if (!IS_ERR_VALUE(ret)) { - gm_ret = alloc_va_in_peer_devices(ret, len, flag); + error = alloc_va_in_peer_devices(ret, len, flag); /** * if alloc_va_in_peer_devices failed * add vma to reserve_list and release after find a proper vma */ - if (gm_ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + if (error == -ENOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { retry_times++; 
gmem_reserve_vma(mm, ret, len, &reserve_list); goto retry; - } else if (gm_ret != GM_RET_SUCCESS) { - gmem_err("alloc vma ret %lu\n", ret); + } else if (error != 0) { ++ gmem_err("alloc vma ret %d\n", error); gmem_reserve_vma(mm, ret, len, &reserve_list); ret = -ENOMEM; } -- Gitee From 95bfe4b5977888c87f95d3659a2fe966be22af04 Mon Sep 17 00:00:00 2001 From: zhangjian Date: Wed, 22 Oct 2025 16:55:40 +0800 Subject: [PATCH 33/34] reverse refactor for vm_mmap_pgoff --- mm/gmem_util.c | 36 ------------------------------------ mm/util.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 38 deletions(-) diff --git a/mm/gmem_util.c b/mm/gmem_util.c index 1d0250b3dafe..c3c4d48b5a03 100644 --- a/mm/gmem_util.c +++ b/mm/gmem_util.c @@ -418,39 +418,3 @@ bool gm_mmap_check_flags(unsigned long flags) } return true; } - -unsigned long gm_vm_mmap_pgoff(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff) -{ - struct mm_struct *mm = current->mm; - LIST_HEAD(reserve_list); - unsigned int retry_times = 0; - unsigned long ret; - int error = 0; - -retry: - flag &= ~MAP_PEER_SHARED; - error = vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); - - if (!IS_ERR_VALUE(ret)) { - - error = alloc_va_in_peer_devices(ret, len, flag); - /** - * if alloc_va_in_peer_devices failed - * add vma to reserve_list and release after find a proper vma - */ - if (error == -ENOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { - retry_times++; - gmem_reserve_vma(mm, ret, len, &reserve_list); - goto retry; - } else if (error != 0) { -+ gmem_err("alloc vma ret %d\n", error); - gmem_reserve_vma(mm, ret, len, &reserve_list); - ret = -ENOMEM; - } - gmem_release_vma(mm, &reserve_list); - } - - return ret; -} \ No newline at end of file diff --git a/mm/util.c b/mm/util.c index cc29469130ee..e2537f508bbc 100644 --- a/mm/util.c +++ b/mm/util.c @@ -551,8 +551,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long populate; LIST_HEAD(uf); - if (gmem_is_enabled() && flag & MAP_PEER_SHARED) - return gm_vm_mmap_pgoff(file, addr, len, prot, flag, pgoff); +#ifdef CONFIG_GMEM + int error = 0; + LIST_HEAD(reserve_list); + unsigned int retry_times = 0; +retry: +#endif ret = security_mmap_file(file, prot, flag); if (!ret) { @@ -565,6 +569,27 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, if (populate) mm_populate(ret, populate); } + +#ifdef CONFIG_GMEM + if (!IS_ERR_VALUE(ret) && gmem_is_enabled() && flag & MAP_PEER_SHARED) { + error = alloc_va_in_peer_devices(ret, len, flag); + /** + * if alloc_va_in_peer_devices failed + * add vma to reserve_list and release after find a proper vma + */ + if (error == -ENOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + gmem_reserve_vma(mm, ret, len, &reserve_list); + goto retry; + } else if (error != 0) { + gmem_err("alloc vma ret %d\n", error); + gmem_reserve_vma(mm, ret, len, &reserve_list); + ret = -ENOMEM; + } + gmem_release_vma(mm, &reserve_list); + } +#endif + return ret; } -- Gitee From eaf50200c25c89823efc4f523ad00cd35a730302 Mon Sep 17 00:00:00 2001 From: zhangjian Date: Wed, 22 Oct 2025 21:38:17 +0800 Subject: [PATCH 34/34] fix some refactor bug --- include/linux/gmem.h | 2 -- mm/gmem_util.c | 15 ++++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index f0aba700a878..979ff154434f 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -127,8 +127,6 @@ struct gm_dev { /* Add 
tracking of registered device local physical memory. */ nodemask_t registered_hnodes; struct device *dma_dev; - - struct gm_mapping *gm_mapping; }; /* Records the status of a page-size physical page */ diff --git a/mm/gmem_util.c b/mm/gmem_util.c index c3c4d48b5a03..b2bdc0dfb7ae 100644 --- a/mm/gmem_util.c +++ b/mm/gmem_util.c @@ -96,12 +96,12 @@ vm_fault_t do_peer_shared_anonymous_page(struct vm_fault *vmf) struct gm_mapping *gm_mapping; vm_fault_t ret = 0; - if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) - return VM_FAULT_FALLBACK; - ret = vmf_anon_prepare(vmf); - if (ret) - return ret; - khugepaged_enter_vma(vma, vma->vm_flags); + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) + return VM_FAULT_FALLBACK; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; + khugepaged_enter_vma(vma, vma->vm_flags); gm_mapping = vma_prepare_gm_mapping(vma, haddr); if (!gm_mapping) @@ -216,7 +216,7 @@ int alloc_va_in_peer_devices(unsigned long addr, unsigned long len, }; if (!ctx->dev->mmu->peer_va_alloc_fixed) { - pr_debug("gmem: mmu ops has no alloc_vma\n"); + gmem_err("gmem: mmu ops has no alloc_vma\n"); continue; } @@ -227,6 +227,7 @@ int alloc_va_in_peer_devices(unsigned long addr, unsigned long len, ret = -ENOMEM; return ret; } + ret = 0; } if (!vma->vm_obj) -- Gitee
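
For reference, the KPI surface that results from this series can be exercised as
follows. This is a minimal sketch of a hypothetical peer-device driver, assuming only
the signatures visible in the patches above (plain int returns, no capability argument
to gm_dev_create(), bool activate in gm_as_attach()); the my_* names, the callback
bodies and the error handling are illustrative assumptions, not part of the series.

#include <linux/gmem.h>
#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical device callbacks; a real driver programs its own MMU here. */
static enum gm_ret my_peer_va_alloc_fixed(struct gm_fault_t *gmf)
{
	/* Reserve [gmf->va, gmf->va + gmf->size) in the device VA space. */
	return GM_RET_SUCCESS;
}

static enum gm_ret my_peer_hmemcpy(struct gm_memcpy_t *gmc)
{
	/* Copy gmc->size bytes between host and device memory. */
	return GM_RET_SUCCESS;
}

static struct gm_mmu my_mmu = {
	.peer_va_alloc_fixed = my_peer_va_alloc_fixed,
	.peer_hmemcpy = my_peer_hmemcpy,
	/* peer_va_free/peer_map/peer_unmap/import_phys_mem omitted for brevity */
};

static struct gm_dev *my_gm_dev;

/* Probe path: create the GMEM device and register its h-NUMA node. */
static int my_driver_register(void *dev_data)
{
	int ret;

	/* The capability argument is gone after "remove capability in gm_dev". */
	ret = gm_dev_create(&my_mmu, dev_data, &my_gm_dev);
	if (ret)
		return ret;

	return gm_dev_register_hnode(my_gm_dev);
}

/* Per-process path: create the VA space once, then attach this device. */
static int my_driver_attach_current(void)
{
	struct mm_struct *mm = current->mm;
	struct gm_context *ctx;
	int ret;

	if (!mm->gm_as) {
		ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT,
				   HPAGE_SIZE, &mm->gm_as);
		if (ret)
			return ret;
	}

	/* gm_as_attach() now takes a plain bool instead of enum gm_mmu_mode. */
	return gm_as_attach(mm->gm_as, my_gm_dev, true, &ctx);
}

/*
 * Device physical memory is donated with gm_add_pages(hnid, &page_list),
 * where each entry is a struct gm_page obtained from alloc_gm_page_struct();
 * the device-specific fields a driver must fill in are not shown here.
 */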