From 1d2565aa2945e261157313bee7ebdfd2b8c9d164 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 11 Sep 2025 20:21:33 +0800 Subject: [PATCH 01/21] mm/hugetlb: fix folio is still mapped when deleted mainline inclusion from mainline-v6.17 commit 7b7387650dcf2881fd8bb55bcf3c8bd6c9542dd7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7b7387650dcf2881fd8bb55bcf3c8bd6c9542dd7 ------------------------------------------- Migration may be raced with fallocating hole. remove_inode_single_folio will unmap the folio if the folio is still mapped. However, it's called without folio lock. If the folio is migrated and the mapped pte has been converted to migration entry, folio_mapped() returns false, and won't unmap it. Due to extra refcount held by remove_inode_single_folio, migration fails, restores migration entry to normal pte, and the folio is mapped again. As a result, we triggered BUG in filemap_unaccount_folio. The log is as follows: BUG: Bad page cache in process hugetlb pfn:156c00 page: refcount:515 mapcount:0 mapping:0000000099fef6e1 index:0x0 pfn:0x156c00 head: order:9 mapcount:1 entire_mapcount:1 nr_pages_mapped:0 pincount:0 aops:hugetlbfs_aops ino:dcc dentry name(?):"my_hugepage_file" flags: 0x17ffffc00000c1(locked|waiters|head|node=0|zone=2|lastcpupid=0x1fffff) page_type: f4(hugetlb) page dumped because: still mapped when deleted CPU: 1 UID: 0 PID: 395 Comm: hugetlb Not tainted 6.17.0-rc5-00044-g7aac71907bde-dirty #484 NONE Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x4f/0x70 filemap_unaccount_folio+0xc4/0x1c0 __filemap_remove_folio+0x38/0x1c0 filemap_remove_folio+0x41/0xd0 remove_inode_hugepages+0x142/0x250 hugetlbfs_fallocate+0x471/0x5a0 vfs_fallocate+0x149/0x380 Hold folio lock before checking if the folio is mapped to avoid race with migration. 
Fixes: 4aae8d1c051e ("mm/hugetlbfs: unmap pages if page fault raced with hole punch") Signed-off-by: Jinjiang Tu --- fs/hugetlbfs/inode.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fb0d3db162c2..5e65b0d40566 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -620,14 +620,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, /* * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) while holding - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. + * unmapped in caller or hugetlb_vmdelete_list() skips + * unmapping it due to fail to grab lock. Unmap (again) + * while holding the fault mutex. The mutex will prevent + * faults until we finish removing the folio. Hold folio + * lock to guarantee no concurrent migration. */ + folio_lock(folio); if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); - folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In -- Gitee From 37dd9a48ab30c3bece06e0ac0b567fb6f37a198e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 20 Oct 2023 11:17:01 +0800 Subject: [PATCH 02/21] mm/memory_hotplug: support to hotplug memory on ZONE_EXTMEM hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Support to hotplug memory on ZONE_EXTMEM. 
Signed-off-by: Liu Shixin --- drivers/base/memory.c | 10 ++++++++++ include/linux/memory_hotplug.h | 4 ++++ include/linux/mmzone.h | 22 ++++++++++++++++++++++ mm/memory_hotplug.c | 15 ++++++++++++++- mm/page_alloc.c | 13 +++++++++---- 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8a13babd826c..bca481b3b3e3 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -33,6 +33,9 @@ static const char *const online_type_to_str[] = { [MMOP_ONLINE] = "online", [MMOP_ONLINE_KERNEL] = "online_kernel", [MMOP_ONLINE_MOVABLE] = "online_movable", +#ifdef CONFIG_ZONE_EXTMEM + [MMOP_ONLINE_EXTMEM] = "online_extmem", +#endif }; int mhp_online_type_from_str(const char *str) @@ -371,6 +374,9 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: +#ifdef CONFIG_ZONE_EXTMEM + case MMOP_ONLINE_EXTMEM: +#endif case MMOP_ONLINE: /* mem->online_type is protected by device_hotplug_lock */ mem->online_type = online_type; @@ -460,6 +466,10 @@ static ssize_t valid_zones_show(struct device *dev, MMOP_ONLINE_KERNEL, default_zone); len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE, default_zone); +#ifdef CONFIG_ZONE_EXTMEM + len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, + MMOP_ONLINE_EXTMEM, default_zone); +#endif out: len += sysfs_emit_at(buf, len, "\n"); return len; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 0580ddf546fc..feda1e6e24f6 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -81,6 +81,10 @@ enum { MMOP_ONLINE_KERNEL, /* Online the memory to ZONE_MOVABLE. */ MMOP_ONLINE_MOVABLE, +#ifdef CONFIG_ZONE_EXTMEM + /* Online the memory to ZONE_EXTMEM. */ + MMOP_ONLINE_EXTMEM, +#endif }; /* Flags for add_memory() and friends to specify memory hotplug details. 
*/ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ebaa85d5ac70..453872559e22 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1535,6 +1535,28 @@ static inline bool zone_is_zone_device(struct zone *zone) } #endif +#ifdef CONFIG_ZONE_EXTMEM +static inline bool is_zone_extmem_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_EXTMEM; +} + +static inline bool zone_is_zone_extmem(struct zone *zone) +{ + return zone_idx(zone) == ZONE_EXTMEM; +} +#else +static inline bool is_zone_extmem_page(const struct page *page) +{ + return false; +} + +static inline bool zone_is_zone_extmem(struct zone *zone) +{ + return false; +} +#endif + /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c21408963ec1..777d16005b7a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,6 +1055,11 @@ struct zone *zone_for_pfn_range(int online_type, int nid, if (online_type == MMOP_ONLINE_MOVABLE) return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; +#ifdef CONFIG_ZONE_EXTMEM + if (online_type == MMOP_ONLINE_EXTMEM) + return &NODE_DATA(nid)->node_zones[ZONE_EXTMEM]; +#endif + if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); @@ -1836,13 +1841,16 @@ static void node_states_check_changes_offline(unsigned long nr_pages, /* * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM * does not apply as we don't support 32bit. - * Here we count the possible pages from ZONE_MOVABLE. + * Here we count the possible pages from ZONE_MOVABLE and ZONE_EXTMEM. * If after having accounted all the pages, we see that the nr_pages * to be offlined is over or equal to the accounted pages, * we know that the node will become empty, and so, we can clear * it for N_MEMORY as well. 
*/ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; +#ifdef CONFIG_ZONE_EXTMEM + present_pages += pgdat->node_zones[ZONE_EXTMEM].present_pages; +#endif if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); @@ -2278,6 +2286,11 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg) if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) online_type = MMOP_ONLINE_MOVABLE; +#ifdef CONFIG_ZONE_EXTMEM + if (page && is_zone_extmem_page(page)) + online_type = MMOP_ONLINE_EXTMEM; +#endif + rc = device_offline(&mem->dev); /* * Default is MMOP_OFFLINE - change it only if offlining succeeded, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c233d61d0d06..e65d4820d9b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6100,9 +6100,13 @@ static void __setup_per_zone_wmarks(void) struct zone *zone; unsigned long flags; - /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */ + /* + * Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE and + * !ZONE_EXTMEM pages. + */ for_each_zone(zone) { - if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE) + if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE + && !zone_is_zone_extmem(zone)) lowmem_pages += zone_managed_pages(zone); } @@ -6112,7 +6116,8 @@ static void __setup_per_zone_wmarks(void) spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); - if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { + if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE || + zone_is_zone_extmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't * need highmem and movable zones pages, so cap pages_min @@ -6120,7 +6125,7 @@ static void __setup_per_zone_wmarks(void) * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas control async page reclaim, and so should - * not be capped for highmem and movable zones. + * not be capped for highmem, movable and extmem zones. 
*/ unsigned long min_pages; -- Gitee From a0106140ef89ef57244489830c921a222f05f2dd Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 13 Nov 2023 11:18:27 +0800 Subject: [PATCH 03/21] mm/numa_remote: prepare numa node for remote memory hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- In order to support hotplug remote memory, prepare some memory-less node. Remote memory can hotplug on these numa node to distinguish from local memory. Signed-off-by: Liu Shixin --- drivers/base/Kconfig | 11 ++++++ drivers/base/Makefile | 1 + drivers/base/arch_numa.c | 7 +++- drivers/base/numa_remote.c | 68 +++++++++++++++++++++++++++++++++++++ include/linux/numa_remote.h | 24 +++++++++++++ 5 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 drivers/base/numa_remote.c create mode 100644 include/linux/numa_remote.h diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 2b8fd6bb7da0..584ee676aa88 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -175,6 +175,17 @@ config HMEM_REPORTING Enable reporting for heterogeneous memory access attributes under their non-uniform memory nodes. +config NUMA_REMOTE + bool "Establish numa node for remote memory" + depends on NUMA + depends on ARM64 && ARM64_4K_PAGES + depends on SPARSEMEM_VMEMMAP + depends on ZONE_EXTMEM + help + This option will initial any useless node as memory-less node. + In order to support online remote memory on these node. These + Such node is called remote node. 
+ source "drivers/base/test/Kconfig" config SYS_HYPERVISOR diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 3079bfe53d04..ed91fb97f930 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_ISA_BUS_API) += isa.o obj-y += firmware_loader/ obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o +obj-$(CONFIG_NUMA_REMOTE) += numa_remote.o ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_MODULES) += module.o endif diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 96281de7010d..a7c82b3c89d1 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -441,13 +442,17 @@ static int __init numa_register_nodes(void) } } + numa_register_remote_nodes(); + /* Finally register nodes. */ for_each_node_mask(nid, numa_nodes_parsed) { unsigned long start_pfn, end_pfn; get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); setup_node_data(nid, start_pfn, end_pfn); - node_set_online(nid); + + if (!numa_is_remote_node(nid)) + node_set_online(nid); } /* Setup online nodes to actual nodes*/ diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c new file mode 100644 index 000000000000..f6be837d06d7 --- /dev/null +++ b/drivers/base/numa_remote.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 Huawei Technologies Co., Ltd. 
+ * Author: Liu Shixin + */ + +#define pr_fmt(fmt) "NUMA remote: " fmt + +#include + +/* The default distance between local node and remote node */ +#define REMOTE_TO_LOCAL_DISTANCE 100 +/* The default distance between two remtoe node */ +#define REMOTE_TO_REMOTE_DISTANCE 254 + +static bool numa_remote_enabled __ro_after_init; +static nodemask_t numa_nodes_remote; + +bool numa_is_remote_node(int nid) +{ + return !!node_isset(nid, numa_nodes_remote); +} +EXPORT_SYMBOL_GPL(numa_is_remote_node); + +static void numa_remote_reset_distance(int nid) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (i == nid) + continue; + if (!numa_is_remote_node(i)) { + numa_set_distance(i, nid, REMOTE_TO_LOCAL_DISTANCE); + numa_set_distance(nid, i, REMOTE_TO_LOCAL_DISTANCE); + } else { + numa_set_distance(i, nid, REMOTE_TO_REMOTE_DISTANCE); + numa_set_distance(nid, i, REMOTE_TO_REMOTE_DISTANCE); + } + } +} + +void __init numa_register_remote_nodes(void) +{ + int i; + + if (!numa_remote_enabled) + return; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!node_test_and_set(i, numa_nodes_parsed)) + node_set(i, numa_nodes_remote); + } + + for (i = 0; i < MAX_NUMNODES; i++) { + if (numa_is_remote_node(i)) + numa_remote_reset_distance(i); + } + + pr_info("%d nodes", nodes_weight(numa_nodes_remote)); +} + +static int __init numa_parse_remote_nodes(char *buf) +{ + numa_remote_enabled = true; + + return 0; +} +early_param("numa_remote", numa_parse_remote_nodes); diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h new file mode 100644 index 000000000000..e85cb22b9e1f --- /dev/null +++ b/include/linux/numa_remote.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 Huawei Technologies Co., Ltd. 
+ * Author: Liu Shixin + */ +#ifndef _LINUX_REMOTE_MEMORY_H_ +#define _LINUX_REMOTE_MEMORY_H_ + +#include + +#ifdef CONFIG_NUMA_REMOTE +bool numa_is_remote_node(int nid); +void numa_register_remote_nodes(void); +#else +static inline bool numa_is_remote_node(int nid) +{ + return false; +} + +static inline void numa_register_remote_nodes(void) +{ +} +#endif +#endif /* _LINUX_REMOTE_MEMORY_H_ */ -- Gitee From 5d0bd1321546c9cb819289181149481eecb491e5 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 24 Nov 2023 11:28:19 +0800 Subject: [PATCH 04/21] mm/numa_remote: support to hotplug/hotremove remote memory hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Add two function add_memory_remote() and remove_memory_remote() to hotplug remote memory and hotremove remote memory respectively. Signed-off-by: Liu Shixin --- drivers/base/arch_numa.c | 2 +- drivers/base/numa_remote.c | 88 +++++++++++++++++++++++++++++++++++++ include/asm-generic/numa.h | 2 +- include/linux/numa_remote.h | 12 +++++ mm/memory_hotplug.c | 24 ++++++++-- 5 files changed, 123 insertions(+), 5 deletions(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index a7c82b3c89d1..0e615ed23635 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -389,7 +389,7 @@ static int __init numa_alloc_distance(void) * If @from or @to is higher than the highest known node or lower than zero * or @distance doesn't make sense, the call is ignored. 
*/ -void __init numa_set_distance(int from, int to, int distance) +void __ref numa_set_distance(int from, int to, int distance) { if (!numa_distance) { pr_warn_once("Warning: distance table not allocated yet\n"); diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index f6be837d06d7..baf35cf68529 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) "NUMA remote: " fmt +#include #include /* The default distance between local node and remote node */ @@ -66,3 +67,90 @@ static int __init numa_parse_remote_nodes(char *buf) return 0; } early_param("numa_remote", numa_parse_remote_nodes); + +static int find_unused_remote_node(void) +{ + int nid; + + for_each_node_mask(nid, numa_nodes_remote) { + if (!node_online(nid)) + return nid; + } + + return NUMA_NO_NODE; +} + +/* + * Add remote memory to the system as system RAM from CXL or UB. + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM (Remote)". + * + * @nid: which node to online + * @start: start address of memory range + * @size: size of memory range + * @flags: memory hotplug flags + * + * Returns: + * node in case add memory succeed. + * NUMA_NO_NODE in case add memory failed. + */ +int add_memory_remote(int nid, u64 start, u64 size, int flags) +{ + int real_nid = NUMA_NO_NODE; + + if (!numa_remote_enabled) + return NUMA_NO_NODE; + + if (nid < NUMA_NO_NODE || nid >= MAX_NUMNODES) + return NUMA_NO_NODE; + + if (nid != NUMA_NO_NODE && !numa_is_remote_node(nid)) + return NUMA_NO_NODE; + + lock_device_hotplug(); + + real_nid = (nid == NUMA_NO_NODE) ? find_unused_remote_node() : nid; + if (real_nid == NUMA_NO_NODE) + goto unlock; + + if (__add_memory(real_nid, start, size, MHP_MERGE_RESOURCE)) + real_nid = NUMA_NO_NODE; + +unlock: + unlock_device_hotplug(); + + return real_nid; +} +EXPORT_SYMBOL_GPL(add_memory_remote); + +/* + * Remove remote memory. + * + * Returns: + * 0 in case of memory hotremove succeed. 
+ * -errno in case of memory hotremove failed. + */ +int remove_memory_remote(int nid, u64 start, u64 size) +{ + int ret = -EINVAL; + + if (!numa_remote_enabled) + return -EINVAL; + + if (nid <= NUMA_NO_NODE || nid >= MAX_NUMNODES) + return -EINVAL; + + if (!numa_is_remote_node(nid) || !node_online(nid)) + return -EINVAL; + + ret = offline_and_remove_memory(start, size); + if (ret) + goto out; + + if (!node_online(nid)) + numa_remote_reset_distance(nid); + +out: + return ret; +} +EXPORT_SYMBOL_GPL(remove_memory_remote); diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h index c32e0cf23c90..1c24d9acb858 100644 --- a/include/asm-generic/numa.h +++ b/include/asm-generic/numa.h @@ -32,7 +32,7 @@ static inline const struct cpumask *cpumask_of_node(int node) void __init arch_numa_init(void); int __init numa_add_memblk(int nodeid, u64 start, u64 end); -void __init numa_set_distance(int from, int to, int distance); +void numa_set_distance(int from, int to, int distance); void __init numa_free_distance(void); void __init early_map_cpu_to_node(unsigned int cpu, int nid); int __init early_cpu_to_node(int cpu); diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h index e85cb22b9e1f..9e1dc4e738d4 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -11,6 +11,8 @@ #ifdef CONFIG_NUMA_REMOTE bool numa_is_remote_node(int nid); void numa_register_remote_nodes(void); +int add_memory_remote(int nid, u64 start, u64 size, int flags); +int remove_memory_remote(int nid, u64 start, u64 size); #else static inline bool numa_is_remote_node(int nid) { @@ -20,5 +22,15 @@ static inline bool numa_is_remote_node(int nid) static inline void numa_register_remote_nodes(void) { } + +static inline int add_memory_remote(int nid, u64 start, u64 size, int flags) +{ + return NUMA_NO_NODE; +} + +static inline int remove_memory_remote(int nid, u64 start, u64 size) +{ + return -EINVAL; +} #endif #endif /* _LINUX_REMOTE_MEMORY_H_ */ diff --git 
a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 777d16005b7a..66e05b6984d2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -1317,7 +1318,17 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { +#ifdef CONFIG_ZONE_EXTMEM + int nid = *(int *)arg; + + if (numa_is_remote_node(nid)) + mem->online_type = MMOP_ONLINE_EXTMEM; + else + mem->online_type = mhp_default_online_type; +#else mem->online_type = mhp_default_online_type; +#endif + return device_online(&mem->dev); } @@ -1500,8 +1511,9 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (mhp_default_online_type != MMOP_OFFLINE) - walk_memory_blocks(start, size, NULL, online_memory_block); + if (mhp_default_online_type != MMOP_OFFLINE || + numa_is_remote_node(nid)) + walk_memory_blocks(start, size, &nid, online_memory_block); return ret; error_free: @@ -1518,9 +1530,15 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; + char *resource_name; int ret; - res = register_memory_resource(start, size, "System RAM"); + if (numa_is_remote_node(nid)) + resource_name = "System RAM (Remote)"; + else + resource_name = "System RAM"; + + res = register_memory_resource(start, size, resource_name); if (IS_ERR(res)) return PTR_ERR(res); -- Gitee From 3194b7c13145b1278b6c9726fe674fee621ad557 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 12 Jun 2024 17:11:39 +0800 Subject: [PATCH 05/21] mm/numa_remote: support to set node distance hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Support to modify distance of remote node dynamically. 
Signed-off-by: Liu Shixin --- drivers/base/numa_remote.c | 28 ++++++++++++++++++++++++++++ include/linux/numa_remote.h | 8 ++++++++ 2 files changed, 36 insertions(+) diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index baf35cf68529..7bea95d0c8a3 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -154,3 +154,31 @@ int remove_memory_remote(int nid, u64 start, u64 size) return ret; } EXPORT_SYMBOL_GPL(remove_memory_remote); + +int numa_remote_set_distance(int target, int *node_ids, int *node_distances, + int count) +{ + int i; + + if (!numa_remote_enabled) + return -EINVAL; + + if (target <= NUMA_NO_NODE || target >= MAX_NUMNODES) + return -EINVAL; + + if (!numa_is_remote_node(target)) + return -EINVAL; + + for (i = 0; i < count; i++) { + if (numa_is_remote_node(node_ids[i])) + return -EINVAL; + } + + for (i = 0; i < count; i++) { + numa_set_distance(target, node_ids[i], node_distances[i]); + numa_set_distance(node_ids[i], target, node_distances[i]); + } + + return 0; +} +EXPORT_SYMBOL_GPL(numa_remote_set_distance); diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h index 9e1dc4e738d4..07f59bbb3334 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -13,6 +13,8 @@ bool numa_is_remote_node(int nid); void numa_register_remote_nodes(void); int add_memory_remote(int nid, u64 start, u64 size, int flags); int remove_memory_remote(int nid, u64 start, u64 size); +int numa_remote_set_distance(int target, int *node_ids, int *node_distances, + int count); #else static inline bool numa_is_remote_node(int nid) { @@ -32,5 +34,11 @@ static inline int remove_memory_remote(int nid, u64 start, u64 size) { return -EINVAL; } + +static inline int numa_remote_set_distance(int target, int *node_ids, + int *node_distances, int count) +{ + return -EINVAL; +} #endif #endif /* _LINUX_REMOTE_MEMORY_H_ */ -- Gitee From 68026203305dd04f6f23b762c53fbbabff807bb3 Mon Sep 17 00:00:00 2001 From: Liu Shixin 
Date: Thu, 7 Dec 2023 22:47:00 +0800 Subject: [PATCH 06/21] mm/numa_remote: introduce nofallback mode for remote node hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Skip remote node in find_next_best_node so remote node is not in the zonelists of other nodes. In this mode, the remote pages can be only allocated by specifying the remote node. Signed-off-by: Liu Shixin --- drivers/base/numa_remote.c | 24 ++++++++++++++++++++++++ include/linux/numa_remote.h | 6 ++++++ mm/mempolicy.c | 12 ++++++++++++ mm/page_alloc.c | 5 +++++ 4 files changed, 47 insertions(+) diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index 7bea95d0c8a3..b9afaaccf53a 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -15,6 +15,8 @@ #define REMOTE_TO_REMOTE_DISTANCE 254 static bool numa_remote_enabled __ro_after_init; +static bool numa_remote_nofallback_mode __ro_after_init; + static nodemask_t numa_nodes_remote; bool numa_is_remote_node(int nid) @@ -23,6 +25,11 @@ bool numa_is_remote_node(int nid) } EXPORT_SYMBOL_GPL(numa_is_remote_node); +bool numa_remote_nofallback(int nid) +{ + return numa_remote_nofallback_mode && numa_is_remote_node(nid); +} + static void numa_remote_reset_distance(int nid) { int i; @@ -60,10 +67,27 @@ void __init numa_register_remote_nodes(void) pr_info("%d nodes", nodes_weight(numa_nodes_remote)); } +/* + * Parse a series of numa_remote options. + * + * 'nofallback': skip remote node from zonelists. 
+ */ static int __init numa_parse_remote_nodes(char *buf) { numa_remote_enabled = true; + if (!buf) + return 0; + + while (*buf) { + if (!strncmp(buf, "nofallback", 10)) + numa_remote_nofallback_mode = true; + + buf += strcspn(buf, ","); + while (*buf == ',') + buf++; + } + return 0; } early_param("numa_remote", numa_parse_remote_nodes); diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h index 07f59bbb3334..2bb79b25732f 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -10,6 +10,7 @@ #ifdef CONFIG_NUMA_REMOTE bool numa_is_remote_node(int nid); +bool numa_remote_nofallback(int nid); void numa_register_remote_nodes(void); int add_memory_remote(int nid, u64 start, u64 size, int flags); int remove_memory_remote(int nid, u64 start, u64 size); @@ -21,6 +22,11 @@ static inline bool numa_is_remote_node(int nid) return false; } +static inline bool numa_remote_nofallback(int nid) +{ + return false; +} + static inline void numa_register_remote_nodes(void) { } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a82aab7ab47a..8d7732e276f3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include @@ -2063,6 +2064,17 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) policy->home_node != NUMA_NO_NODE) return policy->home_node; + /* + * In nofallback mode, the remote node is not in zonelists, + * set remote node as preferred_nid or it will be skipped. + * MPOL_PREFERRED_MANY is not supported, becase at least + * one remote node that will be skipped. 
+ */ + if (policy->mode == MPOL_BIND) { + if (numa_remote_nofallback(first_node(policy->nodes))) + return first_node(policy->nodes); + } + return nd; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e65d4820d9b4..a6c4e6c88404 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include "internal.h" #include "shuffle.h" @@ -5299,6 +5300,10 @@ int find_next_best_node(int node, nodemask_t *used_node_mask) if (node_isset(n, *used_node_mask)) continue; + /* Don't fallback to remote node */ + if (numa_remote_nofallback(n)) + continue; + /* Use the distance array to find the distance */ val = node_distance(node, n); -- Gitee From 8e27b94c109cfa7d1e61d8c386d45ff88d69bcc0 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 11 Jun 2024 12:37:45 +0800 Subject: [PATCH 07/21] mm/numa_remote: introduce pre-online mode to support hotplug unready memory hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- To support hotplug unready memory, register online_page callback to keep these pages isolated. After the memory is ready, call numa_remote_undo_fake_online() to undo isolate these pages. While these pages is isolated, reuse vmemmap optimization feature to reduce vmemmap size. 
Signed-off-by: Liu Shixin Signed-off-by: Jinjiang Tu --- drivers/base/memory.c | 57 +++++++ drivers/base/numa_remote.c | 283 ++++++++++++++++++++++++++++++++- include/linux/memory.h | 23 +++ include/linux/memory_hotplug.h | 7 + include/linux/numa_remote.h | 9 ++ mm/hugetlb_vmemmap.c | 37 +++++ mm/hugetlb_vmemmap.h | 12 ++ mm/memory_hotplug.c | 5 +- 8 files changed, 430 insertions(+), 3 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bca481b3b3e3..e725251e4e74 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -25,6 +25,7 @@ #include #include +#include #define MEMORY_CLASS_NAME "memory" @@ -916,6 +917,62 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) } } +#ifdef CONFIG_NUMA_REMOTE +bool check_memory_block_nid(unsigned long start, unsigned long size, int nid) +{ + unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); + unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); + unsigned long block_id; + struct memory_block *mem; + + for (block_id = start_block_id; block_id != end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id); + if (!mem) + return false; + + if (mem->nid != nid) + return false; + } + return true; +} + +bool check_memory_block_pre_online(unsigned long start, unsigned long size, + bool pre_online) +{ + unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); + unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); + unsigned long block_id; + struct memory_block *mem; + + for (block_id = start_block_id; block_id != end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id); + if (!mem) + return false; + + if (mem->pre_online != pre_online) + return false; + } + return true; +} + +void set_memory_block_pre_online(unsigned long start, unsigned long size, + bool pre_online) +{ + unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); + unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + 
size)); + unsigned long block_id; + struct memory_block *mem; + + for (block_id = start_block_id; block_id != end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id); + if (!mem) + continue; + + mem->pre_online = pre_online; + } +} +#endif + static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index b9afaaccf53a..344c80a124c4 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -7,7 +7,12 @@ #define pr_fmt(fmt) "NUMA remote: " fmt #include +#include +#include +#include #include +#include "../../mm/hugetlb_vmemmap.h" +#include "../../mm/internal.h" /* The default distance between local node and remote node */ #define REMOTE_TO_LOCAL_DISTANCE 100 @@ -16,9 +21,12 @@ static bool numa_remote_enabled __ro_after_init; static bool numa_remote_nofallback_mode __ro_after_init; +static bool numa_remote_preonline_mode __ro_after_init; static nodemask_t numa_nodes_remote; +static DEFINE_MUTEX(numa_remote_lock); + bool numa_is_remote_node(int nid) { return !!node_isset(nid, numa_nodes_remote); @@ -30,6 +38,11 @@ bool numa_remote_nofallback(int nid) return numa_remote_nofallback_mode && numa_is_remote_node(nid); } +bool numa_remote_preonline(int nid) +{ + return numa_remote_preonline_mode && numa_is_remote_node(nid); +} + static void numa_remote_reset_distance(int nid) { int i; @@ -71,6 +84,7 @@ void __init numa_register_remote_nodes(void) * Parse a series of numa_remote options. * * 'nofallback': skip remote node from zonelists. + * 'preonline': support to online remote memory before it is ready. 
*/ static int __init numa_parse_remote_nodes(char *buf) { @@ -82,6 +96,8 @@ static int __init numa_parse_remote_nodes(char *buf) while (*buf) { if (!strncmp(buf, "nofallback", 10)) numa_remote_nofallback_mode = true; + else if (!strncmp(buf, "preonline", 9)) + numa_remote_preonline_mode = true; buf += strcspn(buf, ","); while (*buf == ',') @@ -92,6 +108,188 @@ static int __init numa_parse_remote_nodes(char *buf) } early_param("numa_remote", numa_parse_remote_nodes); +static void numa_remote_optimize_vmemmap(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + + for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { + page = pfn_to_page(pfn); + if (!page) + continue; + + fake_online_pages_vmemmap_optimize(page, MAX_ORDER_NR_PAGES); + } +} + +static int numa_remote_restore_vmemmap(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + int ret; + + for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { + page = pfn_to_page(pfn); + if (!page) + continue; + + ret = fake_online_pages_vmemmap_restore(page, MAX_ORDER_NR_PAGES); + if (ret) { + numa_remote_optimize_vmemmap(start_pfn, pfn); + return ret; + } + } + + return 0; +} + +static void numa_remote_preonline_going_offline(unsigned long pfn, + unsigned long nr_pages) +{ + struct page *page; + unsigned long i; + + adjust_managed_page_count(pfn_to_page(pfn), nr_pages); + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(pfn + i); + if (WARN_ON(!page_ref_dec_and_test(page))) + dump_page(page, "preonline page referenced"); + } +} + +static void numa_remote_preonline_cancel_offline(unsigned long pfn, + unsigned long nr_pages) +{ + unsigned long i; + + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + for (i = 0; i < nr_pages; i++) + page_ref_inc(pfn_to_page(pfn + i)); +} + +static int numa_remote_memory_notifier_cb(struct notifier_block *nb, + unsigned long action, void *arg) +{ + struct memory_notify *mhp 
= arg; + const unsigned long start = PFN_PHYS(mhp->start_pfn); + const unsigned long size = PFN_PHYS(mhp->nr_pages); + + if (!check_memory_block_pre_online(start, size, true)) + return NOTIFY_DONE; + + switch (action) { + case MEM_GOING_OFFLINE: + numa_remote_preonline_going_offline(mhp->start_pfn, mhp->nr_pages); + break; + case MEM_CANCEL_OFFLINE: + numa_remote_preonline_cancel_offline(mhp->start_pfn, mhp->nr_pages); + break; + default: + break; + } + + return NOTIFY_OK; +} + +struct notifier_block numa_remote_memory_notifier = { + .notifier_call = numa_remote_memory_notifier_cb, +}; + +static void numa_remote_preonline_pages(struct page *page, unsigned int order) +{ + unsigned long start_pfn, end_pfn, pfn, nr_pages; + struct page *p; + + start_pfn = page_to_pfn(page); + nr_pages = 1 << order; + end_pfn = start_pfn + nr_pages; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + p = pfn_to_page(pfn); + __SetPageOffline(p); + ClearPageReserved(p); + } + numa_remote_optimize_vmemmap(start_pfn, end_pfn); +} + +static void numa_remote_online_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long nr_pages = end_pfn - start_pfn; + unsigned long pfn, i; + struct page *page; + + for (i = 0; i < nr_pages; ++i) { + page = pfn_to_page(start_pfn + i); + __ClearPageOffline(page); + } + + for (pfn = start_pfn; pfn < end_pfn; pfn += (1UL << MAX_ORDER)) + generic_online_page(pfn_to_page(pfn), MAX_ORDER); +} + +/* + * Undo fake-online a remote node. Have to be called in preonline mode then + * the memory on the node can be allocated. + */ +static int __ref numa_remote_undo_fake_online(u64 start, u64 size) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_DOWN(start + size); + struct zone *zone; + int nid; + int ret = 0; + + mem_hotplug_begin(); + /* Re-check whether all memory block are pre-online. 
*/ + if (!check_memory_block_pre_online(start, size, true)) { + ret = -EINVAL; + goto out; + } + + zone = page_zone(phys_to_page(start)); + nid = zone_to_nid(zone); + if (!check_memory_block_nid(start, size, nid)) { + ret = -EINVAL; + goto out; + } + + ret = numa_remote_restore_vmemmap(start_pfn, end_pfn); + if (ret) { + pr_err_ratelimited("restore vmemmap failed\n"); + goto out; + } + + set_memory_block_pre_online(start, size, false); + numa_remote_online_pages(start_pfn, end_pfn); + + init_per_zone_wmark_min(); + writeback_set_ratelimit(); + +out: + mem_hotplug_done(); + return ret; +} + +static int __ref numa_remote_restore_isolation(u64 start, u64 size) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_DOWN(start + size); + int ret = 0; + + mem_hotplug_begin(); + + ret = numa_remote_restore_vmemmap(start_pfn, end_pfn); + if (ret) { + pr_err_ratelimited("restore vmemmap failed\n"); + goto out; + } + +out: + mem_hotplug_done(); + return ret; +} + static int find_unused_remote_node(void) { int nid; @@ -121,6 +319,7 @@ static int find_unused_remote_node(void) int add_memory_remote(int nid, u64 start, u64 size, int flags) { int real_nid = NUMA_NO_NODE; + mhp_t mhp_flags = MHP_MERGE_RESOURCE; if (!numa_remote_enabled) return NUMA_NO_NODE; @@ -131,18 +330,63 @@ int add_memory_remote(int nid, u64 start, u64 size, int flags) if (nid != NUMA_NO_NODE && !numa_is_remote_node(nid)) return NUMA_NO_NODE; + if (!numa_remote_preonline_mode && !(flags & MEMORY_DIRECT_ONLINE)) + return NUMA_NO_NODE; + + if (flags & ~(MEMORY_KEEP_ISOLATED | MEMORY_DIRECT_ONLINE)) + return NUMA_NO_NODE; + + if (flags == (MEMORY_KEEP_ISOLATED | MEMORY_DIRECT_ONLINE)) + return NUMA_NO_NODE; + + mutex_lock(&numa_remote_lock); + + if (numa_remote_preonline_mode && !flags) { + if (check_hotplug_memory_range(start, size)) + goto out; + /* Check whether all memory block are pre-online. 
*/ + if (!check_memory_block_pre_online(start, size, true)) + goto out; + + real_nid = (nid == NUMA_NO_NODE) ? + page_to_nid(phys_to_page(start)) : nid; + if (!check_memory_block_nid(start, size, real_nid)) { + real_nid = NUMA_NO_NODE; + goto out; + } + + if (numa_remote_undo_fake_online(start, size)) { + real_nid = NUMA_NO_NODE; + goto out; + } + } + lock_device_hotplug(); real_nid = (nid == NUMA_NO_NODE) ? find_unused_remote_node() : nid; if (real_nid == NUMA_NO_NODE) goto unlock; - if (__add_memory(real_nid, start, size, MHP_MERGE_RESOURCE)) + if (flags & MEMORY_KEEP_ISOLATED) { + int rc; + + rc = set_online_page_callback(&numa_remote_preonline_pages); + if (rc) { + real_nid = NUMA_NO_NODE; + goto unlock; + } + mhp_flags |= MHP_PREONLINE; + } + + if (__add_memory(real_nid, start, size, mhp_flags)) real_nid = NUMA_NO_NODE; + if (flags & MEMORY_KEEP_ISOLATED) + restore_online_page_callback(&numa_remote_preonline_pages); unlock: unlock_device_hotplug(); - +out: + mutex_unlock(&numa_remote_lock); return real_nid; } EXPORT_SYMBOL_GPL(add_memory_remote); @@ -167,6 +411,23 @@ int remove_memory_remote(int nid, u64 start, u64 size) if (!numa_is_remote_node(nid) || !node_online(nid)) return -EINVAL; + mutex_lock(&numa_remote_lock); + if (!check_memory_block_nid(start, size, nid)) + goto out; + + /* + * If all memory block are already online, do nothing here. + * If all memory block are pre-online, restore the isolation + * and count. If mixed, don't allow to offline. 
+ */ + if (numa_remote_preonline(nid) && + !check_memory_block_pre_online(start, size, false)) { + if (!check_memory_block_pre_online(start, size, true)) + goto out; + if (numa_remote_restore_isolation(start, size)) + goto out; + } + ret = offline_and_remove_memory(start, size); if (ret) goto out; @@ -175,6 +436,7 @@ int remove_memory_remote(int nid, u64 start, u64 size) numa_remote_reset_distance(nid); out: + mutex_unlock(&numa_remote_lock); return ret; } EXPORT_SYMBOL_GPL(remove_memory_remote); @@ -206,3 +468,20 @@ int numa_remote_set_distance(int target, int *node_ids, int *node_distances, return 0; } EXPORT_SYMBOL_GPL(numa_remote_set_distance); + +static int __init numa_remote_init(void) +{ + int ret; + + if (!numa_remote_preonline_mode) + return 0; + + ret = register_memory_notifier(&numa_remote_memory_notifier); + if (ret) { + numa_remote_preonline_mode = false; + pr_err("fail to enanble preonline mode\n"); + } + + return ret; +} +late_initcall(numa_remote_init); diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa41..9d7431ff1282 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -83,6 +83,9 @@ struct memory_block { #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) atomic_long_t nr_hwpoison; #endif +#ifdef CONFIG_NUMA_REMOTE + bool pre_online; +#endif }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -146,6 +149,26 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); +#ifdef CONFIG_NUMA_REMOTE +bool check_memory_block_nid(unsigned long start, unsigned long size, int nid); +bool check_memory_block_pre_online(unsigned long start, unsigned long size, + bool pre_online); +void set_memory_block_pre_online(unsigned long start, unsigned long size, + bool pre_online); +static inline bool memory_block_is_pre_online(struct memory_block *mem) 
+{ + return mem->pre_online; +} +#else +static inline void set_memory_block_pre_online(unsigned long start, unsigned long size, + bool pre_online) +{ +} +static inline bool memory_block_is_pre_online(struct memory_block *mem) +{ + return false; +} +#endif extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index feda1e6e24f6..a02e768937a9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -115,6 +115,12 @@ typedef int __bitwise mhp_t; */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) +/* + * Online memory to pre-online state, i.e., memblock device and vmemmap are + * created, but pages are keep isolated to avoid being allocated. + */ +#define MHP_PREONLINE ((__force mhp_t)BIT(3)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -176,6 +182,7 @@ extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); extern int try_online_node(int nid); +extern int check_hotplug_memory_range(u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params); diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h index 2bb79b25732f..596589f341f2 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -8,9 +8,13 @@ #include +#define MEMORY_KEEP_ISOLATED 1 +#define MEMORY_DIRECT_ONLINE 2 + #ifdef CONFIG_NUMA_REMOTE bool numa_is_remote_node(int nid); bool numa_remote_nofallback(int nid); +bool numa_remote_preonline(int nid); void numa_register_remote_nodes(void); int add_memory_remote(int nid, u64 start, u64 size, int flags); int remove_memory_remote(int nid, u64 start, u64 size); @@ -27,6 +31,11 @@ static inline bool numa_remote_nofallback(int 
nid) return false; } +static inline bool numa_remote_preonline(int nid) +{ + return false; +} + static inline void numa_register_remote_nodes(void) { } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2bde429b2ea3..c197a609b342 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -644,3 +644,40 @@ static int __init hugetlb_vmemmap_init(void) return 0; } late_initcall(hugetlb_vmemmap_init); + +/* Similar with hugetlb_vmemmap_restore. */ +int fake_online_pages_vmemmap_restore(struct page *head, unsigned long nr_pages) +{ + int ret; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + if (!HPageVmemmapOptimized(head)) + return 0; + + vmemmap_end = vmemmap_start + nr_pages * sizeof(struct page); + vmemmap_reuse = vmemmap_start; + vmemmap_start += PAGE_SIZE; + + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse); + if (!ret) + ClearHPageVmemmapOptimized(head); + + return ret; +} + +/* Similar with hugetlb_vmemmap_optimize. */ +void fake_online_pages_vmemmap_optimize(struct page *head, unsigned long nr_pages) +{ + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + vmemmap_end = vmemmap_start + nr_pages * sizeof(struct page); + vmemmap_reuse = vmemmap_start; + vmemmap_start += PAGE_SIZE; + + if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) + pr_err_ratelimited("optimize vmemmap failed\n"); + else + SetHPageVmemmapOptimized(head); +} diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 4573899855d7..2f6950410cb2 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -38,6 +38,9 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate return 0; return size > 0 ? 
size : 0; } + +int fake_online_pages_vmemmap_restore(struct page *head, unsigned long nr_pages); +void fake_online_pages_vmemmap_optimize(struct page *head, unsigned long nr_pages); #else static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { @@ -52,6 +55,15 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate { return 0; } + +static inline int fake_online_pages_vmemmap_restore(struct page *head, unsigned long nr_pages) +{ + return 0; +} + +static inline void fake_online_pages_vmemmap_optimize(struct page *head, unsigned long nr_pages) +{ +} #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 66e05b6984d2..074ec691e892 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1303,7 +1303,7 @@ int try_online_node(int nid) return ret; } -static int check_hotplug_memory_range(u64 start, u64 size) +int check_hotplug_memory_range(u64 start, u64 size) { /* memory range must be block size aligned */ if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || @@ -1496,6 +1496,9 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) PFN_UP(start + size - 1), MEMINIT_HOTPLUG); + if (mhp_flags & MHP_PREONLINE) + set_memory_block_pre_online(start, size, true); + /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) firmware_map_add_hotplug(start, start + size, "System RAM"); -- Gitee From c855b296345ef1ac9ff2a1fbb2e01bf4c5d85515 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 8 Oct 2024 17:15:25 +0800 Subject: [PATCH 08/21] mm/numa_remote: undo isolation of remote memory asynchronously hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- There is no need to wait for all remote memory to be de-isolated. 
So try to undo isolation of remote memory asynchronously and only wait if the memory below watermark. Signed-off-by: Liu Shixin Signed-off-by: Jinjiang Tu --- drivers/base/numa_remote.c | 91 ++++++++++++++++++++++++++++++++++++- include/linux/numa_remote.h | 6 +++ mm/page_alloc.c | 6 +++ 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index 344c80a124c4..307cb49a079d 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -25,7 +25,18 @@ static bool numa_remote_preonline_mode __ro_after_init; static nodemask_t numa_nodes_remote; +struct undo_fake_online_control { + u64 start; + u64 size; + struct llist_node llist; +}; + +static LLIST_HEAD(undo_fake_online_list); + +static atomic_long_t undo_fake_online_pages_node[MAX_NUMNODES]; + static DEFINE_MUTEX(numa_remote_lock); +static DECLARE_RWSEM(numa_remote_state_lock); bool numa_is_remote_node(int nid) { @@ -236,6 +247,7 @@ static int __ref numa_remote_undo_fake_online(u64 start, u64 size) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_DOWN(start + size); + unsigned long nr_pages = end_pfn - start_pfn; struct zone *zone; int nid; int ret = 0; @@ -262,6 +274,7 @@ static int __ref numa_remote_undo_fake_online(u64 start, u64 size) set_memory_block_pre_online(start, size, false); numa_remote_online_pages(start_pfn, end_pfn); + atomic_long_add(-nr_pages, &undo_fake_online_pages_node[nid]); init_per_zone_wmark_min(); writeback_set_ratelimit(); @@ -290,6 +303,61 @@ static int __ref numa_remote_restore_isolation(u64 start, u64 size) return ret; } +static void undo_fake_online_work_fn(struct work_struct *work) +{ + struct undo_fake_online_control *uic; + struct llist_node *node; + + node = llist_del_all(&undo_fake_online_list); + + while (node) { + uic = container_of(node, struct undo_fake_online_control, llist); + node = node->next; + + mutex_lock(&numa_remote_lock); + numa_remote_undo_fake_online(uic->start, 
uic->size); + mutex_unlock(&numa_remote_lock); + kfree(uic); + } +} + +static DECLARE_WORK(undo_fake_online_work, undo_fake_online_work_fn); + +static void numa_remote_wait_undo_fake_online(void) +{ + flush_work(&undo_fake_online_work); +} + +bool numa_remote_try_wait_undo_fake_online(int nid) +{ + int ret = false; + + if (!numa_remote_preonline(nid)) + return ret; + + if (!atomic_long_read(&undo_fake_online_pages_node[nid])) + return ret; + + /* + * Avoid circular locking lockdep warnings. Preonline and + * offline require numa_remote_lock and vma lock. undo_fake_online_work + * requires numa_remote_lock. handle_mm_fault() may flush undo_fake_online_work + * with vma lock held. This forms circular locking dependency. However, + * numa_remote_state_lock guarantees when preonline or offline is doing, + * handle_mm_fault() won't flush undo_fake_online_work. False positive. + */ + lockdep_off(); + if (!down_read_trylock(&numa_remote_state_lock)) + goto out; + + numa_remote_wait_undo_fake_online(); + up_read(&numa_remote_state_lock); + ret = true; +out: + lockdep_on(); + return ret; +} + static int find_unused_remote_node(void) { int nid; @@ -339,9 +407,15 @@ int add_memory_remote(int nid, u64 start, u64 size, int flags) if (flags == (MEMORY_KEEP_ISOLATED | MEMORY_DIRECT_ONLINE)) return NUMA_NO_NODE; + if (flags & (MEMORY_KEEP_ISOLATED | MEMORY_DIRECT_ONLINE)) { + numa_remote_wait_undo_fake_online(); + down_write(&numa_remote_state_lock); + } mutex_lock(&numa_remote_lock); if (numa_remote_preonline_mode && !flags) { + struct undo_fake_online_control *uic; + if (check_hotplug_memory_range(start, size)) goto out; /* Check whether all memory block are pre-online. 
*/ @@ -355,10 +429,19 @@ int add_memory_remote(int nid, u64 start, u64 size, int flags) goto out; } - if (numa_remote_undo_fake_online(start, size)) { + uic = kzalloc(sizeof(struct undo_fake_online_control), + GFP_KERNEL); + if (!uic) { real_nid = NUMA_NO_NODE; goto out; } + + atomic_long_add(size / PAGE_SIZE, &undo_fake_online_pages_node[real_nid]); + uic->start = start; + uic->size = size; + if (llist_add(&uic->llist, &undo_fake_online_list)) + schedule_work(&undo_fake_online_work); + goto out; } lock_device_hotplug(); @@ -387,6 +470,8 @@ int add_memory_remote(int nid, u64 start, u64 size, int flags) unlock_device_hotplug(); out: mutex_unlock(&numa_remote_lock); + if (flags & (MEMORY_KEEP_ISOLATED | MEMORY_DIRECT_ONLINE)) + up_write(&numa_remote_state_lock); return real_nid; } EXPORT_SYMBOL_GPL(add_memory_remote); @@ -411,6 +496,9 @@ int remove_memory_remote(int nid, u64 start, u64 size) if (!numa_is_remote_node(nid) || !node_online(nid)) return -EINVAL; + numa_remote_wait_undo_fake_online(); + + down_write(&numa_remote_state_lock); mutex_lock(&numa_remote_lock); if (!check_memory_block_nid(start, size, nid)) goto out; @@ -437,6 +525,7 @@ int remove_memory_remote(int nid, u64 start, u64 size) out: mutex_unlock(&numa_remote_lock); + up_write(&numa_remote_state_lock); return ret; } EXPORT_SYMBOL_GPL(remove_memory_remote); diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h index 596589f341f2..9691efb7a59d 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -16,6 +16,7 @@ bool numa_is_remote_node(int nid); bool numa_remote_nofallback(int nid); bool numa_remote_preonline(int nid); void numa_register_remote_nodes(void); +bool numa_remote_try_wait_undo_fake_online(int nid); int add_memory_remote(int nid, u64 start, u64 size, int flags); int remove_memory_remote(int nid, u64 start, u64 size); int numa_remote_set_distance(int target, int *node_ids, int *node_distances, @@ -40,6 +41,11 @@ static inline void 
numa_register_remote_nodes(void) { } +static inline bool numa_remote_try_wait_undo_fake_online(int nid) +{ + return false; +} + static inline int add_memory_remote(int nid, u64 start, u64 size, int flags) { return NUMA_NO_NODE; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6c4e6c88404..afbcbc8adeb2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3342,6 +3342,12 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; + if (numa_remote_try_wait_undo_fake_online(zone_to_nid(zone))) { + if (zone_watermark_ok(zone, order, mark, + ac->highest_zoneidx, alloc_flags)) + goto try_this_zone; + } + if (!node_reclaim_enabled() || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; -- Gitee From 3a1587ef612ce9ccc0e13c61fa5ea26a9736f792 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Mon, 3 Mar 2025 15:17:58 +0800 Subject: [PATCH 09/21] mm/numa_remote: introduce hugetlb_nowatermark mode for remote node hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- In hugetlb_nowatermark mode, alloc_pool_huge_page() for remote node will ignore the watermark limit. 
Signed-off-by: Jinjiang Tu --- drivers/base/numa_remote.c | 10 ++++++++++ include/linux/mmzone.h | 4 ++++ include/linux/numa_remote.h | 6 ++++++ mm/hugetlb.c | 32 +++++++++++++++++++++++++++++++- 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index 307cb49a079d..903aae5ee6fd 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -22,6 +22,7 @@ static bool numa_remote_enabled __ro_after_init; static bool numa_remote_nofallback_mode __ro_after_init; static bool numa_remote_preonline_mode __ro_after_init; +static bool numa_remote_hugetlb_nowatermark_mode __ro_after_init; static nodemask_t numa_nodes_remote; @@ -54,6 +55,11 @@ bool numa_remote_preonline(int nid) return numa_remote_preonline_mode && numa_is_remote_node(nid); } +bool numa_remote_hugetlb_nowatermark(int nid) +{ + return numa_remote_hugetlb_nowatermark_mode && numa_is_remote_node(nid); +} + static void numa_remote_reset_distance(int nid) { int i; @@ -109,6 +115,10 @@ static int __init numa_parse_remote_nodes(char *buf) numa_remote_nofallback_mode = true; else if (!strncmp(buf, "preonline", 9)) numa_remote_preonline_mode = true; +#ifdef CONFIG_HUGETLB_PAGE + else if (!strncmp(buf, "hugetlb_nowatermark", 19)) + numa_remote_hugetlb_nowatermark_mode = true; +#endif buf += strcspn(buf, ","); while (*buf == ',') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 453872559e22..b1cded240049 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1545,6 +1545,8 @@ static inline bool zone_is_zone_extmem(struct zone *zone) { return zone_idx(zone) == ZONE_EXTMEM; } + +#define get_extmem_zone(nid) (&NODE_DATA((nid))->node_zones[ZONE_EXTMEM]) #else static inline bool is_zone_extmem_page(const struct page *page) { @@ -1555,6 +1557,8 @@ static inline bool zone_is_zone_extmem(struct zone *zone) { return false; } + +#define get_extmem_zone(nid) NULL #endif /* diff --git a/include/linux/numa_remote.h 
b/include/linux/numa_remote.h index 9691efb7a59d..6c08ff22b4fb 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -15,6 +15,7 @@ bool numa_is_remote_node(int nid); bool numa_remote_nofallback(int nid); bool numa_remote_preonline(int nid); +bool numa_remote_hugetlb_nowatermark(int nid); void numa_register_remote_nodes(void); bool numa_remote_try_wait_undo_fake_online(int nid); int add_memory_remote(int nid, u64 start, u64 size, int flags); @@ -37,6 +38,11 @@ static inline bool numa_remote_preonline(int nid) return false; } +static inline bool numa_remote_hugetlb_nowatermark(int nid) +{ + return false; +} + static inline void numa_register_remote_nodes(void) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bad0dcf0faeb..9ba075d424f9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "internal.h" #include "hugetlb_vmemmap.h" #include @@ -2304,7 +2305,13 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + gfp_t gfp = 0; + + /* Use __GFP_MEMALLOC to make sure all pages can be allocated */ + if (numa_remote_hugetlb_nowatermark(node)) + gfp |= __GFP_MEMALLOC; + + folio = alloc_fresh_hugetlb_folio(h, gfp_mask | gfp, node, nodes_allowed, node_alloc_noretry); if (folio) { free_huge_folio(folio); /* free it into the hugepage allocator */ @@ -3723,6 +3730,23 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, return 1; } +#ifdef CONFIG_ZONE_EXTMEM +static void hugetlb_drain_remote_pcp(struct hstate *h, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + + zone = &pgdat->node_zones[ZONE_EXTMEM]; + + if (zone_managed_pages(zone)) + drain_all_pages(zone); +} +#else +static inline void hugetlb_drain_remote_pcp(struct hstate *h, int nid) +{ +} +#endif 
+ #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) @@ -3732,6 +3756,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, struct page *page; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); + bool drained = false; /* * Bit mask controlling how hard we retry per-node allocations. @@ -3817,6 +3842,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); + if (numa_remote_hugetlb_nowatermark(nid) && !drained && (nid != NUMA_NO_NODE)) { + hugetlb_drain_remote_pcp(h, nid); + drained = true; + } + ret = alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry); spin_lock_irq(&hugetlb_lock); -- Gitee From d9ffa39c27369db524b026a794a1abcd9822455e Mon Sep 17 00:00:00 2001 From: Zou Jingwei Date: Thu, 14 Nov 2024 09:54:18 +0800 Subject: [PATCH 10/21] mm/numa_remote: add sysfs to distinguish whether a remote node hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Add new sysfs interface /sys/devices/system/node/nodeX/remote to distinguish a node between local and remote. 
Signed-off-by: Zou Jingwei Signed-off-by: Liu Shixin --- drivers/base/node.c | 3 +++ drivers/base/numa_remote.c | 19 +++++++++++++++++++ include/linux/numa_remote.h | 11 +++++++++++ 3 files changed, 33 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 4d588f4658c8..0ccaf26e39b2 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct bus_type node_subsys = { .name = "node", @@ -625,6 +626,7 @@ static int register_node(struct node *node, int num) } else { hugetlb_register_node(node); compaction_register_node(node); + numa_remote_register_node(node); } return error; @@ -641,6 +643,7 @@ void unregister_node(struct node *node) { hugetlb_unregister_node(node); compaction_unregister_node(node); + numa_remote_unregister_node(node); node_remove_accesses(node); node_remove_caches(node); device_unregister(&node->dev); diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index 903aae5ee6fd..09641c63ab70 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -568,6 +568,25 @@ int numa_remote_set_distance(int target, int *node_ids, int *node_distances, } EXPORT_SYMBOL_GPL(numa_remote_set_distance); +static ssize_t remote_show(struct device *dev, + struct device_attribute *dev_attr, char *buf) +{ + return sprintf(buf, "%d\n", numa_is_remote_node(dev->id)); +} +static DEVICE_ATTR_RO(remote); + +void numa_remote_register_node(struct node *node) +{ + if (numa_remote_enabled) + device_create_file(&node->dev, &dev_attr_remote); +} + +void numa_remote_unregister_node(struct node *node) +{ + if (numa_remote_enabled) + device_remove_file(&node->dev, &dev_attr_remote); +} + static int __init numa_remote_init(void) { int ret; diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h index 6c08ff22b4fb..53e780f0240b 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -7,6 +7,7 @@ #define _LINUX_REMOTE_MEMORY_H_ #include 
+#include #define MEMORY_KEEP_ISOLATED 1 #define MEMORY_DIRECT_ONLINE 2 @@ -22,6 +23,8 @@ int add_memory_remote(int nid, u64 start, u64 size, int flags); int remove_memory_remote(int nid, u64 start, u64 size); int numa_remote_set_distance(int target, int *node_ids, int *node_distances, int count); +void numa_remote_register_node(struct node *node); +void numa_remote_unregister_node(struct node *node); #else static inline bool numa_is_remote_node(int nid) { @@ -67,5 +70,13 @@ static inline int numa_remote_set_distance(int target, int *node_ids, { return -EINVAL; } + +static inline void numa_remote_register_node(struct node *node) +{ +} + +static inline void numa_remote_unregister_node(struct node *node) +{ +} #endif #endif /* _LINUX_REMOTE_MEMORY_H_ */ -- Gitee From f6bfb20a2fe0580913e1d73324680b254fb3466f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 14 Nov 2024 09:54:18 +0800 Subject: [PATCH 11/21] mm/numa_remote: add pre-online count in meminfo hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- In pre-online mode, the pre-online remote memory is actually inaccessible. We should strip it out of total memory count and display it separately, since such memory can't be used yet. 
Signed-off-by: Liu Shixin --- drivers/base/node.c | 1 + drivers/base/numa_remote.c | 47 +++++++++++++++++++++++++++++++++++++ fs/proc/meminfo.c | 3 +++ include/linux/numa_remote.h | 11 +++++++++ 4 files changed, 62 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 0ccaf26e39b2..75f78552da5a 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -487,6 +487,7 @@ static ssize_t node_read_meminfo(struct device *dev, #endif ); len += hugetlb_report_node_meminfo(buf, len, nid); + len += numa_remote_report_node_meminfo(buf, len, nid); return len; } diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index 09641c63ab70..8f6cd541c067 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -35,6 +35,8 @@ struct undo_fake_online_control { static LLIST_HEAD(undo_fake_online_list); static atomic_long_t undo_fake_online_pages_node[MAX_NUMNODES]; +static atomic_long_t pre_online_pages_node[MAX_NUMNODES]; +static atomic_long_t pre_online_pages; static DEFINE_MUTEX(numa_remote_lock); static DECLARE_RWSEM(numa_remote_state_lock); @@ -221,6 +223,7 @@ struct notifier_block numa_remote_memory_notifier = { static void numa_remote_preonline_pages(struct page *page, unsigned int order) { unsigned long start_pfn, end_pfn, pfn, nr_pages; + int nid = page_to_nid(page); struct page *p; start_pfn = page_to_pfn(page); @@ -232,6 +235,8 @@ static void numa_remote_preonline_pages(struct page *page, unsigned int order) ClearPageReserved(p); } numa_remote_optimize_vmemmap(start_pfn, end_pfn); + atomic_long_add(nr_pages, &pre_online_pages_node[nid]); + atomic_long_add(nr_pages, &pre_online_pages); } static void numa_remote_online_pages(unsigned long start_pfn, unsigned long end_pfn) @@ -283,6 +288,8 @@ static int __ref numa_remote_undo_fake_online(u64 start, u64 size) } set_memory_block_pre_online(start, size, false); + atomic_long_add(-nr_pages, &pre_online_pages_node[nid]); + atomic_long_add(-nr_pages, &pre_online_pages); 
numa_remote_online_pages(start_pfn, end_pfn); atomic_long_add(-nr_pages, &undo_fake_online_pages_node[nid]); @@ -298,6 +305,9 @@ static int __ref numa_remote_restore_isolation(u64 start, u64 size) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_DOWN(start + size); + unsigned long nr_pages = end_pfn - start_pfn; + struct zone *zone = page_zone(phys_to_page(start)); + int nid = zone_to_nid(zone); int ret = 0; mem_hotplug_begin(); @@ -308,6 +318,8 @@ static int __ref numa_remote_restore_isolation(u64 start, u64 size) goto out; } + atomic_long_add(-nr_pages, &pre_online_pages_node[nid]); + atomic_long_add(-nr_pages, &pre_online_pages); out: mem_hotplug_done(); return ret; @@ -587,6 +599,41 @@ void numa_remote_unregister_node(struct node *node) device_remove_file(&node->dev, &dev_attr_remote); } +void numa_remote_report_meminfo(struct seq_file *m) +{ + pg_data_t *pgdat; + struct zone *zone; + unsigned long total_pages = 0; + unsigned long free_pages = 0; + + if (!numa_remote_enabled) + return; + + for_each_online_pgdat(pgdat) { + zone = &pgdat->node_zones[ZONE_EXTMEM]; + if (populated_zone(zone)) { + total_pages += zone_managed_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + } + + seq_printf(m, "RemoteMemTotal: %8lu kB\n" + "RemoteMemFree: %8lu kB\n" + "RemoteMemPreonline: %4lu kB\n", + K(total_pages), K(free_pages), + K(atomic_long_read(&pre_online_pages))); +} + +int numa_remote_report_node_meminfo(char *buf, int len, int nid) +{ + if (!numa_remote_enabled) + return 0; + + return sysfs_emit_at(buf, len, + "Node %d RemoteMemPreonline: %4lu kB\n", + nid, K(atomic_long_read(&pre_online_pages_node[nid]))); +} + static int __init numa_remote_init(void) { int ret; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 57a431c1130b..91560c0bcd4d 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -19,6 +19,7 @@ #endif #include #include +#include #include #include "internal.h" @@ -173,6 +174,8 @@ static int 
meminfo_proc_show(struct seq_file *m, void *v) dynamic_pool_show_meminfo(m); + numa_remote_report_meminfo(m); + return 0; } diff --git a/include/linux/numa_remote.h b/include/linux/numa_remote.h index 53e780f0240b..6f8a294a46a3 100644 --- a/include/linux/numa_remote.h +++ b/include/linux/numa_remote.h @@ -25,6 +25,8 @@ int numa_remote_set_distance(int target, int *node_ids, int *node_distances, int count); void numa_remote_register_node(struct node *node); void numa_remote_unregister_node(struct node *node); +void numa_remote_report_meminfo(struct seq_file *m); +int numa_remote_report_node_meminfo(char *buf, int len, int nid); #else static inline bool numa_is_remote_node(int nid) { @@ -78,5 +80,14 @@ static inline void numa_remote_register_node(struct node *node) static inline void numa_remote_unregister_node(struct node *node) { } + +static inline void numa_remote_report_meminfo(struct seq_file *m) +{ +} + +static inline int numa_remote_report_node_meminfo(char *buf, int len, int nid) +{ + return 0; +} #endif #endif /* _LINUX_REMOTE_MEMORY_H_ */ -- Gitee From 25ce40714c72a58231726a00bd7f87d46b357193 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Wed, 7 May 2025 10:13:19 +0800 Subject: [PATCH 12/21] mm/numa_rmeote: add preonline interface for each memory device hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Add preonline interface for each memory device to indicate if this memory device is preonline or not. 
Signed-off-by: Jinjiang Tu --- drivers/base/memory.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e725251e4e74..1582cbe8ee6d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -478,6 +478,17 @@ static ssize_t valid_zones_show(struct device *dev, static DEVICE_ATTR_RO(valid_zones); #endif +#ifdef CONFIG_NUMA_REMOTE +static ssize_t preonline_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct memory_block *mem = to_memory_block(dev); + + return sysfs_emit_at(buf, 0, "%d\n", mem->pre_online); +} +static DEVICE_ATTR_RO(preonline); +#endif + static DEVICE_ATTR_RO(phys_index); static DEVICE_ATTR_RW(state); static DEVICE_ATTR_RO(phys_device); @@ -653,6 +664,9 @@ static struct attribute *memory_memblk_attrs[] = { &dev_attr_removable.attr, #ifdef CONFIG_MEMORY_HOTREMOVE &dev_attr_valid_zones.attr, +#endif +#ifdef CONFIG_NUMA_REMOTE + &dev_attr_preonline.attr, #endif NULL }; -- Gitee From 527a665ecf74fc4ad92f6fdea8da37b2e7e5e0e9 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 4 Sep 2025 21:19:34 +0800 Subject: [PATCH 13/21] mm/oom_kill: kill current in OOM when binding to cpu-less nodes hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- out_of_memory() selects tasks without considering mempolicy. Assuming a cpu-less NUMA Node, ordinary process that don't set mempolicy don't allocate memory from this cpu-less Node, unless other NUMA Nodes are below low watermark. If a task binds to this cpu-less Node and triggers OOM, many tasks may be killed wrongly that don't occupy memory from this Node. To fix it, only kill current if oc->nodemask are all nodes without any cpu. Add a new sysctl knob oom_kill_cpuless_numa_allocating_task to enable it. 
Signed-off-by: Jinjiang Tu --- mm/oom_kill.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f1d458a5abdf..8aa470f68f25 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -57,6 +57,7 @@ static int sysctl_panic_on_oom; static int sysctl_oom_kill_allocating_task; +static int sysctl_oom_kill_cpuless_numa_allocating_task; static int sysctl_oom_dump_tasks = 1; static int sysctl_enable_oom_killer = 1; @@ -778,6 +779,13 @@ static struct ctl_table vm_oom_kill_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "oom_kill_cpuless_numa_allocating_task", + .data = &sysctl_oom_kill_cpuless_numa_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_cpuless_numa_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "oom_dump_tasks", .data = &sysctl_oom_dump_tasks, @@ -1207,6 +1215,23 @@ int oom_type_notifier_call(unsigned int type, struct oom_control *oc) EXPORT_SYMBOL_GPL(oom_type_notifier_call); #endif +static bool should_oom_kill_allocating_task(struct oom_control *oc) +{ + if (sysctl_oom_kill_allocating_task) + return true; + + if (!oc->nodemask) + return false; + + if (!sysctl_oom_kill_cpuless_numa_allocating_task) + return false; + + if (nodes_intersects(*oc->nodemask, node_states[N_CPU])) + return false; + + return true; +} + /** * out_of_memory - kill the "best" process when we run out of memory * @oc: pointer to struct oom_control @@ -1263,7 +1288,7 @@ bool out_of_memory(struct oom_control *oc) oc->nodemask = NULL; check_panic_on_oom(oc); - if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && + if (!is_memcg_oom(oc) && should_oom_kill_allocating_task(oc) && current->mm && !oom_unkillable_task(current) && oom_cpuset_eligible(current, oc) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { -- Gitee From 479dbd03bbf0bd68971d3cdc5078f0e0606b0f40 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Mon, 8 Sep 2025 19:53:40 +0800 
Subject: [PATCH 14/21] mm/numa_remote: enable oom_kill_cpuless_numa_allocating_task when numa_remote is enabled hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Enable oom_kill_cpuless_numa_allocating_task by default when numa_remote is enabled. Signed-off-by: Jinjiang Tu --- drivers/base/numa_remote.c | 5 +++++ include/linux/oom.h | 1 + mm/oom_kill.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index 8f6cd541c067..c85f470d7d9a 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -638,6 +638,11 @@ static int __init numa_remote_init(void) { int ret; + if (!numa_remote_enabled) + return 0; + + sysctl_oom_kill_cpuless_numa_allocating_task = 1; + if (!numa_remote_preonline_mode) return 0; diff --git a/include/linux/oom.h b/include/linux/oom.h index b9210e272651..d2f086658d26 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -56,6 +56,7 @@ struct oom_control { extern struct mutex oom_lock; extern struct mutex oom_adj_mutex; +extern int sysctl_oom_kill_cpuless_numa_allocating_task; static inline void set_current_oom_origin(void) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8aa470f68f25..ca005ec51c68 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -57,7 +57,7 @@ static int sysctl_panic_on_oom; static int sysctl_oom_kill_allocating_task; -static int sysctl_oom_kill_cpuless_numa_allocating_task; +int sysctl_oom_kill_cpuless_numa_allocating_task; static int sysctl_oom_dump_tasks = 1; static int sysctl_enable_oom_killer = 1; -- Gitee From e396c79ca241d93e9d926231b4dcb54bf90b00a2 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Sun, 7 Apr 2024 17:43:10 +0800 Subject: [PATCH 15/21] arm64: configs: enable NUMA_REMOTE by default hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Set CONFIG_NUMA_REMOTE=y by 
default. Signed-off-by: Liu Shixin --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index bb3c44921c0f..63625b54e541 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1162,6 +1162,7 @@ CONFIG_MEMORY_ISOLATION=y CONFIG_EXCLUSIVE_SYSTEM_RAM=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_NUMA_REMOTE=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y CONFIG_MEMORY_HOTREMOVE=y -- Gitee From 58ae3aa9306f96f038957d29eabc1dd1118415fc Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 12 Mar 2025 19:28:51 +0800 Subject: [PATCH 16/21] mm/hwpoison: do not send SIGBUS to processes with recovered clean pages stable inclusion from stable-v6.6.88 commit 94b3a19cedb371c69c06eb9b4299d618eb0f7e02 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=94b3a19cedb371c69c06eb9b4299d618eb0f7e02 -------------------------------- commit aaf99ac2ceb7c974f758a635723eeaf48596388e upstream. When an uncorrected memory error is consumed there is a race between the CMCI from the memory controller reporting an uncorrected error with a UCNA signature, and the core reporting and SRAR signature machine check when the data is about to be consumed. - Background: why *UN*corrected errors tied to *C*MCI in Intel platform [1] Prior to Icelake memory controllers reported patrol scrub events that detected a previously unseen uncorrected error in memory by signaling a broadcast machine check with an SRAO (Software Recoverable Action Optional) signature in the machine check bank. This was overkill because it's not an urgent problem that no core is on the verge of consuming that bad data. 
It's also found that multi SRAO UCE may cause nested MCE interrupts and finally become an IERR. Hence, Intel downgrades the machine check bank signature of patrol scrub from SRAO to UCNA (Uncorrected, No Action required), and signal changed to uc_decode_notifier()) to try to offline the page despite the UC*NA* signature name. - Background: why #CMCI and #MCE race when poison is consuming in Intel platform [1] Having decided that CMCI/UCNA is the best action for patrol scrub errors, the memory controller uses it for reads too. But the memory controller is executing asynchronously from the core, and can't tell the difference between a "real" read and a speculative read. So it will do CMCI/UCNA if an error is found in any read. Thus: 1) Core is clever and thinks address A is needed soon, issues a speculative read. 2) Core finds it is going to use address A soon after sending the read request 3) The CMCI from the memory controller is in a race with MCE from the core that will soon try to retire the load from address A. Quite often (because speculation has got better) the CMCI from the memory controller is delivered before the core is committed to the instruction reading address A, so the interrupt is taken, and Linux offlines the page (marking it as poison). - Why user process is killed for instr case Commit 046545a661af ("mm/hwpoison: fix error page recovered but reported "not recovered"") tries to fix noise message "Memory error not recovered" and skips duplicate SIGBUSs due to the race. But it also introduced a bug that kill_accessing_process() return -EHWPOISON for instr case, as result, kill_me_maybe() send a SIGBUS to user process. If the CMCI wins that race, the page is marked poisoned when uc_decode_notifier() calls memory_failure(). For dirty pages, memory_failure() invokes try_to_unmap() with the TTU_HWPOISON flag, converting the PTE to a hwpoison entry. 
As a result, kill_accessing_process(): - call walk_page_range() and return 1 regardless of whether try_to_unmap() succeeds or fails, - call kill_proc() to make sure a SIGBUS is sent - return -EHWPOISON to indicate that SIGBUS is already sent to the process and kill_me_maybe() doesn't have to send it again. However, for clean pages, the TTU_HWPOISON flag is cleared, leaving the PTE unchanged and not converted to a hwpoison entry. Conversely, for clean pages where PTE entries are not marked as hwpoison, kill_accessing_process() returns -EFAULT, causing kill_me_maybe() to send a SIGBUS. Console log looks like this: Memory failure: 0x827ca68: corrupted page was clean: dropped without side effects Memory failure: 0x827ca68: recovery action for clean LRU page: Recovered Memory failure: 0x827ca68: already hardware poisoned mce: Memory error not recovered To fix it, return 0 for "corrupted page was clean", preventing an unnecessary SIGBUS to user process. [1] https://lore.kernel.org/lkml/20250217063335.22257-1-xueshuai@linux.alibaba.com/T/#mba94f1305b3009dd340ce4114d3221fe810d1871 Link: https://lkml.kernel.org/r/20250312112852.82415-3-xueshuai@linux.alibaba.com Fixes: 046545a661af ("mm/hwpoison: fix error page recovered but reported "not recovered"") Signed-off-by: Shuai Xue Tested-by: Tony Luck Acked-by: Miaohe Lin Cc: Baolin Wang Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jane Chu Cc: Jarkko Sakkinen Cc: Jonathan Cameron Cc: Josh Poimboeuf Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Ruidong Tian Cc: Thomas Gleinxer Cc: Yazen Ghannam Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Hai Conflicts: mm/memory-failure.c [Context conflicts.] 
Signed-off-by: Jinjiang Tu --- mm/memory-failure.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c683704e6878..5406eb81b1af 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -863,12 +863,17 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, mmap_read_lock(p->mm); ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops, (void *)&priv); + /* + * ret = 1 when CMCI wins, regardless of whether try_to_unmap() + * succeeds or fails, then kill the process with SIGBUS. + * ret = 0 when poison page is a clean page and it's dropped, no + * SIGBUS is needed. + */ if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); - else - ret = 0; mmap_read_unlock(p->mm); - return ret > 0 ? -EHWPOISON : -EFAULT; + + return ret > 0 ? -EHWPOISON : 0; } /* -- Gitee From 9f34d6227c7cba1e4f6920d30d3dff0391f622b2 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Fri, 15 Aug 2025 15:32:09 +0800 Subject: [PATCH 17/21] mm/memory-failure: fix infinite UCE for VM_PFNMAP pfn mainline inclusion from mainline-v6.17-rc3 commit 2e6053fea379806269c4f7f5e36b523c9c0fb35c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e6053fea379806269c4f7f5e36b523c9c0fb35c -------------------------------- When memory_failure() is called for a already hwpoisoned pfn, kill_accessing_process() will be called to kill current task. However, if the vma of the accessing vaddr is VM_PFNMAP, walk_page_range() will skip the vma in walk_page_test() and return 0. Before commit aaf99ac2ceb7 ("mm/hwpoison: do not send SIGBUS to processes with recovered clean pages"), kill_accessing_process() will return EFAULT. For x86, the current task will be killed in kill_me_maybe(). 
However, after this commit, kill_accessing_process() simplies return 0, that means UCE is handled properly, but it doesn't actually. In such case, the user task will trigger UCE infinitely. To fix it, add .test_walk callback for hwpoison_walk_ops to scan all vmas. Link: https://lkml.kernel.org/r/20250815073209.1984582-1-tujinjiang@huawei.com Fixes: aaf99ac2ceb7 ("mm/hwpoison: do not send SIGBUS to processes with recovered clean pages") Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand Acked-by: Miaohe Lin Reviewed-by: Jane Chu Cc: Kefeng Wang Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Shuai Xue Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5406eb81b1af..d1bdd1483ca0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -829,9 +829,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, #define hwpoison_hugetlb_range NULL #endif +static int hwpoison_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + /* We also want to consider pages mapped into VM_PFNMAP. */ + return 0; +} + static const struct mm_walk_ops hwpoison_walk_ops = { .pmd_entry = hwpoison_pte_range, .hugetlb_entry = hwpoison_hugetlb_range, + .test_walk = hwpoison_test_walk, .walk_lock = PGWALK_RDLOCK, }; -- Gitee From b0f072c6c81741d9b1743e927aa0d9ab26a16f0a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 19 Aug 2025 19:42:25 +0800 Subject: [PATCH 18/21] mm: memory-failure: remove task_struct from kill_accessing_process() hulk inclusion category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 ---------------------------------------- The argument task_struct isn't used in kill_accessing_process(), remove it. 
Signed-off-by: Kefeng Wang Signed-off-by: Jinjiang Tu --- mm/memory-failure.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d1bdd1483ca0..efc8b1d5c2c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -856,10 +856,10 @@ static const struct mm_walk_ops hwpoison_walk_ops = { * is proper in most cases, but it could be wrong when the application * process has multiple entries mapping the error page. */ -static int kill_accessing_process(struct task_struct *p, unsigned long pfn, - int flags) +static int kill_accessing_process(unsigned long pfn, int flags) { int ret; + struct task_struct *p = current; struct hwpoison_walk priv = { .pfn = pfn, }; @@ -2094,7 +2094,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); - res = kill_accessing_process(current, folio_pfn(folio), flags); + res = kill_accessing_process(folio_pfn(folio), flags); action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } return res; @@ -2290,7 +2290,7 @@ int memory_failure(unsigned long pfn, int flags) pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) - res = kill_accessing_process(current, pfn, flags); + res = kill_accessing_process(pfn, flags); if (flags & MF_COUNT_INCREASED) put_page(p); action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); -- Gitee From a81f1aa5c8ea118127614142b13a67095d947a89 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 19 Aug 2025 20:10:56 +0800 Subject: [PATCH 19/21] mm: memory-failure: use kill_accessing_process() in ghes hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 ---------------------------------------- Use kill_accessing_process() instead of force_sig() to pass addr to userspace. 
Signed-off-by: Kefeng Wang Signed-off-by: Jinjiang Tu --- drivers/acpi/apei/ghes.c | 3 +-- include/linux/mm.h | 1 + mm/memory-failure.c | 17 ++++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 8c12d7d98ef4..a79fa61b3518 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -489,8 +489,7 @@ static void memory_failure_cb(struct callback_head *twork) if (!ret || ret == -EHWPOISON || ret == -EOPNOTSUPP) return; - pr_err("Sending SIGBUS to current task due to memory error not recovered"); - force_sig(SIGBUS); + kill_accessing_process(twcb->pfn, twcb->flags, true); } static bool ghes_do_memory_failure(u64 physical_addr, int flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f36bf9ee02f..1dfc9ecc195e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3982,6 +3982,7 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); extern int soft_online_page(unsigned long pfn); +int kill_accessing_process(unsigned long pfn, int flags, bool force_kill); #ifdef CONFIG_MEMORY_FAILURE /* * Sysfs entries for memory failure handling statistics. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index efc8b1d5c2c0..1e7aef98afae 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -856,9 +856,9 @@ static const struct mm_walk_ops hwpoison_walk_ops = { * is proper in most cases, but it could be wrong when the application * process has multiple entries mapping the error page. */ -static int kill_accessing_process(unsigned long pfn, int flags) +int kill_accessing_process(unsigned long pfn, int flags, bool force_kill) { - int ret; + int ret, ret_kill = -EINVAL; struct task_struct *p = current; struct hwpoison_walk priv = { .pfn = pfn, @@ -878,7 +878,14 @@ static int kill_accessing_process(unsigned long pfn, int flags) * SIGBUS is needed. 
*/ if (ret == 1 && priv.tk.addr) - kill_proc(&priv.tk, pfn, flags); + ret_kill = kill_proc(&priv.tk, pfn, flags); + + if (force_kill && (ret_kill < 0)) { + pr_err("%#lx: Sending force SIGBUS to %s:%d due to hardware memory corruption\n", + pfn, p->comm, task_pid_nr(p)); + force_sig(SIGBUS); + } + mmap_read_unlock(p->mm); return ret > 0 ? -EHWPOISON : 0; @@ -2094,7 +2101,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); - res = kill_accessing_process(folio_pfn(folio), flags); + res = kill_accessing_process(folio_pfn(folio), flags, false); action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } return res; @@ -2290,7 +2297,7 @@ int memory_failure(unsigned long pfn, int flags) pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) - res = kill_accessing_process(pfn, flags); + res = kill_accessing_process(pfn, flags, false); if (flags & MF_COUNT_INCREASED) put_page(p); action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); -- Gitee From 13aacc18f999eb2c6d03f32ad9ee6eaa2630ba8f Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Fri, 12 Sep 2025 16:39:55 +0800 Subject: [PATCH 20/21] mm/numa_remote: extend cmdline numa_remote to limit the max number of remote node hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- extend cmdline numa_remote to limit the max number of remote node. Reducing the max number of remote node will improve performance and reduce memory consumption in some cases. Memory offline calls build_all_zonelists to rebuild zonelists when the zone isn't populated. build_all_zonelists traverses all possible NUMA nodes, so reducing the max number of remote node will improve the memory offline performance.
When creating memcg, struct mem_cgroup_per_node will be allocated for all possible NUMA nodes, so reduing the max number of remote node will reduce memory consumption. Signed-off-by: Jinjiang Tu --- drivers/base/numa_remote.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/base/numa_remote.c b/drivers/base/numa_remote.c index c85f470d7d9a..6228dfc632d1 100644 --- a/drivers/base/numa_remote.c +++ b/drivers/base/numa_remote.c @@ -23,6 +23,7 @@ static bool numa_remote_enabled __ro_after_init; static bool numa_remote_nofallback_mode __ro_after_init; static bool numa_remote_preonline_mode __ro_after_init; static bool numa_remote_hugetlb_nowatermark_mode __ro_after_init; +static int numa_remote_max_nodes __ro_after_init = MAX_NUMNODES; static nodemask_t numa_nodes_remote; @@ -81,14 +82,18 @@ static void numa_remote_reset_distance(int nid) void __init numa_register_remote_nodes(void) { - int i; + int i, count = 0; if (!numa_remote_enabled) return; for (i = 0; i < MAX_NUMNODES; i++) { - if (!node_test_and_set(i, numa_nodes_parsed)) + if (!node_test_and_set(i, numa_nodes_parsed)) { node_set(i, numa_nodes_remote); + count++; + if (count >= numa_remote_max_nodes) + break; + } } for (i = 0; i < MAX_NUMNODES; i++) { @@ -107,24 +112,33 @@ void __init numa_register_remote_nodes(void) */ static int __init numa_parse_remote_nodes(char *buf) { + char *sep; + int val; + numa_remote_enabled = true; if (!buf) return 0; while (*buf) { - if (!strncmp(buf, "nofallback", 10)) + sep = strchr(buf, ','); + if (sep) + *sep = 0; + if (!strcmp(buf, "nofallback")) numa_remote_nofallback_mode = true; - else if (!strncmp(buf, "preonline", 9)) + else if (!strcmp(buf, "preonline")) numa_remote_preonline_mode = true; #ifdef CONFIG_HUGETLB_PAGE - else if (!strncmp(buf, "hugetlb_nowatermark", 19)) + else if (!strcmp(buf, "hugetlb_nowatermark")) numa_remote_hugetlb_nowatermark_mode = true; #endif - - buf += strcspn(buf, ","); - while (*buf == 
',') - buf++; + else if (!kstrtoint(buf, 0, &val)) { + if (val > 0) + numa_remote_max_nodes = val; + } + if (!sep) + break; + buf = sep + 1; } return 0; -- Gitee From 77a4edae18c13623b75b124195c78a48687b13cc Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Fri, 12 Sep 2025 15:02:07 +0800 Subject: [PATCH 21/21] mm/numa_remote: add kernel doc for numa remote feature hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID3KH8 -------------------------------- Add kernel doc for numa remote feature. Signed-off-by: Jinjiang Tu --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 04ad341b5956..ac7eb9cbf24a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4126,6 +4126,23 @@ This can be set from sysctl after boot. See Documentation/admin-guide/sysctl/vm.rst for details. + numa_remote= [ARM64,KNL] + Prepare unused NUMA Nodes as remote Nodes, allows to hotplug + remote memory on these remote NUMA Nodes when CONFIG_NUMA_REMOTE + is enabled. By default, all unused NUMA Nodes will be configured + as remote Nodes. A numeric argument to numa_remote= can be used to limit + the number of remote NUMA Nodes. + Format: [arg0,][arg1] + preonline - allow to online unready memory and keep them isolated, + to improve the online performance. + nofallback - the remote nodes don't appear in the zonelists of + other nodes, the remote memory can only be allocated by + specifying the remote node. + hugetlb_nowatermark - allocate hugetlb in remote node will ignore + watermark, and all memory can be allocated as + hugetlb. + <number> - limit the number of remote NUMA Nodes. + ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. See Documentation/core-api/debugging-via-ohci1394.rst for more info. -- Gitee