From 07b9a8696a706d305e680598b18df882c13ea8de Mon Sep 17 00:00:00 2001 From: tanghui Date: Wed, 16 Nov 2022 15:12:16 +0800 Subject: [PATCH 1/3] sched: Introduce dynamic affinity for cfs scheduler Dynamic affinity set preferred cpus for task. When the utilization of taskgroup's preferred cpu is low, task only run in cpus preferred to enhance cpu resource locality and reduce interference between task cgroups, otherwise task can burst preferred cpus to use external cpu within cpus allowed. Signed-off-by: huwentao --- arch/arm64/configs/tencent.config | 1 + fs/proc/array.c | 13 +++ fs/proc/base.c | 80 +++++++++++++ include/linux/sched.h | 16 +++ include/linux/sched/sysctl.h | 3 + init/Kconfig | 15 +++ init/init_task.c | 3 + kernel/cgroup/cpuset.c | 154 +++++++++++++++++++++++++ kernel/fork.c | 13 +++ kernel/sched/core.c | 95 ++++++++++++++++ kernel/sched/fair.c | 180 +++++++++++++++++++++++++++++- kernel/sched/features.h | 7 ++ kernel/sysctl.c | 11 ++ 13 files changed, 590 insertions(+), 1 deletion(-) diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index 75ea73ecd347..3cb93ce14bf1 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -40,6 +40,7 @@ CONFIG_USER_NS=y # CONFIG_SECURITY_MONITOR is not set CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y CONFIG_SGETMASK_SYSCALL=y diff --git a/fs/proc/array.c b/fs/proc/array.c index ccd78c5f91c2..58bb4dce4e20 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -394,6 +394,16 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(task->cpus_ptr)); } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static void task_cpus_preferred(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_preferred:\t%*pb\n", + cpumask_pr_args(task->prefer_cpus)); + seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", + cpumask_pr_args(task->prefer_cpus)); +} +#endif + static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); @@ -467,6 +477,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + task_cpus_preferred(m, task); +#endif return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 06f2b66f5d4d..6fb724ac6c79 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -3132,6 +3133,82 @@ static const struct file_operations proc_setgroups_operations = { }; #endif /* CONFIG_USER_NS */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +static int preferred_cpuset_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p->prefer_cpus) + seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus)); + else + seq_putc(m, '\n'); + + put_task_struct(p); + + return 0; +} + +static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + cpumask_var_t new_mask; + int retval; + struct inode *inode = file_inode(file); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + + 
retval = cpumask_parselist_user(buf, count, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = set_prefer_cpus_ptr(p, new_mask); + if (retval < 0) + goto out_free_cpumask; + + if (!cpumask_empty(new_mask)) { + cpus_read_lock(); + dynamic_affinity_enable(); + cpus_read_unlock(); + } + + retval = count; + +out_free_cpumask: + free_cpumask_var(new_mask); +out_put_task: + put_task_struct(p); + + return retval; +} + +static int preferred_cpuset_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, preferred_cpuset_show, inode); +} + +static const struct file_operations proc_preferred_cpuset_operations = { + .open = preferred_cpuset_open, + .write = preferred_cpuset_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3677,6 +3754,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index ae9d328120db..0322d881fae3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1442,8 +1442,16 @@ struct task_struct { unsigned long is_coredump_mcs; #endif +#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) && !defined(__GENKSYMS__) + KABI_USE(1, cpumask_t *prefer_cpus); +#else KABI_RESERVE(1); +#endif +#if defined(CONFIG_TASK_PLACEMENT_BY_CPU_RANGE) && !defined(__GENKSYMS__) + KABI_USE(2, const cpumask_t *select_cpus); +#else KABI_RESERVE(2); +#endif KABI_RESERVE(3); KABI_RESERVE(4); @@ -2174,4 +2182,12 @@ static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); +void sched_prefer_cpus_free(struct task_struct *p); +void dynamic_affinity_enable(void); +#endif + #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 96b072b2a74c..9f273fe40590 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,6 +31,9 @@ extern unsigned int sysctl_cpu_qos_tick_gran; extern unsigned int sysctl_cpu_qos_tick_granularity; extern unsigned int sysctl_cpu_qos_latency_recoup; extern unsigned int sysctl_cpu_qos_recoup_granularity; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +extern int sysctl_sched_util_low_pct; +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/init/Kconfig b/init/Kconfig index 2542f4aeba58..4226d24823cf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -996,6 +996,21 @@ config BT_GROUP_SCHED endif #CGROUP_SCHED +config TASK_PLACEMENT_BY_CPU_RANGE + bool "variable cpu range for task placement" + +config QOS_SCHED_DYNAMIC_AFFINITY + bool "qos dynamic affinity" + depends on CPUSETS + depends on FAIR_GROUP_SCHED + select TASK_PLACEMENT_BY_CPU_RANGE + default n + help + This feature lets you allocate preferred cpus to taskgroup. If enabled, + it will make taskgroup only to use preferred cpus when cpu utilization + of taskgroup is below threshold setted, otherwise make taskgroup to use + cpus allowed. 
+ config UCLAMP_TASK_GROUP bool "Utilization clamping per group of tasks" depends on CGROUP_SCHED diff --git a/init/init_task.c b/init/init_task.c index b89453de8fdb..a943e50bc6c5 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -190,6 +190,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + .prefer_cpus = NULL, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b3ae4b77efa7..e75c46c74ce5 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -114,6 +114,9 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif /* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; @@ -212,6 +215,9 @@ struct cpuset { struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -488,15 +494,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { cpumask_var_t *pmask1, *pmask2, *pmask3; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t *pmask4; +#endif if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; pmask3 = &cs->subparts_cpus; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &cs->prefer_cpus; +#endif } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &tmp->prefer_cpus; +#endif } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -507,9 +522,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; +#endif return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +free_three: + free_cpumask_var(*pmask3); +#endif free_two: free_cpumask_var(*pmask2); free_one: @@ -525,11 +548,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(cs->prefer_cpus); +#endif free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->subparts_cpus); } if (tmp) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(tmp->prefer_cpus); +#endif free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); @@ -553,6 +582,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) return NULL; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(trial->prefer_cpus, cs->prefer_cpus); +#endif cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; @@ -596,6 +628,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) rcu_read_lock(); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + ret = -EINVAL; + if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed)) + goto out; +#endif /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; 
cpuset_for_each_child(c, css, cur) @@ -660,6 +697,69 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) return ret; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static cpumask_var_t prefer_cpus_attach; + +static void update_tasks_prefer_cpumask(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + set_prefer_cpus_ptr(task, cs->prefer_cpus); + css_task_iter_end(&it); +} + +/* + * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and + * all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + + if (cs == &top_cpuset) + return -EACCES; + + /* + * An empty prefer_cpus is ok which mean that the cpuset tasks disable + * dynamic affinity feature. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. + */ + if (!*buf) { + cpumask_clear(trialcs->prefer_cpus); + } else { + retval = cpulist_parse(buf, trialcs->prefer_cpus); + if (retval < 0) + return retval; + } + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus)) + return 0; + + if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed)) + return -EINVAL; + + update_tasks_prefer_cpumask(trialcs); + + if (!cpumask_empty(trialcs->prefer_cpus)) + dynamic_affinity_enable(); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus); + spin_unlock_irq(&callback_lock); + + return 0; +} +#endif + #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). @@ -2248,6 +2348,10 @@ static void cpuset_attach(struct cgroup_taskset *tset) else guarantee_online_cpus(cs, cpus_attach); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(prefer_cpus_attach, cs->prefer_cpus); +#endif + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) { @@ -2256,6 +2360,9 @@ static void cpuset_attach(struct cgroup_taskset *tset) * fail. 
TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, prefer_cpus_attach); +#endif cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); @@ -2316,6 +2423,9 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + FILE_DYNAMIC_CPULIST, +#endif } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, @@ -2446,6 +2556,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + retval = update_prefer_cpumask(cs, trialcs, buf); + break; +#endif default: retval = -EINVAL; break; @@ -2493,6 +2608,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus)); + break; +#endif default: ret = -EINVAL; } @@ -3262,6 +3382,15 @@ static struct cftype legacy_files[] = { .name = "loadavg", .seq_show = cpuset_cgroup_loadavg_show, }, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .name = "preferred_cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_DYNAMIC_CPULIST, + }, +#endif { } /* terminate */ }; @@ -3415,6 +3544,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(cs->prefer_cpus, parent->prefer_cpus); +#endif spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); @@ -3497,6 +3629,9 @@ static void cpuset_fork(struct task_struct *task) return; set_cpus_allowed_ptr(task, current->cpus_ptr); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, current->prefer_cpus); +#endif task->mems_allowed = current->mems_allowed; } @@ -3530,17 +3665,26 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL)); +#endif cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_clear(top_cpuset.prefer_cpus); +#endif fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL)); +#endif return 0; } @@ -3577,6 +3721,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t prefer_cpus; +#endif bool is_empty; spin_lock_irq(&callback_lock); @@ 
-3595,6 +3742,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) { + cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed); + cpumask_copy(cs->prefer_cpus, &prefer_cpus); + update_tasks_prefer_cpumask(cs); + } +#endif is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); diff --git a/kernel/fork.c b/kernel/fork.c index b3f95a497aac..42d21a1f7812 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -492,6 +492,9 @@ void free_task(struct task_struct *tsk) arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + sched_prefer_cpus_free(tsk); +#endif free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -968,6 +971,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + tsk->prefer_cpus = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2197,6 +2204,12 @@ static __latent_entropy struct task_struct *copy_process( p->tgid = p->pid; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + retval = sched_prefer_cpus_fork(p, current->prefer_cpus); + if (retval) + goto bad_fork_put_pidfd; +#endif + p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 573bee3ce201..eb40b0d27bbe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9697,6 +9697,101 @@ int cpu_cgroup_notify_prio_change(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask) +{ + p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!p->prefer_cpus) + return -ENOMEM; + + if (mask) + cpumask_copy(p->prefer_cpus, mask); + else + cpumask_clear(p->prefer_cpus); + + return 0; +} + +void sched_prefer_cpus_free(struct task_struct *p) +{ + kfree(p->prefer_cpus); +} + +static void do_set_prefer_cpus(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->__lock); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + } + if (running) + put_prev_task(rq, p); + + cpumask_copy(p->prefer_cpus, new_mask); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); +} + +/* + * Change a given task's prefer CPU affinity. Prioritize migrate the thread to + * prefer cpus according to preferred bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +static int __set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + struct rq_flags rf; + struct rq *rq; + int ret = 0; + + if (unlikely(!p->prefer_cpus)) + return -EINVAL; + + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + if (cpumask_equal(p->prefer_cpus, new_mask)) + goto out; + + if (!cpumask_subset(new_mask, p->cpus_ptr)) { + ret = -EINVAL; + goto out; + } + + do_set_prefer_cpus(p, new_mask); +out: + task_rq_unlock(rq, p, &rf); + + return ret; +} + +int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class != &fair_sched_class) + return 0; + + return __set_prefer_cpus_ptr(p, new_mask, false); +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8abbf625b11..64524be96c4a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5815,7 +5815,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_span(group), +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + p->select_cpus)) +#else p->cpus_ptr)) +#endif continue; /* Skip over this group if no cookie matched */ @@ -5946,7 +5950,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { +#else for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { +#endif struct rq *rq = cpu_rq(i); if (!sched_core_cookie_match(rq, p)) @@ -5998,7 +6006,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) +#else if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) +#endif return prev_cpu; /* @@ -6240,8 +6252,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); +#else cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - +#endif if (sched_feat(SIS_UTIL)) { sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { @@ -6301,13 +6316,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && cpumask_test_cpu(target, p->select_cpus)) +#else if (available_idle_cpu(target) || sched_idle_cpu(target)) +#endif return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_lowest_cache(prev, target) && +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + cpumask_test_cpu(prev, p->select_cpus) && +#endif (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; @@ -6317,7 +6339,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_lowest_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(p->recent_used_cpu, p->select_cpus)) { +#else cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { +#endif /* * 
Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6738,6 +6764,123 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return -1; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +#ifdef CONFIG_JUMP_LABEL +static DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_used); + +static inline bool dynamic_affinity_used(void) +{ + return static_branch_unlikely(&__dynamic_affinity_used); +} + +void dynamic_affinity_enable(void) +{ + static_branch_enable_cpuslocked(&__dynamic_affinity_used); +} + +#else /* CONFIG_JUMP_LABEL */ +static bool dynamic_affinity_used(void) +{ + return true; +} +#endif + +/* + * Low utilization threshold for CPU + * + * (default: 85%), units: percentage of CPU utilization) + */ +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && + cpumask_subset(p->prefer_cpus, p->cpus_ptr); +} + +static inline unsigned long taskgroup_cpu_util(struct task_group *tg, + int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tg->se[cpu] && sched_feat(DA_UTIL_TASKGROUP)) + return tg->se[cpu]->avg.util_avg; +#endif + + return cpu_util(cpu); +} + +/* + * set_task_select_cpus: select the cpu range for task + * @p: the task whose available cpu range will to set + * @idlest_cpu: the cpu which is the idlest in prefer cpus + * + * If sum of 'util_avg' among 'preferred_cpus' lower than the percentage + * 'sysctl_sched_util_low_pct' of 'preferred_cpus' capacity, select + * 'preferred_cpus' range for task, otherwise select 'preferred_cpus' for task. + * + * The available cpu range set to p->select_cpus. Idlest cpu in preferred cpus + * set to @idlest_cpu, which is set to wakeup cpu when fast path wakeup cpu + * without p->select_cpus. + */ +static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, + int sd_flag) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + int nr_cpus_valid = 0; + + p->select_cpus = p->cpus_ptr; + if (!prefer_cpus_valid(p)) + return; + + rcu_read_lock(); + tg = task_group(p); + for_each_cpu_and(cpu, p->prefer_cpus, cpu_online_mask) { + if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(capacity_of(cpu) - + taskgroup_cpu_util(tg, cpu)); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) { + rcu_read_unlock(); + p->select_cpus = p->prefer_cpus; + return; + } + + util_avg_sum += taskgroup_cpu_util(tg, cpu); + tg_capacity += capacity_of(cpu); + nr_cpus_valid++; + } + rcu_read_unlock(); + + /* + * Follow cases should select cpus_ptr, checking by condition of + * tg_capacity > nr_cpus_valid: + * 1. all prefer_cpus offline; + * 2. all prefer_cpus has no cfs capaicity(tg_capacity = nr_cpus_valid * 1) + */ + if (tg_capacity > nr_cpus_valid && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + p->select_cpus = p->prefer_cpus; + } else if (idlest_cpu) { + *idlest_cpu = -1; + } +} +#endif + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -6758,11 +6901,20 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + int idlest_cpu = -1; +#endif /* * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + p->select_cpus = p->cpus_ptr; + if (dynamic_affinity_used()) + set_task_select_cpus(p, &idlest_cpu, sd_flag); +#endif if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -6774,7 +6926,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + cpumask_test_cpu(cpu, p->select_cpus); +#else cpumask_test_cpu(cpu, p->cpus_ptr); +#endif } rcu_read_lock(); @@ -6785,7 +6941,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE + new_cpu = cpu; + if (cpu != prev_cpu && + cpumask_test_cpu(prev_cpu, p->select_cpus)) +#else if (cpu != prev_cpu) +#endif new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); sd = NULL; /* Prefer wake_affine over balance flags */ @@ -6811,6 +6973,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } rcu_read_unlock(); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (idlest_cpu != -1 && !cpumask_test_cpu(new_cpu, p->select_cpus)) + new_cpu = idlest_cpu; +#endif + return new_cpu; } @@ -7718,7 +7885,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + p->select_cpus = p->cpus_ptr; + if (dynamic_affinity_used()) + set_task_select_cpus(p, NULL, 0); + if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { +#else if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { +#endif int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -7738,7 +7912,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 59e83f5a5719..36d5a2f83630 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -90,3 +90,10 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. 
*/ SCHED_FEAT(UTIL_EST, true) + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +/* + * Use util_avg of bottom-Level taskgroup + */ +SCHED_FEAT(DA_UTIL_TASKGROUP, true) +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1556b988a527..4d3c1f8ec3d9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1946,6 +1946,17 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "sched_util_low_pct", + .data = &sysctl_sched_util_low_pct, + .maxlen = sizeof(sysctl_sched_util_low_pct), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, #endif { } }; -- Gitee From 05ab7ca72944b70ae42852138459a01852c2ec59 Mon Sep 17 00:00:00 2001 From: hantwofish Date: Thu, 12 Jun 2025 14:17:39 +0800 Subject: [PATCH 2/3] sched: Support NUMA parallel scheduling for multiple processes For architectures with multiple NUMA node levels and large distances between nodes, a better approach is to support processes running in parallel on each NUMA node. The usage is restricted to the following scenarios: 1. No CPU binding for user-space processes; 2. It is applicable to distributed applications, such as business architectures with one master and multiple slaves running in parallel; 3. The existing "qos dynamic affinity" and "qos smart grid" features must not be used simultaneously. Signed-off-by: huwentao --- arch/arm64/Kconfig | 1 + arch/arm64/configs/tencent.config | 1 + arch/arm64/include/asm/prefer_numa.h | 13 ++++++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/prefer_numa.c | 68 ++++++++++++++++++++++++++++ include/linux/perf_event.h | 2 + include/linux/sched.h | 1 + init/Kconfig | 22 +++++++++ kernel/cgroup/cpuset.c | 8 +++- kernel/events/core.c | 13 ++++++ kernel/fork.c | 9 ++++ kernel/sched/debug.c | 42 +++++++++++++++++ kernel/sched/fair.c | 19 +++++++- kernel/sched/features.h | 4 ++ 14 files changed, 200 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/prefer_numa.h create mode 100644 arch/arm64/kernel/prefer_numa.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 198dcd3a0c36..fbf4eb3e0027 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -71,6 +71,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_SCHED_PARAL select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index 3cb93ce14bf1..3811e0167e78 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -37,6 +37,7 @@ CONFIG_CGROUP_MISC=y CONFIG_CGROUP_SLI=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y +CONFIG_SCHED_PARAL=y # CONFIG_SECURITY_MONITOR is not set CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y diff --git a/arch/arm64/include/asm/prefer_numa.h b/arch/arm64/include/asm/prefer_numa.h new file mode 100644 index 000000000000..6c8e2b2142b9 --- /dev/null +++ b/arch/arm64/include/asm/prefer_numa.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_PREFER_NUMA_H +#define __ASM_PREFER_NUMA_H + +#include + +#define PROBE_NUMA_PMU_NAME "hisi_sccl3_hha0" +#define PROBE_NUMA_PMU_EVENT 0x02 + +void set_task_paral_node(struct task_struct *p); +int probe_pmu_numa_event(void); + +#endif /* __ASM_PREFER_NUMA_H */ diff 
--git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 9763b9f576d8..d0813553ce2e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_SDEI_WATCHDOG) += watchdog_sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_SCHED_PARAL) += prefer_numa.o obj-y += vdso/ probes/ obj-$(CONFIG_COMPAT_VDSO) += vdso32/ diff --git a/arch/arm64/kernel/prefer_numa.c b/arch/arm64/kernel/prefer_numa.c new file mode 100644 index 000000000000..e6f3f43fb97a --- /dev/null +++ b/arch/arm64/kernel/prefer_numa.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * choose a prefer numa node + * + * Copyright (C) 2025 Huawei Limited. + */ +#include +#include + +static atomic_t paral_nid_last = ATOMIC_INIT(-1); + +int probe_pmu_numa_event(void) +{ + struct perf_event *event; + struct perf_event_attr attr = {}; + int type = perf_pmu_type_of_name(PROBE_NUMA_PMU_NAME); + + if (type == -1) + return -EINVAL; + + attr.type = type; + attr.config = PROBE_NUMA_PMU_EVENT; + attr.size = sizeof(struct perf_event_attr); + attr.pinned = 1; + attr.disabled = 1; + attr.sample_period = 0; + + event = perf_event_create_kernel_counter(&attr, smp_processor_id(), + NULL, NULL, NULL); + if (IS_ERR(event)) + return PTR_ERR(event); + + perf_event_release_kernel(event); + + return 0; +} + +static inline unsigned int update_sched_paral_nid(void) +{ + return (unsigned int)atomic_inc_return(¶l_nid_last); +} + +void set_task_paral_node(struct task_struct *p) +{ + int nid; + int i = 0; + const cpumask_t *cpus_mask; + + if (is_global_init(current)) + return; + + if (p->flags & PF_KTHREAD || p->tgid != p->pid) + return; + + while (i < nr_node_ids) { + nid = update_sched_paral_nid() % nr_node_ids; + cpus_mask = cpumask_of_node(nid); + + if (cpumask_empty(cpus_mask) || + !cpumask_subset(cpus_mask, p->cpus_ptr)) { + i++; + continue; + } + + cpumask_copy(p->prefer_cpus, cpus_mask); + break; + } +} diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 85e966c2c1f2..07fc4b3e3ce4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1367,6 +1367,7 @@ extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); +extern int perf_pmu_type_of_name(const char *name); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, @@ -1446,6 +1447,7 @@ static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } +static inline int perf_pmu_type_of_name(const char *name) { return -1; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0322d881fae3..2e7048f23f89 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2188,6 +2188,7 @@ int set_prefer_cpus_ptr(struct task_struct *p, int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); void sched_prefer_cpus_free(struct task_struct *p); void dynamic_affinity_enable(void); +bool sched_paral_used(void); #endif #endif diff --git a/init/Kconfig b/init/Kconfig index 
4226d24823cf..ea5f89cdc013 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1265,6 +1265,28 @@ config SECURITY_MONITOR default y help Allow user to add security monitor +# +# For architectures that want to enable the support for SCHED_PARAL +# +config ARCH_SUPPORTS_SCHED_PARAL + bool + +config SCHED_PARAL + bool "Parallelly schedule processes on different NUMA nodes" + depends on ARCH_SUPPORTS_SCHED_PARAL + depends on QOS_SCHED_DYNAMIC_AFFINITY + default n + help + By enabling this feature, processes can be scheduled in parallel + on various NUMA nodes to better utilize the cache in NUMA node. + The usage is restricted to the following scenarios: + 1. No CPU binding is performed for user-space processes; + 2. It is applicable to distributed applications, such as business + architectures with one master and multiple slaves running in + parallel; + 3. The existing "qos dynamic affinity" and "qos smart grid" + features must not be used simultaneously. + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e75c46c74ce5..7217abfcc18e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2361,7 +2361,8 @@ static void cpuset_attach(struct cgroup_taskset *tset) */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - set_prefer_cpus_ptr(task, prefer_cpus_attach); + if (!sched_paral_used() || !cpumask_empty(prefer_cpus_attach)) + set_prefer_cpus_ptr(task, prefer_cpus_attach); #endif cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); @@ -3630,7 +3631,10 @@ static void cpuset_fork(struct task_struct *task) set_cpus_allowed_ptr(task, current->cpus_ptr); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - set_prefer_cpus_ptr(task, current->prefer_cpus); + rcu_read_lock(); + if (!sched_paral_used() || !cpumask_empty(task_cs(current)->prefer_cpus)) + set_prefer_cpus_ptr(task, current->prefer_cpus); + rcu_read_unlock(); #endif task->mems_allowed = current->mems_allowed; } diff --git a/kernel/events/core.c b/kernel/events/core.c index f83d3e4f2578..2043b0f729a2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12372,6 +12372,19 @@ static int __init perf_event_sysfs_init(void) } device_initcall(perf_event_sysfs_init); +int perf_pmu_type_of_name(const char *name) +{ + unsigned int i; + struct pmu *pmu; + + idr_for_each_entry(&pmu_idr, pmu, i) { + if (!strcmp(pmu->name, name)) + return pmu->type; + } + + return -1; +} + #ifdef CONFIG_CGROUP_PERF static struct cgroup_subsys_state * perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/fork.c b/kernel/fork.c index 42d21a1f7812..058b78c11577 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -106,6 +106,10 @@ #include +#ifdef CONFIG_SCHED_PARAL +#include +#endif + #define CREATE_TRACE_POINTS #include @@ -2210,6 +2214,11 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_put_pidfd; #endif +#ifdef CONFIG_SCHED_PARAL + if (sched_paral_used()) + set_task_paral_node(p); +#endif + p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f03c43e7861a..85706ad00fc1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,6 +6,11 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ + +#ifdef CONFIG_SCHED_PARAL +#include +#endif + #include "sched.h" /* @@ -96,6 +101,39 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif 
/* CONFIG_JUMP_LABEL */ +#ifdef CONFIG_SCHED_PARAL +static void sched_feat_disable_paral(char *cmp) +{ + struct task_struct *tsk, *t; + + if (strncmp(cmp, "PARAL", 5) == 0) { + read_lock(&tasklist_lock); + for_each_process(tsk) { + if (tsk->flags & PF_KTHREAD || is_global_init(tsk)) + continue; + + for_each_thread(tsk, t) + cpumask_clear(t->prefer_cpus); + } + read_unlock(&tasklist_lock); + } +} + +static bool sched_feat_enable_paral(char *cmp) +{ + if (strncmp(cmp, "PARAL", 5) != 0) + return true; + + if (probe_pmu_numa_event() != 0) + return false; + + return true; +} +#else +static void sched_feat_disable_paral(char *cmp) {}; +static bool sched_feat_enable_paral(char *cmp) { return true; }; +#endif /* CONFIG_SCHED_PARAL */ + static int sched_feat_set(char *cmp) { int i; @@ -112,8 +150,12 @@ static int sched_feat_set(char *cmp) if (neg) { sysctl_sched_features &= ~(1UL << i); + sched_feat_disable_paral(cmp); sched_feat_disable(i); } else { + if (!sched_feat_enable_paral(cmp)) + return -EPERM; + sysctl_sched_features |= (1UL << i); sched_feat_enable(i); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 64524be96c4a..695ea7b7111c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6765,6 +6765,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +bool sched_paral_used(void) +{ +#ifdef CONFIG_SCHED_PARAL + if (sched_feat(PARAL)) + return true; +#endif + + return false; +} #ifdef CONFIG_JUMP_LABEL static DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_used); @@ -6866,6 +6875,12 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, } rcu_read_unlock(); + /* In extreme cases, it may cause uneven system load. */ + if (sched_paral_used() && sysctl_sched_util_low_pct == 100 && nr_cpus_valid > 0) { + p->select_cpus = p->prefer_cpus; + return; + } + /* * Follow cases should select cpus_ptr, checking by condition of * tg_capacity > nr_cpus_valid: @@ -6912,7 +6927,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY p->select_cpus = p->cpus_ptr; - if (dynamic_affinity_used()) + if (dynamic_affinity_used() || sched_paral_used()) set_task_select_cpus(p, &idlest_cpu, sd_flag); #endif if (sd_flag & SD_BALANCE_WAKE) { @@ -7887,7 +7902,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY p->select_cpus = p->cpus_ptr; - if (dynamic_affinity_used()) + if (dynamic_affinity_used() || sched_paral_used()) set_task_select_cpus(p, NULL, 0); if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { #else diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 36d5a2f83630..f929c48c621a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -58,6 +58,10 @@ SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) SCHED_FEAT(SIS_UTIL, false) +#ifdef CONFIG_SCHED_PARAL +SCHED_FEAT(PARAL, false) +#endif + /* * Issue a WARN when we do multiple update_rq_clock() calls * in a single rq->lock section. Default disabled because the -- Gitee From ffe6060db70608c26bd65b9b917b7b653ca6ec40 Mon Sep 17 00:00:00 2001 From: hantwofish Date: Wed, 11 Jun 2025 16:59:25 +0800 Subject: [PATCH 3/3] sched/fair: Prefer physical cores when migrating tasks When cpu hyperthreading is enabled, one physical core can virtualize multiple logical cpus. Assume that physical core0 virtualizes two logical cpus, cpu0 and cpu1. 
Only when the load of cpu0 exceeds the set ratio to the capacity of cpu0, the task will be migrated to the cpu1, otherwise the task will not be migrated and the cpu0 will still be used. The optimizations are as follows: 1. A more reasonable algorithm for obtaining load values; 2. Limit the maximum value of sysctl_sched_util_ratio to 100; 3. If the value of sysctl_sched_util_ratio is 100, the other smt of the core will not be used. External impacts: 1) default config in arm64: CONFIG_SCHED_KEEP_ON_CORE=y 2) sysctl: /proc/sys/kernel/sched_util_ratio 3) sched features: KEEP_ON_CORE (default NO_KEEP_ON_CORE) Signed-off-by: huwentao --- arch/arm64/Kconfig | 1 + arch/arm64/configs/tencent.config | 2 ++ include/linux/sched/sysctl.h | 4 ++++ init/Kconfig | 20 ++++++++++++++++++ kernel/sched/fair.c | 34 +++++++++++++++++++++++++++++++ kernel/sched/features.h | 4 ++++ kernel/sysctl.c | 11 ++++++++++ 7 files changed, 76 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fbf4eb3e0027..6a7d7bf4b35a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -71,6 +71,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_SCHED_KEEP_ON_CORE select ARCH_SUPPORTS_SCHED_PARAL select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index 3811e0167e78..0adaf17e9a2d 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -37,6 +37,8 @@ CONFIG_CGROUP_MISC=y CONFIG_CGROUP_SLI=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y +CONFIG_SCHED_SMT=y +CONFIG_SCHED_KEEP_ON_CORE=y CONFIG_SCHED_PARAL=y # CONFIG_SECURITY_MONITOR is not set CONFIG_CHECKPOINT_RESTORE=y diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 9f273fe40590..3a0ab8e2a57f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -57,6 +57,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_KEEP_ON_CORE +extern int sysctl_sched_util_ratio; +#endif + /* * control realtime throttling: * diff --git a/init/Kconfig b/init/Kconfig index ea5f89cdc013..3aa73f27028f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1260,11 +1260,31 @@ config NET_NS endif # NAMESPACES +config SCHED_KEEP_ON_CORE + bool "Prefer physical cores when migrating tasks" + depends on SCHED_SMT + depends on ARCH_SUPPORTS_SCHED_KEEP_ON_CORE + default n + help + When cpu hyperthreading is enabled, one physical core can virtualize + multiple logical cpus. Assume that physical core0 virtualizes two + logical cpus, cpu0 and cpu1. Only when the load of cpu0 exceeds the + ratio to the capacity of cpu0, the task will be migrated to the cpu1, + otherwise the task will not be migrated and the cpu0 will still be + used. 
+ config SECURITY_MONITOR bool "security monitor" default y help Allow user to add security monitor + +# For architectures that want to enable the support for SCHED_KEEP_ON_CORE +# +config ARCH_SUPPORTS_SCHED_KEEP_ON_CORE + bool + +# # # For architectures that want to enable the support for SCHED_PARAL # diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 695ea7b7111c..f4727163abb0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6198,6 +6198,22 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return si_cpu; } +#ifdef CONFIG_SCHED_KEEP_ON_CORE +int sysctl_sched_util_ratio = 100; + +static bool core_has_spare(int cpu) +{ + int core_id = cpumask_first(cpu_smt_mask(cpu)); + unsigned long util = cpu_util(core_id); + unsigned long capacity = capacity_of(core_id); + + if (sysctl_sched_util_ratio == 100) + return true; + + return util * 100 < capacity * sysctl_sched_util_ratio; +} +#endif + #else /* CONFIG_SCHED_SMT */ static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) @@ -6986,6 +7002,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (want_affine) current->recent_used_cpu = cpu; } + +#ifdef CONFIG_SCHED_KEEP_ON_CORE + if (sched_feat(KEEP_ON_CORE) && + static_branch_likely(&sched_smt_present)) { + if (core_has_spare(new_cpu)) + new_cpu = cpumask_first(cpu_smt_mask((new_cpu))); + } +#endif + rcu_read_unlock(); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY @@ -7886,6 +7911,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) lockdep_assert_rq_held(env->src_rq); +#ifdef CONFIG_SCHED_KEEP_ON_CORE + if (sched_feat(KEEP_ON_CORE) && + static_branch_likely(&sched_smt_present)) { + if (core_has_spare(env->dst_cpu) && + cpumask_first(cpu_smt_mask((env->dst_cpu))) != env->dst_cpu) + return 0; + } +#endif + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f929c48c621a..d4651d9cdb4a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -58,6 +58,10 @@ SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) SCHED_FEAT(SIS_UTIL, false) +#ifdef CONFIG_SCHED_KEEP_ON_CORE +SCHED_FEAT(KEEP_ON_CORE, false) +#endif + #ifdef CONFIG_SCHED_PARAL SCHED_FEAT(PARAL, false) #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4d3c1f8ec3d9..84f83cd5ad2b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1957,6 +1957,17 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, +#endif +#ifdef CONFIG_SCHED_KEEP_ON_CORE + { + .procname = "sched_util_ratio", + .data = &sysctl_sched_util_ratio, + .maxlen = sizeof(sysctl_sched_util_ratio), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, #endif { } }; -- Gitee
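
A note on the core decision in patch 1: set_task_select_cpus() keeps a wakeup inside p->prefer_cpus while the taskgroup's summed util_avg over those CPUs is at or below sysctl_sched_util_low_pct percent of their combined capacity (default 85, tunable via /proc/sys/kernel/sched_util_low_pct); an idle preferred CPU short-circuits the check, and otherwise the task falls back to the full cpus_ptr mask. The following is a minimal userspace model of that comparison only; struct cpu_stat, fits_preferred_cpus() and the sample numbers are illustrative stand-ins, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for per-CPU utilization and capacity. */
struct cpu_stat {
	unsigned long util_avg;		/* taskgroup util_avg on this CPU */
	unsigned long capacity;		/* capacity_of(cpu) */
};

/*
 * Mirror of the patch's admission check: stay on prefer_cpus while
 * util_sum * 100 <= capacity_sum * low_pct.
 */
static bool fits_preferred_cpus(const struct cpu_stat *prefer, int nr,
				int low_pct)
{
	unsigned long util_sum = 0, cap_sum = 0;
	int i;

	for (i = 0; i < nr; i++) {
		util_sum += prefer[i].util_avg;
		cap_sum += prefer[i].capacity;
	}

	/* cap_sum > nr guards against offline / zero-capacity CPUs. */
	return cap_sum > (unsigned long)nr &&
	       util_sum * 100 <= cap_sum * (unsigned long)low_pct;
}

int main(void)
{
	struct cpu_stat prefer[] = {
		{ .util_avg = 300, .capacity = 1024 },
		{ .util_avg = 512, .capacity = 1024 },
	};

	/* default kernel.sched_util_low_pct in the patch is 85 */
	printf("stay on preferred cpus: %s\n",
	       fits_preferred_cpus(prefer, 2, 85) ? "yes" : "no");
	return 0;
}

Preferred CPUs are configured per task through /proc/<pid>/preferred_cpuset or per cpuset through the new preferred_cpus file; an empty mask leaves dynamic affinity inactive for the task (prefer_cpus_valid() rejects empty masks), and a mask that is not a subset of the allowed CPUs is rejected with -EINVAL.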
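
For patch 2 (SCHED_PARAL), set_task_paral_node() spreads new processes across NUMA nodes: an atomic counter is advanced and taken modulo nr_node_ids, nodes whose CPU mask is empty or not a subset of the task's allowed CPUs are skipped, and the chosen node's CPUs become the task's prefer_cpus. The sketch below is a simplified single-threaded model of that round-robin walk; node_cpus[], pick_paral_node() and the plain-bitmask representation of node CPU sets are assumptions made for illustration, not kernel code.

#include <stdio.h>

#define NR_NODES 4

/* Illustrative per-node CPU bitmasks; bit i set == CPU i is in the node. */
static const unsigned long node_cpus[NR_NODES] = {
	0x000f, 0x00f0, 0x0f00, 0xf000
};

static unsigned int paral_nid_last;	/* atomic_t in the patch */

/*
 * Pick the next node round-robin, skipping nodes that are empty or
 * not fully contained in the task's allowed CPUs (cpus_ptr).
 */
static int pick_paral_node(unsigned long cpus_allowed)
{
	int tries;

	for (tries = 0; tries < NR_NODES; tries++) {
		int nid = paral_nid_last++ % NR_NODES;
		unsigned long mask = node_cpus[nid];

		if (!mask || (mask & ~cpus_allowed))
			continue;	/* empty or not a subset: skip */
		return nid;		/* prefer_cpus = node_cpus[nid] */
	}
	return -1;	/* no usable node; leave prefer_cpus untouched */
}

int main(void)
{
	unsigned long allowed = 0xffff;	/* task may run on any CPU */
	int i;

	for (i = 0; i < 6; i++)
		printf("fork %d -> node %d\n", i, pick_paral_node(allowed));
	return 0;
}

In the kernel the counter is an atomic_t shared by all forks, kthreads and non-group-leader threads are skipped, and the PARAL scheduler feature can only be switched on when probe_pmu_numa_event() finds the expected uncore PMU (hisi_sccl3_hha0).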
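
For patch 3 (KEEP_ON_CORE), core_has_spare() decides whether the primary SMT sibling of a core still has headroom: with sysctl_sched_util_ratio at 100 (the default, exposed as /proc/sys/kernel/sched_util_ratio) it always reports spare capacity, so wakeups are redirected to the first sibling and migrations onto a secondary sibling are refused, leaving the other SMT siblings unused; with a lower ratio the primary sibling is preferred only while util * 100 < capacity * ratio. A minimal model of that check, with illustrative utilization and capacity values:

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirror of core_has_spare(): with ratio == 100 the primary SMT
 * sibling is always preferred; otherwise it is preferred only while
 * util * 100 < capacity * ratio.
 */
static bool core_has_spare(unsigned long util, unsigned long capacity,
			   int ratio)
{
	if (ratio == 100)
		return true;

	return util * 100 < capacity * (unsigned long)ratio;
}

int main(void)
{
	/* default kernel.sched_util_ratio in the patch is 100 */
	printf("ratio 100, util 900/1024: %d\n",
	       core_has_spare(900, 1024, 100));
	printf("ratio 60,  util 900/1024: %d\n",
	       core_has_spare(900, 1024, 60));
	return 0;
}

The behaviour only takes effect when SMT is present and the KEEP_ON_CORE scheduler feature is enabled; it defaults to NO_KEEP_ON_CORE.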