From b893afc50d410688c06263d75fd8379f66f73b9f Mon Sep 17 00:00:00 2001 From: jiangheng Date: Thu, 18 Dec 2025 16:10:40 +0800 Subject: [PATCH 1/2] sched: Introduce dynamic affinity for cfs scheduler [commit 890af1b1e1d59ec2d382f015c7ebbb19b26bf6af openEuler.] Dynamic affinity set preferred cpus for task. When the utilization of taskgroup's preferred cpu is low, task only run in cpus preferred to enhance cpu resource locality and reduce interference between task cgroups, otherwise task can burst preferred cpus to use external cpu within cpus allowed. Signed-off-by: jiangheng --- arch/arm64/configs/tencent.config | 1 + fs/proc/array.c | 16 +++ fs/proc/base.c | 73 ++++++++++ include/linux/sched.h | 23 +++ init/Kconfig | 11 ++ init/init_task.c | 3 + kernel/cgroup/cpuset.c | 164 ++++++++++++++++++++- kernel/fork.c | 16 +++ kernel/sched/core.c | 98 +++++++++++++ kernel/sched/debug.c | 6 + kernel/sched/fair.c | 230 ++++++++++++++++++++++++++++++ kernel/sched/features.h | 7 + 12 files changed, 647 insertions(+), 1 deletion(-) diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index 3352297eb324..029fa71f43b8 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -31,6 +31,7 @@ CONFIG_MEMCG=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y diff --git a/fs/proc/array.c b/fs/proc/array.c index f2b44669408c..b3eece840aeb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -478,6 +478,19 @@ int host_pid_info(struct seq_file *m, struct pid_namespace *ns, return 0; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static void task_cpus_preferred(struct seq_file *m, struct task_struct *task) +{ + if (!dynamic_affinity_enabled()) + return; + + seq_printf(m, "Cpus_preferred:\t%*pb\n", + cpumask_pr_args(task->prefer_cpus)); + seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", + cpumask_pr_args(task->prefer_cpus)); +} +#endif + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -507,6 +520,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); arch_proc_pid_thread_features(m, task); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + task_cpus_preferred(m, task); +#endif return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 3ebeef46d9d8..a25d507afe32 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3236,6 +3236,76 @@ static const struct file_operations proc_setgroups_operations = { }; #endif /* CONFIG_USER_NS */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +static int preferred_cpuset_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p->prefer_cpus) + seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus)); + else + seq_putc(m, '\n'); + + put_task_struct(p); + + return 0; +} + +static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + cpumask_var_t new_mask; + int retval; + struct inode *inode = file_inode(file); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + + retval = cpumask_parselist_user(buf, count, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = 
set_prefer_cpus_ptr(p, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = count; + +out_free_cpumask: + free_cpumask_var(new_mask); +out_put_task: + put_task_struct(p); + + return retval; +} + +static int preferred_cpuset_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, preferred_cpuset_show, inode); +} + +static const struct file_operations proc_preferred_cpuset_operations = { + .open = preferred_cpuset_open, + .write = preferred_cpuset_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3764,6 +3834,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index c0fb12930d9d..d8bfa6b7f383 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -602,8 +602,13 @@ struct sched_statistics { u64 core_forceidle_sum; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + u64 nr_wakeups_preferred_cpus; + u64 nr_wakeups_force_preferred_cpus; +#else KABI_RESERVE(1); KABI_RESERVE(2); +#endif KABI_RESERVE(3); KABI_RESERVE(4); KABI_RESERVE(5); @@ -1537,6 +1542,11 @@ struct task_struct { /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t *prefer_cpus; + const cpumask_t *select_cpus; +#endif + /* * Number of functions that haven't been traced * because of depth overrun: @@ -2589,6 +2599,19 @@ static inline void rseq_execve(struct task_struct *t) #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); +void sched_prefer_cpus_free(struct task_struct *p); + +extern struct static_key_false __dynamic_affinity_switch; +static inline bool dynamic_affinity_enabled(void) +{ + return static_branch_unlikely(&__dynamic_affinity_switch); +} +#endif + #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); diff --git a/init/Kconfig b/init/Kconfig index 0bb86ff1f222..784021a8e174 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1113,6 +1113,17 @@ config BT_GROUP_SCHED endif #CGROUP_SCHED +config QOS_SCHED_DYNAMIC_AFFINITY + bool "qos dynamic affinity" + depends on CPUSETS + depends on FAIR_GROUP_SCHED + default n + help + This feature lets you allocate preferred cpus to taskgroup. If enabled, + it will make taskgroup only to use preferred cpus when cpu utilization + of taskgroup is below threshold setted, otherwise make taskgroup to use + cpus allowed. 
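
As a quick illustration of the per-thread interface registered in tid_base_stuff above, a user-space program could set a thread's preferred CPU list through procfs. This is a minimal sketch and not part of the patch: the path layout follows the "preferred_cpuset" entry above, the helper name is invented, and the written value must be a CPU list that cpumask_parselist_user() accepts and that stays inside the thread's cpus_allowed, otherwise set_prefer_cpus_ptr() returns -EINVAL. The same preference can also be applied group-wide through the cpuset "preferred_cpus" file added later in this patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Hypothetical helper: write a CPU list such as "0-3" to the new file. */
static int set_preferred_cpuset(pid_t pid, pid_t tid, const char *cpulist)
{
	char path[64];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "/proc/%d/task/%d/preferred_cpuset",
		 (int)pid, (int)tid);

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	/* The kernel rejects lists outside cpus_allowed with -EINVAL. */
	if (write(fd, cpulist, strlen(cpulist)) < 0)
		ret = -1;

	close(fd);
	return ret;
}

int main(void)
{
	pid_t self = getpid();

	/* Prefer CPUs 0-3 for the calling thread. */
	return set_preferred_cpuset(self, self, "0-3") ? 1 : 0;
}
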
+ config SCHED_MM_CID def_bool y depends on SMP && RSEQ diff --git a/init/init_task.c b/init/init_task.c index 13f1af12823b..26590b951350 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -223,6 +223,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + .prefer_cpus = NULL, +#endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a5263753131..6b4835a7a9ac 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -131,6 +131,9 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif /* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; @@ -245,6 +248,9 @@ static inline bool is_prs_invalid(int prs_state) struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -633,15 +639,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { cpumask_var_t *pmask1, *pmask2, *pmask3; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t *pmask4; +#endif if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; pmask3 = &cs->subparts_cpus; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &cs->prefer_cpus; +#endif } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &tmp->prefer_cpus; +#endif } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -652,9 +667,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; +#endif return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +free_three: + free_cpumask_var(*pmask3); +#endif free_two: free_cpumask_var(*pmask2); free_one: @@ -670,11 +693,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(cs->prefer_cpus); +#endif free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->subparts_cpus); } if (tmp) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(tmp->prefer_cpus); +#endif free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); @@ -698,6 +727,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) return NULL; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(trial->prefer_cpus, cs->prefer_cpus); +#endif cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; @@ -779,6 +811,12 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) if (cur == &top_cpuset) goto out; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + ret = -EINVAL; + if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed)) + goto out; +#endif + par = parent_cs(cur); /* @@ 
-827,6 +865,69 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) return ret; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static cpumask_var_t prefer_cpus_attach; + +static void update_tasks_prefer_cpumask(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + set_prefer_cpus_ptr(task, cs->prefer_cpus); + css_task_iter_end(&it); +} + +/* + * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and + * all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + + if (cs == &top_cpuset) + return -EACCES; + + if (!dynamic_affinity_enabled()) + return -EPERM; + + /* + * An empty prefer_cpus is ok which mean that the cpuset tasks disable + * dynamic affinity feature. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. + */ + if (!*buf) { + cpumask_clear(trialcs->prefer_cpus); + } else { + retval = cpulist_parse(buf, trialcs->prefer_cpus); + if (retval < 0) + return retval; + } + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus)) + return 0; + + if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed)) + return -EINVAL; + + update_tasks_prefer_cpumask(trialcs); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus); + spin_unlock_irq(&callback_lock); + + return 0; +} +#endif + #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). @@ -2708,6 +2809,10 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) * fail. 
TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(prefer_cpus_attach, cs->prefer_cpus); + set_prefer_cpus_ptr(task, prefer_cpus_attach); +#endif cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flags(cs, task); @@ -2819,6 +2924,9 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + FILE_DYNAMIC_CPULIST, +#endif } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, @@ -2949,6 +3057,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + retval = update_prefer_cpumask(cs, trialcs, buf); + break; +#endif default: retval = -EINVAL; break; @@ -2997,6 +3110,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus)); + break; +#endif default: ret = -EINVAL; } @@ -3879,7 +3997,15 @@ static struct cftype legacy_files[] = { .name = "loadavg", .seq_show = cpuset_cgroup_loadavg_show, }, - +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .name = "preferred_cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_DYNAMIC_CPULIST, + }, +#endif { } /* terminate */ }; @@ -3938,6 +4064,17 @@ static struct cftype dfl_files[] = { .seq_show = cpuset_cgroup_loadavg_show, }, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .name = "preferred_cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_DYNAMIC_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, +#endif + { } /* terminate */ }; @@ -4049,6 +4186,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(cs->prefer_cpus, parent->prefer_cpus); +#endif spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); @@ -4202,6 +4342,9 @@ static void cpuset_fork(struct task_struct *task) return; set_cpus_allowed_ptr(task, current->cpus_ptr); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, current->prefer_cpus); +#endif task->mems_allowed = current->mems_allowed; return; } @@ -4247,17 +4390,26 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL)); +#endif cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_clear(top_cpuset.prefer_cpus); +#endif fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; 
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL)); +#endif return 0; } @@ -4294,6 +4446,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t prefer_cpus; +#endif bool is_empty; spin_lock_irq(&callback_lock); @@ -4312,6 +4467,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) { + cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed); + cpumask_copy(cs->prefer_cpus, &prefer_cpus); + update_tasks_prefer_cpumask(cs); + } +#endif is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); diff --git a/kernel/fork.c b/kernel/fork.c index 8a9102ad6145..83e23d7e8070 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,6 +659,10 @@ void free_task(struct task_struct *tsk) #ifdef CONFIG_IEE_PTRP if(haoc_enabled) iee_invalidate_token(tsk); +#endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (dynamic_affinity_enabled()) + sched_prefer_cpus_free(tsk); #endif free_task_struct(tsk); } @@ -1196,6 +1200,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + tsk->prefer_cpus = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2427,6 +2435,14 @@ __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (dynamic_affinity_enabled()) { + retval = sched_prefer_cpus_fork(p, current->prefer_cpus); + if (retval) + goto bad_fork_free; + } +#endif + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5d2dbe74581a..aaa3a598770a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11498,6 +11498,104 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, } #endif /* CONFIG_GROUP_SCHED_WEIGHT */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask) +{ + p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!p->prefer_cpus) + return -ENOMEM; + + if (mask) + cpumask_copy(p->prefer_cpus, mask); + else + cpumask_clear(p->prefer_cpus); + + return 0; +} + +void sched_prefer_cpus_free(struct task_struct *p) +{ + kfree(p->prefer_cpus); +} + +static void do_set_prefer_cpus(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->__lock); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + } + if (running) + put_prev_task(rq, p); + + cpumask_copy(p->prefer_cpus, new_mask); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); +} + +/* + * Change a given task's prefer CPU affinity. Prioritize migrate the thread to + * prefer cpus according to preferred bitmask. 
+ * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq_flags rf; + struct rq *rq; + int ret = 0; + + if (!dynamic_affinity_enabled()) + return -EPERM; + + if (unlikely(!p->prefer_cpus)) + return -EINVAL; + + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + if (cpumask_equal(p->prefer_cpus, new_mask)) + goto out; + + if (!cpumask_subset(new_mask, p->cpus_ptr)) { + ret = -EINVAL; + goto out; + } + + do_set_prefer_cpus(p, new_mask); +out: + task_rq_unlock(rq, p, &rf); + + return ret; +} + +int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class != &fair_sched_class) + return 0; + + return __set_prefer_cpus_ptr(p, new_mask); +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c8199377f302..0238b989fcf2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1082,6 +1082,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_wakeups_affine_attempts); P_SCHEDSTAT(nr_wakeups_passive); P_SCHEDSTAT(nr_wakeups_idle); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (dynamic_affinity_enabled()) { + P_SCHEDSTAT(nr_wakeups_preferred_cpus); + P_SCHEDSTAT(nr_wakeups_force_preferred_cpus); + } +#endif P_SCHEDSTAT(sleep_avg); P_SCHEDSTAT(block_avg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5284e90f42a9..1e78f4944c47 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -164,6 +164,15 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +/* + * Low utilization threshold for CPU + * + * (default: 85%), units: percentage of CPU utilization) + */ +int sysctl_sched_util_low_pct = 85; +#endif + #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { { @@ -183,6 +192,17 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ONE, }, #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "sched_util_low_pct", + .data = &sysctl_sched_util_low_pct, + .maxlen = sizeof(sysctl_sched_util_low_pct), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +#endif #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing_promote_rate_limit_MBps", @@ -7252,7 +7272,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { +#else for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { +#endif struct rq *rq = cpu_rq(i); if (!sched_core_cookie_match(rq, p)) @@ -7299,7 +7323,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) +#else if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) +#endif return prev_cpu; /* @@ -7449,7 +7477,11 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t { int cpu; +#ifdef 
CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(cpu, cpu_smt_mask(target), p->select_cpus) { +#else for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { +#endif if (cpu == target) continue; /* @@ -7503,7 +7535,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain *this_sd = NULL; u64 time = 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); +#else cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +#endif if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; @@ -7700,6 +7736,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled(); if ((available_idle_cpu(target) || sched_idle_cpu(target)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(target, p->select_cpus) && +#endif asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7708,6 +7747,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(prev, p->select_cpus) && +#endif asym_fits_cpu(task_util, util_min, util_max, prev)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -7740,7 +7782,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(recent_used_cpu, p->select_cpus) && +#else cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && +#endif asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -8308,6 +8354,152 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return target; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static inline struct cpumask *task_prefer_cpus(struct task_struct *p) +{ + return p->prefer_cpus; +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + return 0; +} + +DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_switch); + +static int __init dynamic_affinity_switch_setup(char *str) +{ + int ret = 1; + + if (!str) + goto out; + + if (!strcmp(str, "enable")) + static_branch_enable(&__dynamic_affinity_switch); + else if (!strcmp(str, "disable")) + static_branch_disable(&__dynamic_affinity_switch); + else + ret = 0; + +out: + if (!ret) + pr_warn("Unable to parse dynamic_affinity=\n"); + + return ret; +} +__setup("dynamic_affinity=", dynamic_affinity_switch_setup); + +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + if (!dynamic_affinity_enabled()) + return false; + + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && + cpumask_subset(p->prefer_cpus, p->cpus_ptr); +} + +static inline unsigned long taskgroup_cpu_util(struct task_group *tg, + int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tg->se[cpu] && sched_feat(DA_UTIL_TASKGROUP)) + return tg->se[cpu]->avg.util_avg; +#endif + return cpu_util_cfs(cpu); +} + +static unsigned long scale_rt_capacity(int cpu); + +static inline unsigned long calc_cpu_capacity(int cpu) +{ + unsigned long capacity; + + capacity = scale_rt_capacity(cpu); + if (!capacity) + capacity = 1; + + return capacity; +} + +/* + * set_task_select_cpus: 
select the cpu range for task + * @p: the task whose available cpu range will to set + * @idlest_cpu: the cpu which is the idlest in prefer cpus + * + * If sum of 'util_avg' among 'prefer_cpus' lower than the percentage + * 'sysctl_sched_util_low_pct' of 'prefer_cpus' capacity, select + * 'prefer_cpus' range for task, otherwise select 'cpus_ptr' for task. + * + * The available cpu range set to p->select_cpus. Idlest cpu in preferred cpus + * set to @idlest_cpu, which is set to wakeup cpu when fast path wakeup cpu + * without p->select_cpus. + */ +static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, + int sd_flag) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + int nr_cpus_valid = 0; + + p->select_cpus = p->cpus_ptr; + if (!prefer_cpus_valid(p)) + return; + + rcu_read_lock(); + tg = task_group(p); + for_each_cpu_and(cpu, p->prefer_cpus, cpu_online_mask) { + if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(capacity_of(cpu) - + taskgroup_cpu_util(tg, cpu)); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) { + rcu_read_unlock(); + p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->stats.nr_wakeups_preferred_cpus); + return; + } + + util_avg_sum += taskgroup_cpu_util(tg, cpu); + nr_cpus_valid++; + + if (cpu_rq(cpu)->rt.rt_nr_running) + tg_capacity += calc_cpu_capacity(cpu); + else + tg_capacity += capacity_of(cpu); + } + rcu_read_unlock(); + + /* + * Follow cases should select cpus_ptr, checking by condition of + * tg_capacity > nr_cpus_valid: + * 1. all prefer_cpus offline; + * 2. all prefer_cpus has no cfs capaicity(tg_capacity = nr_cpus_valid * 1) + */ + if (tg_capacity > nr_cpus_valid && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->stats.nr_wakeups_preferred_cpus); + } else if (idlest_cpu) { + *idlest_cpu = -1; + } +} +#endif + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the relevant SD flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -8328,11 +8520,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) int want_affine = 0; /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + int idlest_cpu = -1; +#endif /* * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, &idlest_cpu, sd_flag); +#endif + if (wake_flags & WF_TTWU) { record_wakee(p); @@ -8347,7 +8547,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = prev_cpu; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->select_cpus); +#else want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); +#endif } rcu_read_lock(); @@ -8358,7 +8562,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + new_cpu = cpu; + if (cpu != prev_cpu && + cpumask_test_cpu(prev_cpu, p->select_cpus)) +#else if (cpu != prev_cpu) +#endif new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); sd = NULL; /* Prefer wake_affine over balance flags */ @@ -8385,6 +8595,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } rcu_read_unlock(); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (idlest_cpu != -1 && !cpumask_test_cpu(new_cpu, p->select_cpus)) { + new_cpu = idlest_cpu; + schedstat_inc(p->stats.nr_wakeups_force_preferred_cpus); + } +#endif return new_cpu; } @@ -9040,7 +9256,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, NULL, 0); + if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { +#else if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { +#endif int cpu; schedstat_inc(p->stats.nr_failed_migrations_affine); @@ -9063,7 +9284,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -10438,8 +10663,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; /* Skip over this group if it has no CPUs allowed */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_group_span(group), + p->select_cpus)) +#else if (!cpumask_intersects(sched_group_span(group), p->cpus_ptr)) +#endif continue; /* Skip over this group if no cookie matched */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h index a172fe8234fc..0c7f1f4b5559 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -92,3 +92,10 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) SCHED_FEAT(HZ_BW, true) + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +/* + * Use util_avg of bottom-Level taskgroup + */ +SCHED_FEAT(DA_UTIL_TASKGROUP, true) +#endif -- Gitee From ad34b06cc077b2955224ffef8e33907c2d2d0da4 Mon Sep 17 00:00:00 2001 From: jiangheng Date: Thu, 18 Dec 2025 16:47:33 +0800 Subject: [PATCH 2/2] sched/fair: Support NUMA parallel scheduling for multiple processes [commit 
0f22304e62d360fce47988586d599e33895ccb01 openEuler.] For architectures with multiple NUMA node levels and large distances between nodes, a better approach is to support processes running in parallel on each NUMA node. The usage is restricted to the following scenarios: 1. No CPU binding for user-space processes; 2. It is applicable to distributed applications, such as business architectures with one master and multiple slaves running in parallel; 3. The existing "qos dynamic affinity" and "qos smart grid" features must not be used simultaneously. Signed-off-by: jiangheng --- arch/arm64/Kconfig | 1 + arch/arm64/configs/tencent.config | 1 + arch/arm64/include/asm/prefer_numa.h | 9 +++++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/prefer_numa.c | 51 ++++++++++++++++++++++++++++ fs/proc/array.c | 3 -- include/linux/sched.h | 6 ++++ init/Kconfig | 22 ++++++++++++ kernel/cgroup/cpuset.c | 6 ++-- kernel/fork.c | 11 +++--- kernel/sched/core.c | 3 -- kernel/sched/debug.c | 45 +++++++++++++++++++++++- kernel/sched/fair.c | 39 +++++++++++++++++---- kernel/sched/features.h | 4 +++ 14 files changed, 180 insertions(+), 22 deletions(-) create mode 100644 arch/arm64/include/asm/prefer_numa.h create mode 100644 arch/arm64/kernel/prefer_numa.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 56d327490287..16eaafd5e26c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -100,6 +100,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_SCHED_PARAL select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index 029fa71f43b8..285e84dc4e02 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -45,6 +45,7 @@ CONFIG_CGROUP_MISC=y CONFIG_CGROUP_SLI=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y +CONFIG_SCHED_PARAL=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BOOT_CONFIG=y diff --git a/arch/arm64/include/asm/prefer_numa.h b/arch/arm64/include/asm/prefer_numa.h new file mode 100644 index 000000000000..7e579cd9355b --- /dev/null +++ b/arch/arm64/include/asm/prefer_numa.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_PREFER_NUMA_H +#define __ASM_PREFER_NUMA_H + +#include + +void set_task_paral_node(struct task_struct *p); + +#endif /* __ASM_PREFER_NUMA_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 93bbbe12277a..7a86c426ec6e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -77,6 +77,7 @@ obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o CFLAGS_patch-scs.o += -mbranch-protection=none +obj-$(CONFIG_SCHED_PARAL) += prefer_numa.o # Force dependency (vdso*-wrap.S includes vdso.so through incbin) $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so diff --git a/arch/arm64/kernel/prefer_numa.c b/arch/arm64/kernel/prefer_numa.c new file mode 100644 index 000000000000..8dcd6c746df8 --- /dev/null +++ b/arch/arm64/kernel/prefer_numa.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * choose a prefer numa node + * + * Copyright (C) 2025 Huawei Limited. 
+ */ +#include +#include +#include + +static atomic_t paral_nid_last = ATOMIC_INIT(-1); + +int is_prefer_numa(void) +{ + if (num_possible_nodes() <= 1) + return 0; + + return 1; +} + +static inline unsigned int update_sched_paral_nid(void) +{ + return (unsigned int)atomic_inc_return(¶l_nid_last); +} + +void set_task_paral_node(struct task_struct *p) +{ + int nid; + int i = 0; + const cpumask_t *cpus_mask; + + if (is_global_init(current)) + return; + + if (p->flags & PF_KTHREAD || p->tgid != p->pid) + return; + + while (i < nr_node_ids) { + nid = update_sched_paral_nid() % nr_node_ids; + cpus_mask = cpumask_of_node(nid); + + if (cpumask_empty(cpus_mask) || + !cpumask_subset(cpus_mask, p->cpus_ptr)) { + i++; + continue; + } + + cpumask_copy(p->prefer_cpus, cpus_mask); + break; + } +} diff --git a/fs/proc/array.c b/fs/proc/array.c index b3eece840aeb..a083160d0c8d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -481,9 +481,6 @@ int host_pid_info(struct seq_file *m, struct pid_namespace *ns, #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY static void task_cpus_preferred(struct seq_file *m, struct task_struct *task) { - if (!dynamic_affinity_enabled()) - return; - seq_printf(m, "Cpus_preferred:\t%*pb\n", cpumask_pr_args(task->prefer_cpus)); seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", diff --git a/include/linux/sched.h b/include/linux/sched.h index d8bfa6b7f383..f93e3cf8d7f3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2610,6 +2610,12 @@ static inline bool dynamic_affinity_enabled(void) { return static_branch_unlikely(&__dynamic_affinity_switch); } + +#ifdef CONFIG_SCHED_PARAL +bool sched_paral_used(void); +#else +static inline bool sched_paral_used(void) { return false; } +#endif #endif #ifdef CONFIG_DEBUG_RSEQ diff --git a/init/Kconfig b/init/Kconfig index 784021a8e174..b754c65322fa 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1371,6 +1371,28 @@ config NVIDIA_SMI_TRAP Enable trap on NVIDIA SMI calls to solve the problem when acquiring information caused by host pid and guest pid difference. +# +# For architectures that want to enable the support for SCHED_PARAL +# +config ARCH_SUPPORTS_SCHED_PARAL + bool + +config SCHED_PARAL + bool "Parallelly schedule processes on different NUMA nodes" + depends on ARCH_SUPPORTS_SCHED_PARAL + depends on QOS_SCHED_DYNAMIC_AFFINITY + default n + help + By enabling this feature, processes can be scheduled in parallel + on various NUMA nodes to better utilize the cache in NUMA node. + The usage is restricted to the following scenarios: + 1. No CPU binding is performed for user-space processes; + 2. It is applicable to distributed applications, such as business + architectures with one master and multiple slaves running in + parallel; + 3. The existing "qos dynamic affinity" and "qos smart grid" + features must not be used simultaneously. 
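
Because PARAL is wired into the sched_features bitmap rather than a sysctl, it would be toggled at run time through the scheduler features file. The sketch below is illustrative only: the debugfs path is assumed to be the usual location of that file, and the behaviour noted in the comments follows sched_feat_enable_paral()/sched_feat_disable_paral() in the kernel/sched/debug.c hunks further down, where enabling is refused on single-node systems and disabling also clears prefer_cpus for every user thread.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Hypothetical helper: write "PARAL" or "NO_PARAL" to the scheduler
 * features file (path assumed).  Enabling is expected to fail when
 * only one NUMA node is present; disabling clears prefer_cpus for
 * all user tasks.
 */
static int sched_paral_set(int enable)
{
	const char *val = enable ? "PARAL" : "NO_PARAL";
	int fd = open("/sys/kernel/debug/sched/features", O_WRONLY);
	int ret = 0;

	if (fd < 0)
		return -1;

	if (write(fd, val, strlen(val)) < 0)
		ret = -1;

	close(fd);
	return ret;
}

int main(void)
{
	return sched_paral_set(1) ? 1 : 0;
}
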
+ config CHECKPOINT_RESTORE bool "Checkpoint/restore support" depends on PROC_FS diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6b4835a7a9ac..befc6b632ef9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2811,7 +2811,8 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_copy(prefer_cpus_attach, cs->prefer_cpus); - set_prefer_cpus_ptr(task, prefer_cpus_attach); + if (!sched_paral_used() || !cpumask_empty(prefer_cpus_attach)) + set_prefer_cpus_ptr(task, prefer_cpus_attach); #endif cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); @@ -4343,7 +4344,8 @@ static void cpuset_fork(struct task_struct *task) set_cpus_allowed_ptr(task, current->cpus_ptr); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - set_prefer_cpus_ptr(task, current->prefer_cpus); + if (!sched_paral_used() || !cpumask_empty(cs->prefer_cpus)) + set_prefer_cpus_ptr(task, current->prefer_cpus); #endif task->mems_allowed = current->mems_allowed; return; diff --git a/kernel/fork.c b/kernel/fork.c index 83e23d7e8070..13c92d12134d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -661,8 +661,7 @@ void free_task(struct task_struct *tsk) iee_invalidate_token(tsk); #endif #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - if (dynamic_affinity_enabled()) - sched_prefer_cpus_free(tsk); + sched_prefer_cpus_free(tsk); #endif free_task_struct(tsk); } @@ -2436,11 +2435,9 @@ __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - if (dynamic_affinity_enabled()) { - retval = sched_prefer_cpus_fork(p, current->prefer_cpus); - if (retval) - goto bad_fork_free; - } + retval = sched_prefer_cpus_fork(p, current->prefer_cpus); + if (retval) + goto bad_fork_free; #endif lockdep_assert_irqs_enabled(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aaa3a598770a..228d4b3b74c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11563,9 +11563,6 @@ static int __set_prefer_cpus_ptr(struct task_struct *p, struct rq *rq; int ret = 0; - if (!dynamic_affinity_enabled()) - return -EPERM; - if (unlikely(!p->prefer_cpus)) return -EINVAL; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0238b989fcf2..5271ac83e5a2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -7,6 +7,10 @@ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ +#ifdef CONFIG_SCHED_PARAL +#include +#endif + /* * This allows printing both to /proc/sched_debug and * to the console @@ -95,6 +99,41 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* CONFIG_JUMP_LABEL */ +int __weak is_prefer_numa(void) +{ + return 0; +} + +#ifdef CONFIG_SCHED_PARAL +static void sched_feat_disable_paral(char *cmp) +{ + struct task_struct *tsk, *t; + + if (strncmp(cmp, "PARAL", 5) == 0) { + read_lock(&tasklist_lock); + for_each_process(tsk) { + if (tsk->flags & PF_KTHREAD || is_global_init(tsk)) + continue; + + for_each_thread(tsk, t) + cpumask_clear(t->prefer_cpus); + } + read_unlock(&tasklist_lock); + } +} + +static bool sched_feat_enable_paral(char *cmp) +{ + if (strncmp(cmp, "PARAL", 5) != 0) + return true; + + return is_prefer_numa(); +} +#else +static void sched_feat_disable_paral(char *cmp) {}; +static bool sched_feat_enable_paral(char *cmp) { return true; }; +#endif /* CONFIG_SCHED_PARAL */ + static int sched_feat_set(char *cmp) { int i; @@ -111,8 +150,12 @@ static int sched_feat_set(char *cmp) 
if (neg) { sysctl_sched_features &= ~(1UL << i); + sched_feat_disable_paral(cmp); sched_feat_disable(i); } else { + if (!sched_feat_enable_paral(cmp)) + return -EPERM; + sysctl_sched_features |= (1UL << i); sched_feat_enable(i); } @@ -1083,7 +1126,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_wakeups_passive); P_SCHEDSTAT(nr_wakeups_idle); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - if (dynamic_affinity_enabled()) { + if (dynamic_affinity_enabled() || sched_paral_used()) { P_SCHEDSTAT(nr_wakeups_preferred_cpus); P_SCHEDSTAT(nr_wakeups_force_preferred_cpus); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e78f4944c47..fa8c30bb1c38 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -91,6 +91,10 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL; */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#ifdef CONFIG_SCHED_PARAL +#include +#endif + /* * Minimal preemption granularity for CPU-bound tasks: * @@ -8355,6 +8359,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_SCHED_PARAL +bool sched_paral_used(void) +{ + return sched_feat(PARAL); +} +#endif + static inline struct cpumask *task_prefer_cpus(struct task_struct *p) { return p->prefer_cpus; @@ -8391,13 +8402,15 @@ __setup("dynamic_affinity=", dynamic_affinity_switch_setup); static inline bool prefer_cpus_valid(struct task_struct *p) { - if (!dynamic_affinity_enabled()) - return false; + struct cpumask *prefer_cpus = task_prefer_cpus(p); - return p->prefer_cpus && - !cpumask_empty(p->prefer_cpus) && - !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && - cpumask_subset(p->prefer_cpus, p->cpus_ptr); + if (dynamic_affinity_enabled() || sched_paral_used()) { + return !cpumask_empty(prefer_cpus) && + !cpumask_equal(prefer_cpus, p->cpus_ptr) && + cpumask_subset(prefer_cpus, p->cpus_ptr); + } + + return false; } static inline unsigned long taskgroup_cpu_util(struct task_group *tg, @@ -8483,6 +8496,14 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, } rcu_read_unlock(); + /* In extreme cases, it may cause uneven system load. */ + if (sched_paral_used() && sysctl_sched_util_low_pct == 100 && nr_cpus_valid > 0) { + p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->stats.nr_wakeups_preferred_cpus); + return; + } + /* * Follow cases should select cpus_ptr, checking by condition of * tg_capacity > nr_cpus_valid: @@ -12984,6 +13005,12 @@ static void task_fork_fair(struct task_struct *p) if (curr) update_curr(cfs_rq); place_entity(cfs_rq, se, ENQUEUE_INITIAL); + +#ifdef CONFIG_SCHED_PARAL + if (sched_paral_used()) + set_task_paral_node(p); +#endif + rq_unlock(rq, &rf); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 0c7f1f4b5559..0536462f34fa 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -55,6 +55,10 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) +#ifdef CONFIG_SCHED_PARAL +SCHED_FEAT(PARAL, false) +#endif + /* * Issue a WARN when we do multiple update_rq_clock() calls * in a single rq->lock section. Default disabled because the -- Gitee
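
A worked instance of the confinement check used by set_task_select_cpus() in patch 1 may help when tuning sched_util_low_pct. The helper below is a standalone sketch with invented numbers, not code from the series: it only restates the comparison util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct together with the tg_capacity > nr_cpus_valid guard that rules out the all-offline and no-cfs-capacity corner cases.

#include <stdio.h>

/*
 * With the default threshold of 85 and four preferred CPUs of capacity
 * 1024 each, tg_capacity is 4096, so the task stays confined to
 * prefer_cpus while the group's summed util_avg is at most
 * 4096 * 85 / 100 = 3481; beyond that it may use all of cpus_ptr.
 */
static int stay_in_prefer_cpus(unsigned long util_avg_sum,
			       unsigned long tg_capacity,
			       unsigned long nr_cpus_valid,
			       unsigned int util_low_pct)
{
	return tg_capacity > nr_cpus_valid &&
	       util_avg_sum * 100 <= tg_capacity * util_low_pct;
}

int main(void)
{
	/* 4 preferred CPUs x capacity 1024, 85% threshold. */
	printf("%d\n", stay_in_prefer_cpus(3400, 4096, 4, 85)); /* prints 1 */
	printf("%d\n", stay_in_prefer_cpus(3600, 4096, 4, 85)); /* prints 0 */
	return 0;
}
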