From 53ad6bf76d9c646e3c8494ed82d90f304c50de1f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Oct 2022 15:01:55 +0800 Subject: [PATCH 1/7] sched: Add per_cpu cluster domain info and cpus_share_lowest_cache API kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ ---------------------------------------------------------------------- Add per-cpu cluster domain info and cpus_share_lowest_cache() API. This is the preparation for the optimization of select_idle_cpu() on platforms with cluster scheduler level. Tested-by: K Prateek Nayak Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Gautham R. Shenoy Reviewed-by: Tim Chen Reviewed-by: Vincent Guittot Signed-off-by: Jie Liu --- include/linux/sched/sd_flags.h | 7 +++++++ include/linux/sched/topology.h | 8 +++++++- kernel/sched/core.c | 12 ++++++++++++ kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 15 +++++++++++++++ 5 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index 34b21e971d77..c405b3940147 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -99,6 +99,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) */ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +/* + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + /* * Domain members share CPU package resources (i.e. caches) * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 249c98aef083..7c4861d0e647 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -46,7 +46,7 @@ static inline int cpu_smt_flags(void) #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; } #endif @@ -185,6 +185,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_lowest_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -238,6 +239,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +{ + return true; +} + #endif /* !CONFIG_SMP */ #ifndef arch_scale_cpu_capacity diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec25e19f0912..976c9e142d88 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3014,6 +3014,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share lowest cache, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. 
+ */ +bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu); +} + static inline bool ttwu_queue_cond(int cpu, int wake_flags) { /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 32d4775e537d..b48345e28259 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1801,7 +1801,9 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_lowest_cache_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2678e7590cfc..36f053f310c5 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -647,6 +647,8 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_lowest_cache_id); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -682,6 +684,18 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. + */ + per_cpu(sd_lowest_cache_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1406,6 +1420,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) -- Gitee From 0c3a4f986962ed94da6e26ba3ec0bdf700945894 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Oct 2022 15:34:27 +0800 Subject: [PATCH 2/7] sched/fair: Scan cluster before scanning LLC in wake-up path kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ ---------------------------------------------------------------------- For platforms having clusters like Kunpeng920, CPUs within the same cluster have lower latency when synchronizing and accessing shared resources like cache. Thus, this patch tries to find an idle cpu within the cluster of the target CPU before scanning the whole LLC to gain lower latency. Testing has been done on Kunpeng920 by pinning tasks to one numa and two numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs. With this patch, We noticed enhancement on tbench within one numa or cross two numa. 
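The cluster layout this relies on can be double-checked from userspace before reading the tbench numbers below. The following is only an illustrative sketch, not part of the patch: it assumes the kernel exposes the cluster_cpus_list topology attribute in sysfs, which the cluster topology support in this tree is expected to provide.

/* Illustrative only: print the CPUs sharing a cluster with a given CPU. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	int cpu = (argc > 1) ? atoi(argv[1]) : 0;
	char path[128], buf[256];
	FILE *f;

	/* Assumed attribute; present when the kernel carries cluster topology support. */
	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/topology/cluster_cpus_list", cpu);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("CPUs in the same cluster as CPU%d: %s", cpu, buf);
	fclose(f);
	return 0;
}

On a Kunpeng920 this is expected to report four CPUs per cluster, matching the topology described above.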
On numa 0: 6.0-rc1 patched Hmean 1 351.20 ( 0.00%) 396.45 * 12.88%* Hmean 2 700.43 ( 0.00%) 793.76 * 13.32%* Hmean 4 1404.42 ( 0.00%) 1583.62 * 12.76%* Hmean 8 2833.31 ( 0.00%) 3147.85 * 11.10%* Hmean 16 5501.90 ( 0.00%) 6089.89 * 10.69%* Hmean 32 10428.59 ( 0.00%) 10619.63 * 1.83%* Hmean 64 8223.39 ( 0.00%) 8306.93 * 1.02%* Hmean 128 7042.88 ( 0.00%) 7068.03 * 0.36%* On numa 0-1: 6.0-rc1 patched Hmean 1 363.06 ( 0.00%) 397.13 * 9.38%* Hmean 2 721.68 ( 0.00%) 789.84 * 9.44%* Hmean 4 1435.15 ( 0.00%) 1566.01 * 9.12%* Hmean 8 2776.17 ( 0.00%) 3007.05 * 8.32%* Hmean 16 5471.71 ( 0.00%) 6103.91 * 11.55%* Hmean 32 10164.98 ( 0.00%) 11531.81 * 13.45%* Hmean 64 17143.28 ( 0.00%) 20078.68 * 17.12%* Hmean 128 14552.70 ( 0.00%) 15156.41 * 4.15%* Hmean 256 12827.37 ( 0.00%) 13326.86 * 3.89%* Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch in the code has not been tested but it supposed to work. Suggested-by: Peter Zijlstra [https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net] Tested-by: Yicong Yang Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Tim Chen Reviewed-by: Chen Yu Signed-off-by: Jie Liu --- kernel/sched/fair.c | 30 +++++++++++++++++++++++++++--- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 11 +++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c2c1f8f6c12d..a28eb51b3c72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6370,6 +6370,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); + + if (sdc) { + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); + } + } + for_each_cpu_wrap(cpu, cpus, target) { if (smt) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -6377,7 +6401,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -6487,7 +6511,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && + if (prev != target && cpus_share_lowest_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(prev, p->select_cpus) && @@ -6518,7 +6542,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && - cpus_share_cache(recent_used_cpu, target) && + cpus_share_lowest_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b48345e28259..b742d1df609b 100644 --- 
a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1808,6 +1808,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; struct sched_group_capacity { atomic_t ref; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 36f053f310c5..5472674eb1f6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -653,7 +653,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -2200,6 +2202,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2227,6 +2230,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + has_cluster |= sd->flags & SD_CLUSTER; + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) @@ -2286,6 +2291,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2385,6 +2393,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (rcu_access_pointer(per_cpu(sd_cluster, cpu))) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); -- Gitee From 211b6fb7d5a8558a453475a08a697e651ca2d0cb Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:38 -0800 Subject: [PATCH 3/7] scheduler: Create SDTL_SKIP flag to skip topology level kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- A system admin may not want to use cluster scheduling. Make changes to allow cluster topology level to be skipped when building sched domains. Create SDTL_SKIP bit on the sched_domain_topology_level flag so we can check if the cluster topology level should be skipped when building sched domains. 
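The resulting iteration change is small: the walker over the topology table simply steps over any level whose flags contain SDTL_SKIP. Below is a minimal userspace sketch of the same pattern, using toy names rather than the kernel's; the real change to for_each_sd_topology() is in the diff that follows.

/* Toy model: walk a NULL-terminated level table, skipping flagged entries. */
#include <stdio.h>

#define TL_SKIP	0x02			/* stand-in for SDTL_SKIP */

struct level {
	const char *name;		/* NULL name terminates the table */
	int flags;
};

static struct level *next_level(struct level *tl)
{
	++tl;
	while (tl->name && (tl->flags & TL_SKIP))
		++tl;
	return tl;
}

#define for_each_level(tl, table) \
	for ((tl) = (table); (tl)->name; (tl) = next_level(tl))

int main(void)
{
	struct level table[] = {
		{ "SMT", 0 },
		{ "CLS", TL_SKIP },	/* cluster level marked to be skipped */
		{ "MC",  0 },
		{ NULL,  0 },
	};
	struct level *tl;

	for_each_level(tl, table)
		printf("building sched domains for level %s\n", tl->name);
	return 0;
}

With the CLS entry flagged, only the SMT and MC levels are visited, which is how the cluster level is elided without removing it from the topology table.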
Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- include/linux/sched/topology.h | 1 + kernel/sched/topology.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7c4861d0e647..9680886277c9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -191,6 +191,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); #define SDTL_OVERLAP 0x01 +#define SDTL_SKIP 0x02 struct sd_data { struct sched_domain *__percpu *sd; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5472674eb1f6..b10ad8dc9045 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1557,8 +1557,16 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) +{ + ++tl; + while (tl->mask && tl->flags & SDTL_SKIP) + ++tl; + return tl; +} + #define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) + for (tl = sched_domain_topology; tl->mask; tl = next_tl(tl)) void set_sched_topology(struct sched_domain_topology_level *tl) { -- Gitee From 8ce3e706b31409147f035c037055caa68e450ce5 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:40 -0800 Subject: [PATCH 4/7] scheduler: Add runtime knob sysctl_sched_cluster kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- Allow run time configuration of the scheduler to use cluster scheduling. Configuration can be changed via the sysctl variable /proc/sys/kernel/sched_cluster. Setting it to 1 enable cluster scheduling and setting it to 0 turns it off. Cluster scheduling should benefit independent tasks by load balancing them between clusters. It reaps the most benefit when the system's CPUs are not fully busy, so we can spread the tasks out between the clusters to reduce contention on cluster resource (e.g. L2 cache). However, if the system is expected to operate close to full utilization, the system admin could turn this feature off so as not to incur extra load balancing overhead between the cluster domains. 
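For illustration, the knob is driven like any other sysctl. The sketch below is not part of the patch, but the /proc/sys/kernel/sched_cluster path and the 0/1 semantics come straight from it; writing requires privilege, as the handler added below checks CAP_SYS_ADMIN.

/* Read kernel.sched_cluster, then enable cluster scheduling. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_cluster";
	int cur = -1;
	FILE *f;

	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%d", &cur) != 1)
			cur = -1;
		fclose(f);
	}
	printf("sched_cluster is currently %d\n", cur);

	f = fopen(path, "w");
	if (!f) {
		perror(path);	/* needs root; the handler also checks CAP_SYS_ADMIN */
		return 1;
	}
	fputs("1\n", f);	/* 1 = enable, 0 = disable */
	fclose(f);
	return 0;
}

A write that changes the value triggers set_sched_cluster() and a rebuild of the sched domains, so the effect is immediate rather than deferred to the next boot.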
Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- arch/x86/kernel/smpboot.c | 8 +++++ drivers/base/arch_topology.c | 13 +++++--- include/linux/sched/sysctl.h | 6 ++++ include/linux/topology.h | 1 + kernel/sched/core.c | 1 + kernel/sched/sched.h | 6 ++++ kernel/sched/topology.c | 64 ++++++++++++++++++++++++++++++++++++ 7 files changed, 95 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b4911aa7b2d5..b3d1e449702c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -122,6 +123,13 @@ int arch_update_cpu_topology(void) return retval; } +void arch_rebuild_cpu_topology(void) +{ + x86_topology_update = true; + rebuild_sched_domains(); + x86_topology_update = false; +} + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 802077cd1ab9..aaf969004275 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -114,16 +114,21 @@ int topology_update_cpu_topology(void) return update_topology; } +void __weak arch_rebuild_cpu_topology(void) +{ + update_topology = 1; + rebuild_sched_domains(); + pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); + update_topology = 0; +} + /* * Updating the sched_domains can't be done directly from cpufreq callbacks * due to locking, so queue the work for later. */ static void update_topology_flags_workfn(struct work_struct *work) { - update_topology = 1; - rebuild_sched_domains(); - pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); - update_topology = 0; + arch_rebuild_cpu_topology(); } static DEFINE_PER_CPU(u32, freq_factor) = 1; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0acfc0cb0456..933ffee18b4b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -111,4 +111,10 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_CLUSTER +extern unsigned int sysctl_sched_cluster; +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 0b3704ad13c8..42bcfd5d9fdb 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -44,6 +44,7 @@ if (nr_cpus_node(node)) int arch_update_cpu_topology(void); +void arch_rebuild_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 976c9e142d88..d06ac41204c6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8097,6 +8097,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { sched_init_numa(); + set_sched_cluster(); /* * There's no userspace yet to cause hotplug operations; hence all the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b742d1df609b..6bb11d71caff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1685,6 +1685,12 @@ this_rq_lock_irq(struct rq_flags *rf) return rq; } +#ifdef CONFIG_SCHED_CLUSTER +extern void set_sched_cluster(void); +#else +static inline void set_sched_cluster(void) { } +#endif + #ifdef CONFIG_NUMA #ifdef CONFIG_SCHED_STEAL extern struct static_key_true sched_steal_allow; diff --git a/kernel/sched/topology.c 
b/kernel/sched/topology.c index b10ad8dc9045..c3c2cfcca3a0 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1557,6 +1557,70 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#ifdef CONFIG_SCHED_CLUSTER +void set_sched_cluster(void) +{ + struct sched_domain_topology_level *tl; + + for (tl = sched_domain_topology; tl->mask; tl++) { + if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) { + if (!sysctl_sched_cluster) + tl->flags |= SDTL_SKIP; + else + tl->flags &= ~SDTL_SKIP; + break; + } + } +} + +/* set via /proc/sys/kernel/sched_cluster */ +unsigned int __read_mostly sysctl_sched_cluster = 1; + +static DEFINE_MUTEX(sched_cluster_mutex); +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + unsigned int oldval; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sched_cluster_mutex); + oldval = sysctl_sched_cluster; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + if (oldval != sysctl_sched_cluster) { + set_sched_cluster(); + arch_rebuild_cpu_topology(); + } + } + mutex_unlock(&sched_cluster_mutex); + + return ret; +} + +static struct ctl_table sched_cluster_sysctls[] = { + { + .procname = "sched_cluster", + .data = &sysctl_sched_cluster, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_cluster_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_cluster_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_cluster_sysctls); + return 0; +} +late_initcall(sched_cluster_sysctl_init); +#endif + static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) { ++tl; -- Gitee From 9e68cc2bf535a2f4e3c33e7e53bbb15815b703c4 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:41 -0800 Subject: [PATCH 5/7] scheduler: Add boot time enabling/disabling of cluster scheduling kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- Add boot time parameter sched_cluster to enable or disable cluster scheduling. Set boot parameter as follow: sched_cluster=0 disables cluster scheduling sched_cluster=1 enables cluster scheduling Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/sched/topology.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8a1a25216da6..786bb3f02c61 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4978,6 +4978,10 @@ sched_debug [KNL] Enables verbose scheduler debug messages. + sched_cluster= Enable or disable cluster scheduling. + 0 -- disable. + 1 -- enable. + schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. 
This feature incurs a small amount of overhead in the scheduler diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c3c2cfcca3a0..2f8e18792352 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1619,6 +1619,22 @@ static int __init sched_cluster_sysctl_init(void) return 0; } late_initcall(sched_cluster_sysctl_init); + +static int __init sched_cluster_option(char *str) +{ + int enable; + + if (get_option(&str, &enable)) { + if (enable != 0 && enable != 1) + return -EINVAL; + + sysctl_sched_cluster = enable; + return 0; + } + + return -EINVAL; +} +early_param("sched_cluster", sched_cluster_option); #endif static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) -- Gitee From 6afb257d6dd71085344e1472ea6e820b5dc0a8e3 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 13 Feb 2023 10:48:54 +0800 Subject: [PATCH 6/7] scheduler: Disable cluster scheduling by default kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA ---------------------------------------------------------------------- Disable cluster scheduling by default since it's not a universal win. User can choose to enable it through sysctl or at boot time according to their scenario. Signed-off-by: Yicong Yang Signed-off-by: Jie Liu --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2f8e18792352..3208293d68fa 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1574,7 +1574,7 @@ void set_sched_cluster(void) } /* set via /proc/sys/kernel/sched_cluster */ -unsigned int __read_mostly sysctl_sched_cluster = 1; +unsigned int __read_mostly sysctl_sched_cluster; static DEFINE_MUTEX(sched_cluster_mutex); int sched_cluster_handler(struct ctl_table *table, int write, -- Gitee From aff649361671b432570e94c9056932f50dd6f101 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Mon, 24 Oct 2022 09:34:57 +0800 Subject: [PATCH 7/7] sched:Open the kernel configuration for cluster. kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA ---------------------------------------------------------------------- In the past configuration, CONFIG_SCHED_CLUSTER was not set. Now, we need to open the configuration. Signed-off-by: Jie Liu --- arch/arm64/configs/openeuler_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index f6cc13c8971d..02f5281eb018 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -401,7 +401,7 @@ CONFIG_ARM64_PA_BITS=48 # CONFIG_CPU_BIG_ENDIAN is not set CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_SCHED_MC=y -# CONFIG_SCHED_CLUSTER is not set +CONFIG_SCHED_CLUSTER=y CONFIG_SCHED_SMT=y CONFIG_NR_CPUS=4096 CONFIG_HOTPLUG_CPU=y -- Gitee