From 53ad6bf76d9c646e3c8494ed82d90f304c50de1f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Oct 2022 15:01:55 +0800 Subject: [PATCH 1/7] sched: Add per_cpu cluster domain info and cpus_share_lowest_cache API kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ ---------------------------------------------------------------------- Add per-cpu cluster domain info and cpus_share_lowest_cache() API. This is the preparation for the optimization of select_idle_cpu() on platforms with cluster scheduler level. Tested-by: K Prateek Nayak Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Gautham R. Shenoy Reviewed-by: Tim Chen Reviewed-by: Vincent Guittot Signed-off-by: Jie Liu --- include/linux/sched/sd_flags.h | 7 +++++++ include/linux/sched/topology.h | 8 +++++++- kernel/sched/core.c | 12 ++++++++++++ kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 15 +++++++++++++++ 5 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index 34b21e971d77..c405b3940147 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -99,6 +99,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) */ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +/* + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + /* * Domain members share CPU package resources (i.e. caches) * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 249c98aef083..7c4861d0e647 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -46,7 +46,7 @@ static inline int cpu_smt_flags(void) #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; } #endif @@ -185,6 +185,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_lowest_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -238,6 +239,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +{ + return true; +} + #endif /* !CONFIG_SMP */ #ifndef arch_scale_cpu_capacity diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec25e19f0912..976c9e142d88 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3014,6 +3014,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share lowest cache, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. 
+ */ +bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu); +} + static inline bool ttwu_queue_cond(int cpu, int wake_flags) { /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 32d4775e537d..b48345e28259 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1801,7 +1801,9 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_lowest_cache_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2678e7590cfc..36f053f310c5 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -647,6 +647,8 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_lowest_cache_id); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -682,6 +684,18 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. + */ + per_cpu(sd_lowest_cache_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1406,6 +1420,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) -- Gitee From 0c3a4f986962ed94da6e26ba3ec0bdf700945894 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Oct 2022 15:34:27 +0800 Subject: [PATCH 2/7] sched/fair: Scan cluster before scanning LLC in wake-up path kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ ---------------------------------------------------------------------- For platforms having clusters like Kunpeng920, CPUs within the same cluster have lower latency when synchronizing and accessing shared resources like cache. Thus, this patch tries to find an idle cpu within the cluster of the target CPU before scanning the whole LLC to gain lower latency. Testing has been done on Kunpeng920 by pinning tasks to one numa and two numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs. With this patch, We noticed enhancement on tbench within one numa or cross two numa. 
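The cluster layout this relies on can be double-checked from userspace before reading the tbench numbers below. The following is only an illustrative sketch, not part of the patch: it assumes the kernel exposes the cluster_cpus_list topology attribute in sysfs, which the cluster topology support in this tree is expected to provide.

/* Illustrative only: print the CPUs sharing a cluster with a given CPU. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	int cpu = (argc > 1) ? atoi(argv[1]) : 0;
	char path[128], buf[256];
	FILE *f;

	/* Assumed attribute; present when the kernel carries cluster topology support. */
	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/topology/cluster_cpus_list", cpu);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("CPUs in the same cluster as CPU%d: %s", cpu, buf);
	fclose(f);
	return 0;
}

On a Kunpeng920 this is expected to report four CPUs per cluster, matching the topology described above.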
On numa 0: 6.0-rc1 patched Hmean 1 351.20 ( 0.00%) 396.45 * 12.88%* Hmean 2 700.43 ( 0.00%) 793.76 * 13.32%* Hmean 4 1404.42 ( 0.00%) 1583.62 * 12.76%* Hmean 8 2833.31 ( 0.00%) 3147.85 * 11.10%* Hmean 16 5501.90 ( 0.00%) 6089.89 * 10.69%* Hmean 32 10428.59 ( 0.00%) 10619.63 * 1.83%* Hmean 64 8223.39 ( 0.00%) 8306.93 * 1.02%* Hmean 128 7042.88 ( 0.00%) 7068.03 * 0.36%* On numa 0-1: 6.0-rc1 patched Hmean 1 363.06 ( 0.00%) 397.13 * 9.38%* Hmean 2 721.68 ( 0.00%) 789.84 * 9.44%* Hmean 4 1435.15 ( 0.00%) 1566.01 * 9.12%* Hmean 8 2776.17 ( 0.00%) 3007.05 * 8.32%* Hmean 16 5471.71 ( 0.00%) 6103.91 * 11.55%* Hmean 32 10164.98 ( 0.00%) 11531.81 * 13.45%* Hmean 64 17143.28 ( 0.00%) 20078.68 * 17.12%* Hmean 128 14552.70 ( 0.00%) 15156.41 * 4.15%* Hmean 256 12827.37 ( 0.00%) 13326.86 * 3.89%* Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch in the code has not been tested but it supposed to work. Suggested-by: Peter Zijlstra [https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net] Tested-by: Yicong Yang Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Tim Chen Reviewed-by: Chen Yu Signed-off-by: Jie Liu --- kernel/sched/fair.c | 30 +++++++++++++++++++++++++++--- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 11 +++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c2c1f8f6c12d..a28eb51b3c72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6370,6 +6370,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); + + if (sdc) { + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); + } + } + for_each_cpu_wrap(cpu, cpus, target) { if (smt) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -6377,7 +6401,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -6487,7 +6511,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && + if (prev != target && cpus_share_lowest_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(prev, p->select_cpus) && @@ -6518,7 +6542,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && - cpus_share_cache(recent_used_cpu, target) && + cpus_share_lowest_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b48345e28259..b742d1df609b 100644 --- 
a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1808,6 +1808,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; struct sched_group_capacity { atomic_t ref; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 36f053f310c5..5472674eb1f6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -653,7 +653,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -2200,6 +2202,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2227,6 +2230,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + has_cluster |= sd->flags & SD_CLUSTER; + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) @@ -2286,6 +2291,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2385,6 +2393,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (rcu_access_pointer(per_cpu(sd_cluster, cpu))) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); -- Gitee From 211b6fb7d5a8558a453475a08a697e651ca2d0cb Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:38 -0800 Subject: [PATCH 3/7] scheduler: Create SDTL_SKIP flag to skip topology level kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- A system admin may not want to use cluster scheduling. Make changes to allow cluster topology level to be skipped when building sched domains. Create SDTL_SKIP bit on the sched_domain_topology_level flag so we can check if the cluster topology level should be skipped when building sched domains. 
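The resulting iteration change is small: the walker over the topology table simply steps over any level whose flags contain SDTL_SKIP. Below is a minimal userspace sketch of the same pattern, using toy names rather than the kernel's; the real change to for_each_sd_topology() is in the diff that follows.

/* Toy model: walk a NULL-terminated level table, skipping flagged entries. */
#include <stdio.h>

#define TL_SKIP	0x02			/* stand-in for SDTL_SKIP */

struct level {
	const char *name;		/* NULL name terminates the table */
	int flags;
};

static struct level *next_level(struct level *tl)
{
	++tl;
	while (tl->name && (tl->flags & TL_SKIP))
		++tl;
	return tl;
}

#define for_each_level(tl, table) \
	for ((tl) = (table); (tl)->name; (tl) = next_level(tl))

int main(void)
{
	struct level table[] = {
		{ "SMT", 0 },
		{ "CLS", TL_SKIP },	/* cluster level marked to be skipped */
		{ "MC",  0 },
		{ NULL,  0 },
	};
	struct level *tl;

	for_each_level(tl, table)
		printf("building sched domains for level %s\n", tl->name);
	return 0;
}

With the CLS entry flagged, only the SMT and MC levels are visited, which is how the cluster level is elided without removing it from the topology table.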
Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- include/linux/sched/topology.h | 1 + kernel/sched/topology.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7c4861d0e647..9680886277c9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -191,6 +191,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); #define SDTL_OVERLAP 0x01 +#define SDTL_SKIP 0x02 struct sd_data { struct sched_domain *__percpu *sd; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5472674eb1f6..b10ad8dc9045 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1557,8 +1557,16 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) +{ + ++tl; + while (tl->mask && tl->flags & SDTL_SKIP) + ++tl; + return tl; +} + #define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) + for (tl = sched_domain_topology; tl->mask; tl = next_tl(tl)) void set_sched_topology(struct sched_domain_topology_level *tl) { -- Gitee From 8ce3e706b31409147f035c037055caa68e450ce5 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:40 -0800 Subject: [PATCH 4/7] scheduler: Add runtime knob sysctl_sched_cluster kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- Allow run time configuration of the scheduler to use cluster scheduling. Configuration can be changed via the sysctl variable /proc/sys/kernel/sched_cluster. Setting it to 1 enable cluster scheduling and setting it to 0 turns it off. Cluster scheduling should benefit independent tasks by load balancing them between clusters. It reaps the most benefit when the system's CPUs are not fully busy, so we can spread the tasks out between the clusters to reduce contention on cluster resource (e.g. L2 cache). However, if the system is expected to operate close to full utilization, the system admin could turn this feature off so as not to incur extra load balancing overhead between the cluster domains. 
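For illustration, the knob is driven like any other sysctl. The sketch below is not part of the patch, but the /proc/sys/kernel/sched_cluster path and the 0/1 semantics come straight from it; writing requires privilege, as the handler added below checks CAP_SYS_ADMIN.

/* Read kernel.sched_cluster, then enable cluster scheduling. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_cluster";
	int cur = -1;
	FILE *f;

	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%d", &cur) != 1)
			cur = -1;
		fclose(f);
	}
	printf("sched_cluster is currently %d\n", cur);

	f = fopen(path, "w");
	if (!f) {
		perror(path);	/* needs root; the handler also checks CAP_SYS_ADMIN */
		return 1;
	}
	fputs("1\n", f);	/* 1 = enable, 0 = disable */
	fclose(f);
	return 0;
}

A write that changes the value triggers set_sched_cluster() and a rebuild of the sched domains, so the effect is immediate rather than deferred to the next boot.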
Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- arch/x86/kernel/smpboot.c | 8 +++++ drivers/base/arch_topology.c | 13 +++++--- include/linux/sched/sysctl.h | 6 ++++ include/linux/topology.h | 1 + kernel/sched/core.c | 1 + kernel/sched/sched.h | 6 ++++ kernel/sched/topology.c | 64 ++++++++++++++++++++++++++++++++++++ 7 files changed, 95 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b4911aa7b2d5..b3d1e449702c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -122,6 +123,13 @@ int arch_update_cpu_topology(void) return retval; } +void arch_rebuild_cpu_topology(void) +{ + x86_topology_update = true; + rebuild_sched_domains(); + x86_topology_update = false; +} + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 802077cd1ab9..aaf969004275 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -114,16 +114,21 @@ int topology_update_cpu_topology(void) return update_topology; } +void __weak arch_rebuild_cpu_topology(void) +{ + update_topology = 1; + rebuild_sched_domains(); + pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); + update_topology = 0; +} + /* * Updating the sched_domains can't be done directly from cpufreq callbacks * due to locking, so queue the work for later. */ static void update_topology_flags_workfn(struct work_struct *work) { - update_topology = 1; - rebuild_sched_domains(); - pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); - update_topology = 0; + arch_rebuild_cpu_topology(); } static DEFINE_PER_CPU(u32, freq_factor) = 1; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0acfc0cb0456..933ffee18b4b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -111,4 +111,10 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_CLUSTER +extern unsigned int sysctl_sched_cluster; +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 0b3704ad13c8..42bcfd5d9fdb 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -44,6 +44,7 @@ if (nr_cpus_node(node)) int arch_update_cpu_topology(void); +void arch_rebuild_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 976c9e142d88..d06ac41204c6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8097,6 +8097,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { sched_init_numa(); + set_sched_cluster(); /* * There's no userspace yet to cause hotplug operations; hence all the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b742d1df609b..6bb11d71caff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1685,6 +1685,12 @@ this_rq_lock_irq(struct rq_flags *rf) return rq; } +#ifdef CONFIG_SCHED_CLUSTER +extern void set_sched_cluster(void); +#else +static inline void set_sched_cluster(void) { } +#endif + #ifdef CONFIG_NUMA #ifdef CONFIG_SCHED_STEAL extern struct static_key_true sched_steal_allow; diff --git a/kernel/sched/topology.c 
b/kernel/sched/topology.c index b10ad8dc9045..c3c2cfcca3a0 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1557,6 +1557,70 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#ifdef CONFIG_SCHED_CLUSTER +void set_sched_cluster(void) +{ + struct sched_domain_topology_level *tl; + + for (tl = sched_domain_topology; tl->mask; tl++) { + if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) { + if (!sysctl_sched_cluster) + tl->flags |= SDTL_SKIP; + else + tl->flags &= ~SDTL_SKIP; + break; + } + } +} + +/* set via /proc/sys/kernel/sched_cluster */ +unsigned int __read_mostly sysctl_sched_cluster = 1; + +static DEFINE_MUTEX(sched_cluster_mutex); +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + unsigned int oldval; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sched_cluster_mutex); + oldval = sysctl_sched_cluster; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + if (oldval != sysctl_sched_cluster) { + set_sched_cluster(); + arch_rebuild_cpu_topology(); + } + } + mutex_unlock(&sched_cluster_mutex); + + return ret; +} + +static struct ctl_table sched_cluster_sysctls[] = { + { + .procname = "sched_cluster", + .data = &sysctl_sched_cluster, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_cluster_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_cluster_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_cluster_sysctls); + return 0; +} +late_initcall(sched_cluster_sysctl_init); +#endif + static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) { ++tl; -- Gitee From 9e68cc2bf535a2f4e3c33e7e53bbb15815b703c4 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:41 -0800 Subject: [PATCH 5/7] scheduler: Add boot time enabling/disabling of cluster scheduling kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- Add boot time parameter sched_cluster to enable or disable cluster scheduling. Set boot parameter as follow: sched_cluster=0 disables cluster scheduling sched_cluster=1 enables cluster scheduling Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/sched/topology.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8a1a25216da6..786bb3f02c61 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4978,6 +4978,10 @@ sched_debug [KNL] Enables verbose scheduler debug messages. + sched_cluster= Enable or disable cluster scheduling. + 0 -- disable. + 1 -- enable. + schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. 
This feature incurs a small amount of overhead in the scheduler diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c3c2cfcca3a0..2f8e18792352 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1619,6 +1619,22 @@ static int __init sched_cluster_sysctl_init(void) return 0; } late_initcall(sched_cluster_sysctl_init); + +static int __init sched_cluster_option(char *str) +{ + int enable; + + if (get_option(&str, &enable)) { + if (enable != 0 && enable != 1) + return -EINVAL; + + sysctl_sched_cluster = enable; + return 0; + } + + return -EINVAL; +} +early_param("sched_cluster", sched_cluster_option); #endif static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) -- Gitee From 6afb257d6dd71085344e1472ea6e820b5dc0a8e3 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 13 Feb 2023 10:48:54 +0800 Subject: [PATCH 6/7] scheduler: Disable cluster scheduling by default kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA ---------------------------------------------------------------------- Disable cluster scheduling by default since it's not a universal win. User can choose to enable it through sysctl or at boot time according to their scenario. Signed-off-by: Yicong Yang Signed-off-by: Jie Liu --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2f8e18792352..3208293d68fa 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1574,7 +1574,7 @@ void set_sched_cluster(void) } /* set via /proc/sys/kernel/sched_cluster */ -unsigned int __read_mostly sysctl_sched_cluster = 1; +unsigned int __read_mostly sysctl_sched_cluster; static DEFINE_MUTEX(sched_cluster_mutex); int sched_cluster_handler(struct ctl_table *table, int write, -- Gitee From aff649361671b432570e94c9056932f50dd6f101 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Mon, 24 Oct 2022 09:34:57 +0800 Subject: [PATCH 7/7] sched:Open the kernel configuration for cluster. kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S CVE: NA ---------------------------------------------------------------------- In the past configuration, CONFIG_SCHED_CLUSTER was not set. Now, we need to open the configuration. Signed-off-by: Jie Liu --- arch/arm64/configs/openeuler_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index f6cc13c8971d..02f5281eb018 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -401,7 +401,7 @@ CONFIG_ARM64_PA_BITS=48 # CONFIG_CPU_BIG_ENDIAN is not set CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_SCHED_MC=y -# CONFIG_SCHED_CLUSTER is not set +CONFIG_SCHED_CLUSTER=y CONFIG_SCHED_SMT=y CONFIG_NR_CPUS=4096 CONFIG_HOTPLUG_CPU=y -- Gitee