diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8a1a25216da61907fad14864d61ada0646eab7f4..786bb3f02c610dc25b46941fcd916aaa2ef49e37 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4978,6 +4978,10 @@
 
 	sched_debug	[KNL] Enables verbose scheduler debug messages.
 
+	sched_cluster=	Enable or disable cluster scheduling.
+			0 -- disable.
+			1 -- enable.
+
 	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
 			Allowed values are enable and disable. This feature
 			incurs a small amount of overhead in the scheduler
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index f6cc13c8971d03274edd533af7c1850d6cac9c59..02f5281eb0184aca38bfb34a818746d2ae84bb8d 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -401,7 +401,7 @@ CONFIG_ARM64_PA_BITS=48
 # CONFIG_CPU_BIG_ENDIAN is not set
 CONFIG_CPU_LITTLE_ENDIAN=y
 CONFIG_SCHED_MC=y
-# CONFIG_SCHED_CLUSTER is not set
+CONFIG_SCHED_CLUSTER=y
 CONFIG_SCHED_SMT=y
 CONFIG_NR_CPUS=4096
 CONFIG_HOTPLUG_CPU=y
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b4911aa7b2d594e571f9ea1f90ef6e9b3054e79c..b3d1e449702c332e1d653b8b0a3c2105ca5b0d15 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -122,6 +123,13 @@ int arch_update_cpu_topology(void)
 	return retval;
 }
 
+void arch_rebuild_cpu_topology(void)
+{
+	x86_topology_update = true;
+	rebuild_sched_domains();
+	x86_topology_update = false;
+}
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 802077cd1ab9b9ce2345cab813dd4adcea507ddd..aaf96900427593fd05935c80de19b11e84b04cd9 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -114,16 +114,21 @@ int topology_update_cpu_topology(void)
 	return update_topology;
 }
 
+void __weak arch_rebuild_cpu_topology(void)
+{
+	update_topology = 1;
+	rebuild_sched_domains();
+	pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
+	update_topology = 0;
+}
+
 /*
  * Updating the sched_domains can't be done directly from cpufreq callbacks
  * due to locking, so queue the work for later.
  */
 static void update_topology_flags_workfn(struct work_struct *work)
 {
-	update_topology = 1;
-	rebuild_sched_domains();
-	pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
-	update_topology = 0;
+	arch_rebuild_cpu_topology();
 }
 
 static DEFINE_PER_CPU(u32, freq_factor) = 1;
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 34b21e971d77bfb0e96667dcca0e14e173f01593..c405b3940147def118c7e664a4664c4c70a67c83 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -99,6 +99,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU package resources (i.e. caches)
  *
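For reference only (not part of this patch): the new SD_CLUSTER flag is returned by cpu_cluster_flags() (see the include/linux/sched/topology.h hunk below) and is intended for a CLS topology level sitting between SMT and MC. A minimal sketch of such a table, mirroring the shape of the kernel's default_topology[] and assuming the architecture provides cpu_clustergroup_mask() (as arm64 does through the generic arch topology code):

/* Illustrative sketch only -- not part of this patch. */
static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_CLUSTER
	/* cpu_cluster_flags() now returns SD_CLUSTER | SD_SHARE_PKG_RESOURCES */
	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

An architecture installs such a table with set_sched_topology(); set_sched_cluster(), added in kernel/sched/topology.c below, then marks the CLS level with SDTL_SKIP when sysctl_sched_cluster is 0 so that for_each_sd_topology() skips it.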
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 0acfc0cb045630ab4a408aec52d43fa32146c53f..933ffee18b4bfacc32496d99ace9aeeeb8423636 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -111,4 +111,10 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_SCHED_CLUSTER
+extern unsigned int sysctl_sched_cluster;
+int sched_cluster_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 249c98aef083dbc4ae714624ec0c593e980ea82d..9680886277c97ce4e8e4fafc752b61d3a3995a86 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -46,7 +46,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -185,11 +185,13 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_lowest_cache(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
 
 #define SDTL_OVERLAP	0x01
+#define SDTL_SKIP	0x02
 
 struct sd_data {
 	struct sched_domain *__percpu *sd;
@@ -238,6 +240,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif /* !CONFIG_SMP */
 
 #ifndef arch_scale_cpu_capacity
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0b3704ad13c8861fab27eea7c863f9b3f69ffdbf..42bcfd5d9fdb0db59eacdcfc85d73d826d0daa86 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -44,6 +44,7 @@
 		if (nr_cpus_node(node))
 
 int arch_update_cpu_topology(void);
+void arch_rebuild_cpu_topology(void);
 
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE	10
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec25e19f0912c76bc83d3ceac5bd7becb220bc7c..d06ac41204c63a461c1c7c4957ff8ba9c93f23c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3014,6 +3014,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
+/*
+ * Whether the CPUs share the lowest cache: the LLC on non-cluster
+ * machines, and the LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_lowest_cache(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu);
+}
+
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 {
 	/*
@@ -8085,6 +8097,7 @@ int sched_cpu_dying(unsigned int cpu)
 void __init sched_init_smp(void)
 {
 	sched_init_numa();
+	set_sched_cluster();
 
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2c1f8f6c12da3764386a9d8743860703d587d9e..a28eb51b3c724b24b2328a09176f89eac4b421c3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6370,6 +6370,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 		}
 	}
 
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
+
+		if (sdc) {
+			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) {
+				if (!cpumask_test_cpu(cpu, cpus))
+					continue;
+
+				if (smt) {
+					i = select_idle_core(p, cpu, cpus, &idle_cpu);
+					if ((unsigned int)i < nr_cpumask_bits)
+						return i;
+				} else {
+					if (--nr <= 0)
+						return -1;
+					idle_cpu = __select_idle_cpu(cpu, p);
+					if ((unsigned int)idle_cpu < nr_cpumask_bits)
+						return idle_cpu;
+				}
+			}
+			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target) {
 		if (smt) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -6377,7 +6401,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 				return i;
 
 		} else {
-			if (!--nr)
+			if (--nr <= 0)
 				return -1;
 			idle_cpu = __select_idle_cpu(cpu, p);
 			if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -6487,7 +6511,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) &&
+	if (prev != target && cpus_share_lowest_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	    cpumask_test_cpu(prev, p->select_cpus) &&
@@ -6518,7 +6542,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	recent_used_cpu = p->recent_used_cpu;
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
+	    cpus_share_lowest_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	    cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) &&
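For reference only (not part of this patch): the select_idle_cpu() hunk above scans the CPUs of the waker's cluster first and then removes them from the candidate mask with cpumask_andnot(), so the remaining LLC CPUs are scanned exactly once. A self-contained userspace sketch of that ordering (ignoring the wrap-around start at the target CPU), using hypothetical masks for an 8-CPU LLC whose first four CPUs form a cluster:

/* Illustrative sketch only -- not part of this patch. */
#include <stdio.h>

#define NR_CPUS 8

int main(void)
{
	unsigned int llc_mask     = 0xff;  /* CPUs 0-7 share the LLC */
	unsigned int cluster_mask = 0x0f;  /* CPUs 0-3 share the L2 / LLC tag */
	unsigned int cpus = llc_mask;
	int cpu;

	/* Pass 1: the waker's cluster, i.e. the cheapest CPUs cache-wise. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cluster_mask & (1u << cpu))
			printf("scan cluster CPU %d\n", cpu);

	/* cpumask_andnot() equivalent: drop the cluster from the candidates. */
	cpus &= ~cluster_mask;

	/* Pass 2: the rest of the LLC. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpus & (1u << cpu))
			printf("scan LLC CPU %d\n", cpu);

	return 0;
}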
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 32d4775e537d62ea1601e8fe58a8336c12400834..6bb11d71caff750ac0672ecb340aecc544f097fa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1685,6 +1685,12 @@ this_rq_lock_irq(struct rq_flags *rf)
 	return rq;
 }
 
+#ifdef CONFIG_SCHED_CLUSTER
+extern void set_sched_cluster(void);
+#else
+static inline void set_sched_cluster(void) { }
+#endif
+
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_SCHED_STEAL
 extern struct static_key_true sched_steal_allow;
@@ -1801,11 +1807,14 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_lowest_cache_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
 
 struct sched_group_capacity {
 	atomic_t ref;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 2678e7590cfc46c7c8d1f5eeafccef8c17c92c1a..3208293d68fac66d1b2d158349b5dadab3c2d506 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -647,11 +647,15 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_lowest_cache_id);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -682,6 +686,18 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	sd = lowest_flag_domain(cpu, SD_CLUSTER);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+	rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd);
+
+	/*
+	 * This assignment should come after the sd_llc_id assignment, as we
+	 * want this id to equal the cluster id on cluster machines but the
+	 * LLC id on non-cluster machines.
+	 */
+	per_cpu(sd_lowest_cache_id, cpu) = id;
+
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
@@ -1406,6 +1422,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
  */
#define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_CLUSTER		|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)
@@ -1540,8 +1557,96 @@ static struct sched_domain_topology_level default_topology[] = {
 static struct sched_domain_topology_level *sched_domain_topology =
 	default_topology;
 
+#ifdef CONFIG_SCHED_CLUSTER
+void set_sched_cluster(void)
+{
+	struct sched_domain_topology_level *tl;
+
+	for (tl = sched_domain_topology; tl->mask; tl++) {
+		if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) {
+			if (!sysctl_sched_cluster)
+				tl->flags |= SDTL_SKIP;
+			else
+				tl->flags &= ~SDTL_SKIP;
+			break;
+		}
+	}
+}
+
+/* set via /proc/sys/kernel/sched_cluster */
+unsigned int __read_mostly sysctl_sched_cluster;
+
+static DEFINE_MUTEX(sched_cluster_mutex);
+int sched_cluster_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	unsigned int oldval;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&sched_cluster_mutex);
+	oldval = sysctl_sched_cluster;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (!ret && write) {
+		if (oldval != sysctl_sched_cluster) {
+			set_sched_cluster();
+			arch_rebuild_cpu_topology();
+		}
+	}
+	mutex_unlock(&sched_cluster_mutex);
+
+	return ret;
+}
+
+static struct ctl_table sched_cluster_sysctls[] = {
+	{
+		.procname	= "sched_cluster",
+		.data		= &sysctl_sched_cluster,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_cluster_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{}
+};
+
+static int __init sched_cluster_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_cluster_sysctls);
+	return 0;
+}
+late_initcall(sched_cluster_sysctl_init);
+
+static int __init sched_cluster_option(char *str)
+{
+	int enable;
+
+	if (get_option(&str, &enable)) {
+		if (enable != 0 && enable != 1)
+			return -EINVAL;
+
+		sysctl_sched_cluster = enable;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+early_param("sched_cluster", sched_cluster_option);
+#endif
+
+static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl)
+{
+	++tl;
+	while (tl->mask && tl->flags & SDTL_SKIP)
+		++tl;
+	return tl;
+}
+
#define for_each_sd_topology(tl)			\
-	for (tl = sched_domain_topology; tl->mask; tl++)
+	for (tl = sched_domain_topology; tl->mask; tl = next_tl(tl))
 
 void set_sched_topology(struct sched_domain_topology_level *tl)
 {
@@ -2185,6 +2290,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
+	bool has_cluster = false;
 
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
@@ -2212,6 +2318,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
 
+			has_cluster |= sd->flags & SD_CLUSTER;
+
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
 			if (tl->flags & SDTL_OVERLAP)
@@ -2271,6 +2379,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
+	if (has_cluster)
+		static_branch_inc_cpuslocked(&sched_cluster_active);
+
 	if (rq && sched_debug_enabled) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
@@ -2370,6 +2481,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
 	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
 		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
 
+	if (rcu_access_pointer(per_cpu(sd_cluster, cpu)))
+		static_branch_dec_cpuslocked(&sched_cluster_active);
+
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
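Usage note (not part of the patch itself): with CONFIG_SCHED_CLUSTER=y, cluster scheduling can be selected at boot with sched_cluster=1 or sched_cluster=0, and toggled at runtime by a CAP_SYS_ADMIN task writing 0 or 1 to /proc/sys/kernel/sched_cluster. On a value change, sched_cluster_handler() re-marks the cluster topology level via set_sched_cluster() and rebuilds the scheduler domains through arch_rebuild_cpu_topology().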