diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index dcfd3e97e9a488233856ddaca7ac609ae803f3d9..d20a55b7b34ca04d969e1dcf58eb65570985fbe6 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -34,6 +34,7 @@ CONFIG_NAMESPACES=y CONFIG_USER_NS=y # CONFIG_SECURITY_MONITOR is not set CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_STEAL=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y diff --git a/arch/x86/configs/tencent.config b/arch/x86/configs/tencent.config index f6ec17e7ea2c8913a53e33cd5c44adf691aceac8..d1a8b4fd3c338bca2bc98e133e003e03838bce08 100644 --- a/arch/x86/configs/tencent.config +++ b/arch/x86/configs/tencent.config @@ -37,6 +37,7 @@ CONFIG_NAMESPACES=y CONFIG_USER_NS=y # CONFIG_SECURITY_MONITOR is not set CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_STEAL=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4188de2c163163d325b1192fba5b5011d213f3a2..762e83f095f53098c7cdee1afb7a5dc601956efa 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -66,6 +66,10 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; +#ifdef CONFIG_SCHED_STEAL + struct sparsemask *cfs_overload_cpus; +#endif + #ifdef CONFIG_ARM64 } ____cacheline_aligned; #else diff --git a/init/Kconfig b/init/Kconfig index 2542f4aeba581dd68b50b158846fe11e5d2b50cf..10e70a42335258dcf9dd9cb787a1142c8ddef759 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1250,6 +1250,20 @@ config SECURITY_MONITOR default y help Allow user to add security monitor +config SCHED_STEAL + bool "Steal tasks to improve CPU utilization" + depends on SMP + default n + help + When a CPU has no more CFS tasks to run, and idle_balance() fails + to find a task, then attempt to steal a task from an overloaded + CPU in the same LLC. Maintain and use a bitmap of overloaded CPUs + to efficiently identify candidates. To minimize search time, steal + the first migratable task that is found when the bitmap is traversed. + For fairness, search for migratable tasks on an overloaded CPU in + order of next to run. + + If unsure, say N here. config CHECKPOINT_RESTORE bool "Checkpoint/restore support" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f7eda976364718c9e11299b8a2b7388a0658fdd..6ea437239d16d2c9346c00ec960d215e96db0e7f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1944,17 +1944,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). 
@@ -3312,17 +3301,48 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, DEFINE_STATIC_KEY_FALSE(sched_schedstats); static bool __initdata __sched_schedstats = false; +#ifdef CONFIG_SCHED_STEAL +unsigned long schedstat_skid; + +static void compute_skid(void) +{ + int i, n = 0; + s64 t; + int skid = 0; + + for (i = 0; i < 100; i++) { + t = local_clock(); + t = local_clock() - t; + if (t > 0 && t < 1000) { /* only use sane samples */ + skid += (int) t; + n++; + } + } + + if (n > 0) + schedstat_skid = skid / n; + else + schedstat_skid = 0; + pr_info("schedstat_skid = %lu\n", schedstat_skid); +} +#else +static inline void compute_skid(void) {} +#endif + static void set_schedstats(bool enabled) { - if (enabled) + if (enabled) { + compute_skid(); static_branch_enable(&sched_schedstats); - else + } else { static_branch_disable(&sched_schedstats); + } } void force_schedstat_enabled(void) { if (!schedstat_enabled()) { + compute_skid(); pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); static_branch_enable(&sched_schedstats); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index db15e02baa55042a52f6fd0717c418a2e6605ffe..97db83a86bf0281870222082733ea9e10b16ca07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,9 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#ifdef CONFIG_SCHED_STEAL +#include "sparsemask.h" +#endif #include "fair.h" #include @@ -114,6 +117,10 @@ int __weak arch_asym_cpu_priority(int cpu) #endif +#ifdef CONFIG_QOS_SCHED +static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -3670,6 +3677,20 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -3769,7 +3790,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return fits_capacity(task_util_est(p), capacity); + return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -3794,6 +3815,69 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } +static inline void rq_idle_stamp_update(struct rq *rq) +{ + rq->idle_bt_stamp = rq_clock(rq); +} + +static inline void rq_idle_stamp_clear(struct rq *rq) +{ + rq->idle_bt_stamp = 0; +} + +#ifdef CONFIG_SCHED_STEAL + +static inline bool steal_enabled(void) +{ +#ifdef CONFIG_NUMA + bool allow = static_branch_likely(&sched_steal_allow); +#else + bool allow = true; +#endif + return sched_feat(STEAL) && allow; +} + +static void overload_clear(struct rq *rq) +{ + struct sparsemask *overload_cpus; + unsigned long time; + + if (!steal_enabled()) + return; + + time = schedstat_start_time(); + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + 
sparsemask_clear_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); + schedstat_end_time(rq, time); +} + +static void overload_set(struct rq *rq) +{ + struct sparsemask *overload_cpus; + unsigned long time; + + if (!steal_enabled()) + return; + + time = schedstat_start_time(); + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + sparsemask_set_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); + schedstat_end_time(rq, time); +} + +static int try_steal(struct rq *this_rq, struct rq_flags *rf); +#else +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } +static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} +#endif + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3817,6 +3901,12 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) return 0; } +static inline void rq_idle_stamp_update(struct rq *rq) {} +static inline void rq_idle_stamp_clear(struct rq *rq) {} +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } +static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} @@ -4603,6 +4693,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); + unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; @@ -4652,8 +4743,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + if (prev_nr >= 2 && prev_nr - task_delta < 2) + overload_clear(rq); + } /* * Note: distribution will already see us throttled via the @@ -4667,6 +4761,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); + unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; @@ -4725,6 +4820,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); + if (prev_nr < 2 && prev_nr + task_delta >= 2) + overload_set(rq); unthrottle_throttle: /* @@ -5412,6 +5509,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); + unsigned int prev_nr = rq->cfs.h_nr_running; /* * The code below (indirectly) updates schedutil which looks at @@ -5466,30 +5564,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) list_add_leaf_cfs_rq(cfs_rq); } -enqueue_throttle: - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. 
- * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (!task_new) - update_overutilized_status(rq); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); + if (prev_nr == 1) + overload_set(rq); + + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (!task_new) + update_overutilized_status(rq); - if (rq->curr == rq->idle) - check_preempt_from_idle(cfs_rq_of(&p->se), &p->se); - } + if (rq->curr == rq->idle) + check_preempt_from_idle(cfs_rq_of(&p->se), &p->se); +enqueue_throttle: if (cfs_bandwidth_used()) { /* * When bandwidth control is enabled; the cfs_rq_throttled() @@ -5526,6 +5626,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + unsigned int prev_nr = rq->cfs.h_nr_running; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5568,6 +5669,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } + if (prev_nr == 2) + overload_clear(rq); + dequeue_throttle: if (!se) sub_nr_running(rq, 1); @@ -6234,12 +6338,69 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return cpu; } +/* + * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which + * the task fits. If no CPU is big enough, but there are idle ones, try to + * maximize capacity. + */ +static int +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) +{ + unsigned long task_util, best_cap = 0; + int cpu, best_cpu = -1; + struct cpumask *cpus; + + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + task_util = uclamp_task_util(p); + + for_each_cpu_wrap(cpu, cpus, target) { + unsigned long cpu_cap = capacity_of(cpu); + + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + continue; + if (fits_capacity(task_util, cpu_cap)) + return cpu; + + if (cpu_cap > best_cap) { + best_cap = cpu_cap; + best_cpu = cpu; + } + } + + return best_cpu; +} + +static inline bool asym_fits_capacity(int task_util, int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return fits_capacity(task_util, capacity_of(cpu)); + + return true; +} + +#ifdef CONFIG_SCHED_STEAL +#define SET_STAT(STAT) \ + do { \ + if (schedstat_enabled()) { \ + struct rq *rq = this_rq(); \ + \ + if (rq) \ + __schedstat_inc(rq->STAT); \ + } \ + } while (0) +#else +#define SET_STAT(STAT) +#endif + /* * Try and locate an idle core/thread in the LLC cache domain. 
*/ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; /* @@ -6247,15 +6408,45 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if (available_idle_cpu(target) || sched_idle_cpu(target)) + /* + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); + } + + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) { + SET_STAT(found_idle_cpu_easy); return target; + } /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) { + SET_STAT(found_idle_cpu_easy); return prev; + } + + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { + SET_STAT(found_idle_cpu_easy); + return prev; + } /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; @@ -6263,31 +6454,63 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: */ + SET_STAT(found_idle_cpu_easy); p->recent_used_cpu = prev; return recent_used_cpu; } + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + SET_STAT(found_idle_cpu_capacity); + return ((unsigned)i < nr_cpumask_bits) ? 
i : target; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); - if (!sd) + if (!sd) { + SET_STAT(nofound_idle_cpu); return target; + } i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) + if ((unsigned)i < nr_cpumask_bits){ + SET_STAT(found_idle_cpu); return i; + } i = select_idle_cpu(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) + if ((unsigned)i < nr_cpumask_bits) { + SET_STAT(found_idle_cpu); return i; + } i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) + if ((unsigned)i < nr_cpumask_bits){ + SET_STAT(found_idle_cpu); return i; + } + SET_STAT(nofound_idle_cpu); return target; } @@ -6699,6 +6922,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { + unsigned long time; struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; @@ -6709,6 +6933,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + + time = schedstat_start_time(); + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -6759,6 +6986,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f current->recent_used_cpu = cpu; } rcu_read_unlock(); + schedstat_end_time(cpu_rq(cpu), time); return new_cpu; } @@ -7161,13 +7389,141 @@ static struct task_struct *pick_task_fair(struct rq *rq) } #endif -static struct task_struct * +#ifdef CONFIG_QOS_SCHED +static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se; + unsigned int prev_nr = cfs_rq->h_nr_running; + long task_delta, idle_task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* freeze hierarchy runnable averages while throttled */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) { + sub_nr_running(rq, task_delta); + if (prev_nr >= 2 && prev_nr - task_delta < 2) + overload_clear(rq); + + } + + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + + list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); +} + +static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + unsigned int prev_nr = cfs_rq->h_nr_running; + long task_delta, idle_task_delta; + + se = cfs_rq->tg->se[cpu_of(rq)]; + + cfs_rq->throttled = 0; + + update_rq_clock(rq); + + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + list_del_init(&cfs_rq->throttled_list); + + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + 
if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + assert_list_leaf_cfs_rq(rq); + + if (!se) { + add_nr_running(rq, task_delta); + if (prev_nr < 2 && prev_nr + task_delta >= 2) + overload_set(rq); + } + + /* Determine whether we need to wake up potentially idle CPU: */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_curr(rq); +} + +static int unthrottle_qos_cfs_rqs(int cpu) +{ + struct cfs_rq *cfs_rq, *tmp_rq; + int res = 0; + + list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), + throttled_list) { + if (cfs_rq_throttled(cfs_rq)) { + unthrottle_qos_cfs_rq(cfs_rq); + res++; + } + } + + return res; +} + +static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + throttle_qos_cfs_rq(cfs_rq); + return true; + } + + return false; +} +#endif + +struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; int new_tasks; + unsigned long time; again: if (!sched_fair_runnable(rq)) @@ -7226,6 +7582,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); +#ifdef CONFIG_QOS_SCHED + if (check_qos_cfs_rq(cfs_rq)) { + cfs_rq = &rq->cfs; + WARN(cfs_rq->nr_running == 0, + "rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n", + rq->nr_running, cfs_rq->idle_h_nr_running); + if (unlikely(!cfs_rq->nr_running)) + return NULL; + } +#endif } while (cfs_rq); p = task_of(se); @@ -7291,12 +7657,27 @@ done: __maybe_unused; if (!rf) return NULL; + time = schedstat_start_time(); + + /* + * We must set idle_stamp _before_ calling try_steal() or + * idle_balance(), such that we measure the duration as idle time. + */ + rq_idle_stamp_update(rq); + new_tasks = newidle_balance(rq, rf); + if (new_tasks == 0) + new_tasks = try_steal(rq, rf); + schedstat_end_time(rq, time); + + if (new_tasks) + rq_idle_stamp_clear(rq); + /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. + * Because try_steal() and idle_balance() release (and re-acquire) + * rq->lock, it is possible for any higher priority task to appear. + * In that case we must re-start the pick_next_entity() loop. */ if (new_tasks < 0) return RETRY_TASK; @@ -7304,6 +7685,12 @@ done: __maybe_unused; if (new_tasks > 0) goto again; +#ifdef CONFIG_QOS_SCHED + if (unthrottle_qos_cfs_rqs(cpu_of(rq))) { + rq->idle_stamp = 0; + goto again; + } +#endif /* * rq is about to be idle, check if we need to update the * lost_idle_time of clock_pelt @@ -7728,15 +8115,45 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } +#ifdef CONFIG_SCHED_STEAL /* - * detach_task() -- detach the task for the migration specified in env + * Return true if task @p can migrate from @rq to @dst_rq in the same LLC. + * No need to test for co-locality, and no need to test task_hot(), as sharing + * LLC provides cache warmth at that level. 
 */ -static void detach_task(struct task_struct *p, struct lb_env *env) +static bool +can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) { - lockdep_assert_rq_held(env->src_rq); + int dst_cpu = dst_rq->cpu; + + lockdep_assert_rq_held(rq); + + if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu)) + return false; + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) { + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + return false; + } + + if (task_running(rq, p)) { + schedstat_inc(p->se.statistics.nr_failed_migrations_running); + return false; + } + + return true; +} +#endif + +/* + * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. + */ +static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) +{ + lockdep_assert_rq_held(src_rq); - deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); - set_task_cpu(p, env->dst_cpu); + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); } /* @@ -7756,7 +8173,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) if (!can_migrate_task(p, env)) continue; - detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu); /* * Right now, this is only the second place where @@ -7831,7 +8248,7 @@ static int detach_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu); list_add(&p->se.group_node, &env->tasks); detached++; @@ -10225,11 +10642,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) u64 curr_cost = 0; update_misfit_status(NULL, this_rq); - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - this_rq->idle_bt_stamp = rq_clock(this_rq); /* * Do not pull tasks towards !active CPUs... @@ -10322,9 +10734,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (RQ_CFS_NR_RUNNING(this_rq) != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) - this_rq->idle_bt_stamp = 0; - rq_repin_lock(this_rq, rf); return pulled_task; @@ -10371,6 +10780,157 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } +#ifdef CONFIG_SCHED_STEAL +/* + * Search the runnable tasks in @cfs_rq in order of next to run, and find + * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. + * On success, dequeue the task from @cfs_rq and return it, else return NULL. + */ +static struct task_struct * +detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq) +{ + int dst_cpu = dst_rq->cpu; + struct task_struct *p; + struct rq *rq = rq_of(cfs_rq); + + lockdep_assert_rq_held(rq_of(cfs_rq)); + + list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) { + if (can_migrate_task_llc(p, rq, dst_rq)) { + detach_task(p, rq, dst_cpu); + return p; + } + } + return NULL; +} + +/* + * Attempt to migrate a CFS task from @src_cpu to @dst_rq. @locked indicates + * whether @dst_rq is already locked on entry. This function may lock or + * unlock @dst_rq, and updates @locked to indicate the locked state on return. + * The locking protocol is based on idle_balance(). + * Returns 1 on success and 0 on failure. 
+ */ +static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + int src_cpu) +{ + struct task_struct *p; + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = cpu_rq(src_cpu); + + if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_rq_unlock(dst_rq); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu)) + p = NULL; + else + p = detach_next_task(&src_rq->cfs, dst_rq); + + rq_unlock(src_rq, &rf); + + if (p) { + raw_spin_rq_lock(dst_rq); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, p); + stolen = 1; + schedstat_inc(dst_rq->steal); + } + local_irq_restore(rf.flags); + + return stolen; +} + +/* + * Conservative upper bound on the max cost of a steal, in nsecs (the typical + * cost is 1-2 microsec). Do not steal if average idle time is less. + */ +#define SCHED_STEAL_COST 10000 + +/* + * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq, + * and migrate it to @dst_rq. rq_lock is held on entry and return, but + * may be dropped in between. Return 1 on success, 0 on failure, and -1 + * if a task in a different scheduling class has become runnable on @dst_rq. + */ +static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) +{ + int src_cpu; + int dst_cpu = dst_rq->cpu; + bool locked = true; + int stolen = 0; + bool any_overload = false; + struct sparsemask *overload_cpus; + + if (!steal_enabled()) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_rq->avg_idle_bt < SCHED_STEAL_COST) + return 0; + + /* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */ + + rcu_read_lock(); + overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus); + if (!overload_cpus) { + rcu_read_unlock(); + return 0; + } + +#ifdef CONFIG_SCHED_SMT + /* + * First try overloaded CPUs on the same core to preserve cache warmth. 
+ */ + if (static_branch_likely(&sched_smt_present)) { + for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) { + if (sparsemask_test_elem(overload_cpus, src_cpu) && + steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + } + } +#endif /* CONFIG_SCHED_SMT */ + + /* Accept any suitable task in the LLC */ + + sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) { + if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + any_overload = true; + } + +out: + rcu_read_unlock(); + if (!locked) { + raw_spin_rq_lock(dst_rq); + rq_repin_lock(dst_rq, dst_rf); + } + stolen |= (dst_rq->cfs.h_nr_running > 0); + if (dst_rq->nr_running != dst_rq->cfs.h_nr_running) + stolen = -1; + if (!stolen && any_overload) + schedstat_inc(dst_rq->steal_fail); + return stolen; +} +#endif + static void rq_online_fair(struct rq *rq) { update_sysctl(); @@ -11187,6 +11747,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { +#ifdef CONFIG_QOS_SCHED + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 66c74aa4753e79c04d4c52d96a37525b514983ef..15c491ac0b1862f7af4f4ea5753f695263e44528 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -57,6 +57,14 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) +#ifdef CONFIG_SCHED_STEAL +/* + * Steal a CFS task from another CPU when going idle. + * Improves CPU utilization. + */ +SCHED_FEAT(STEAL, false) +#endif + /* * Issue a WARN when we do multiple update_rq_clock() calls * in a single rq->lock section. 
Default disabled because the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 764a0000df94e0fab30a4d0bb4be547a1c64284b..c1c8b29218bd31f1d5e41c457e1003f7317b8dbd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -85,6 +85,9 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_STEAL +struct sparsemask; +#endif /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -1152,6 +1155,9 @@ struct rq { #endif struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_STEAL + struct sparsemask *cfs_overload_cpus; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1278,6 +1284,17 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; + +#ifdef CONFIG_SCHED_STEAL + /* Idle search stats */ + unsigned int found_idle_cpu_capacity; + unsigned int found_idle_cpu; + unsigned int found_idle_cpu_easy; + unsigned int nofound_idle_cpu; + unsigned long find_time; + unsigned int steal; + unsigned int steal_fail; +#endif /* CONFIG_SCHED_STEAL */ #endif #ifdef CONFIG_SMP @@ -1822,6 +1839,10 @@ this_rq_lock_irq(struct rq_flags *rf) } #ifdef CONFIG_NUMA +#ifdef CONFIG_SCHED_STEAL +extern struct static_key_true sched_steal_allow; +#endif + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, @@ -3204,3 +3225,16 @@ static inline void membarrier_switch_mm(struct rq *rq, { } #endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h new file mode 100644 index 0000000000000000000000000000000000000000..11948620a1a2b2f428fb1e08775a860a1d7aa230 --- /dev/null +++ b/kernel/sched/sparsemask.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sparsemask.h - sparse bitmap operations + * + * Copyright (c) 2018 Oracle Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_SPARSEMASK_H +#define __LINUX_SPARSEMASK_H + +#include +#include +#include + +/* + * A sparsemask is a sparse bitmap. It reduces cache contention vs the usual + * bitmap when many threads concurrently set, clear, and visit elements. For + * each cacheline chunk of the mask, only the first K bits of the first word are + * used, and the remaining bits are ignored, where K is a creation time + * parameter. Thus a sparsemask that can represent a set of N elements is + * approximately (N/K * CACHELINE) bytes in size. + * + * Clients pass and receive element numbers in the public API, and the + * implementation translates them to bit numbers to perform the bitmap + * operations. 
+ */ + +struct sparsemask_chunk { + unsigned long word; /* the significant bits */ +} ____cacheline_aligned_in_smp; + +struct sparsemask { + short nelems; /* current number of elements */ + short density; /* store 2^density elements per chunk */ + struct sparsemask_chunk chunks[0]; /* embedded array of chunks */ +}; + +#define _SMASK_INDEX(density, elem) ((elem) >> (density)) +#define _SMASK_BIT(density, elem) ((elem) & ((1U << (density)) - 1U)) +#define SMASK_INDEX(mask, elem) _SMASK_INDEX((mask)->density, elem) +#define SMASK_BIT(mask, elem) _SMASK_BIT((mask)->density, elem) +#define SMASK_WORD(mask, elem) \ + (&(mask)->chunks[SMASK_INDEX((mask), (elem))].word) + +/* + * sparsemask_next() - Return the next one bit in a bitmap, starting at a + * specified position and wrapping from the last bit to the first, up to but + * not including a specified origin. This is a helper, so do not call it + * directly. + * + * @mask: Bitmap to search. + * @origin: Origin. + * @prev: Previous bit. Start search after this bit number. + * If -1, start search at @origin. + * + * Return: the bit number, else mask->nelems if no bits are set in the range. + */ +static inline int +sparsemask_next(const struct sparsemask *mask, int origin, int prev) +{ + int density = mask->density; + int bits_per_word = 1U << density; + const struct sparsemask_chunk *chunk; + int nelems = mask->nelems; + int next, bit, nbits; + unsigned long word; + + /* Calculate number of bits to be searched. */ + if (prev == -1) { + nbits = nelems; + next = origin; + } else if (prev < origin) { + nbits = origin - prev; + next = prev + 1; + } else { + nbits = nelems - prev + origin - 1; + next = prev + 1; + } + + if (unlikely(next >= nelems)) + return nelems; + + /* + * Fetch and adjust first word. Clear word bits below @next, and round + * @next down to @bits_per_word boundary because later ffs will add + * those bits back. + */ + chunk = &mask->chunks[_SMASK_INDEX(density, next)]; + bit = _SMASK_BIT(density, next); + word = chunk->word & (~0UL << bit); + next -= bit; + nbits += bit; + + while (!word) { + next += bits_per_word; + nbits -= bits_per_word; + if (nbits <= 0) + return nelems; + + if (next >= nelems) { + chunk = mask->chunks; + nbits -= (next - nelems); + next = 0; + } else { + chunk++; + } + word = chunk->word; + } + + next += __ffs(word); + if (next >= origin && prev != -1) + return nelems; + return next; +} + +/****************** The public API ********************/ + +/* + * Max value for the density parameter, limited by 64 bits in the chunk word. + */ +#define SMASK_DENSITY_MAX 6 + +/* + * Return bytes to allocate for a sparsemask, for custom allocators. + */ +static inline size_t sparsemask_size(int nelems, int density) +{ + int index = _SMASK_INDEX(density, nelems) + 1; + + return offsetof(struct sparsemask, chunks[index]); +} + +/* + * Initialize an allocated sparsemask, for custom allocators. + */ +static inline void +sparsemask_init(struct sparsemask *mask, int nelems, int density) +{ + WARN_ON(density < 0 || density > SMASK_DENSITY_MAX || nelems < 0); + mask->nelems = nelems; + mask->density = density; +} + +/* + * sparsemask_alloc_node() - Allocate, initialize, and return a sparsemask. + * + * @nelems - maximum number of elements. + * @density - store 2^density elements per cacheline chunk. + * values from 0 to SMASK_DENSITY_MAX inclusive. 
+ * @flags - kmalloc allocation flags + * @node - numa node + */ +static inline struct sparsemask * +sparsemask_alloc_node(int nelems, int density, gfp_t flags, int node) +{ + int nbytes = sparsemask_size(nelems, density); + struct sparsemask *mask = kmalloc_node(nbytes, flags, node); + + if (mask) + sparsemask_init(mask, nelems, density); + return mask; +} + +static inline void sparsemask_free(struct sparsemask *mask) +{ + kfree(mask); +} + +static inline void sparsemask_set_elem(struct sparsemask *dst, int elem) +{ + set_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline void sparsemask_clear_elem(struct sparsemask *dst, int elem) +{ + clear_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline int sparsemask_test_elem(const struct sparsemask *mask, int elem) +{ + return test_bit(SMASK_BIT(mask, elem), SMASK_WORD(mask, elem)); +} + +/* + * sparsemask_for_each() - iterate over each set bit in a bitmap, starting at a + * specified position, and wrapping from the last bit to the first. + * + * @mask: Bitmap to iterate over. + * @origin: Bit number at which to start searching. + * @elem: Iterator. Can be signed or unsigned integer. + * + * The implementation does not assume any bit in @mask is set, including + * @origin. After the loop, @elem = @mask->nelems. + */ +#define sparsemask_for_each(mask, origin, elem) \ + for ((elem) = -1; \ + (elem) = sparsemask_next((mask), (origin), (elem)), \ + (elem) < (mask)->nelems;) + +#endif /* __LINUX_SPARSEMASK_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed27c70c5ae73b07651d8c2e0e2eab..616c4b3c4307bfb2dff4b5d7b904d0d9e7c3b337 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -10,7 +10,11 @@ * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ +#ifdef CONFIG_SCHED_STEAL +#define SCHEDSTAT_VERSION 16 +#else #define SCHEDSTAT_VERSION 15 +#endif static int show_schedstat(struct seq_file *seq, void *v) { @@ -37,6 +41,17 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); +#ifdef CONFIG_SCHED_STEAL + seq_printf(seq, " %u %u %u %u %lu %u %u", + rq->found_idle_cpu_easy, + rq->found_idle_cpu_capacity, + rq->found_idle_cpu, + rq->nofound_idle_cpu, + rq->find_time, + rq->steal, + rq->steal_fail); +#endif /* CONFIG_SCHED_STEAL */ + seq_printf(seq, "\n"); #ifdef CONFIG_SMP diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 398035545a6a8c4263d2025f4431cfc322bff4c5..fd9385c72185dc9818863d5c1e63caeee8ce5ccb 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -48,6 +48,24 @@ static inline void update_schedstat_avg(u64 *avg, u64 sample) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) #define schedstat_update_avg(var, val) do { update_schedstat_avg(var, val); } while (0) +#ifdef CONFIG_SCHED_STEAL +#define schedstat_start_time() schedstat_val_or_zero(local_clock()) +#define __schedstat_end_time(stat, time) \ + do { \ + unsigned long endtime; \ + \ + if (schedstat_enabled() && (time)) { \ + endtime = local_clock() - (time) - schedstat_skid; \ + schedstat_add((stat), endtime); \ + } \ + } while (0) +#define schedstat_end_time(rq, time) \ + __schedstat_end_time(((rq)->find_time), time) +extern unsigned long schedstat_skid; +#else /* !CONFIG_SCHED_STEAL */ +# define schedstat_start_time() 0 +# define schedstat_end_time(rq, t) do { } while (0) +#endif /* CONFIG_SCHED_STEAL */ #else /* !CONFIG_SCHEDSTATS: */ static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } @@ -63,6 +81,8 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 # define schedstat_update_avg(var, val) do { } while (0) +# define schedstat_start_time() 0 +# define schedstat_end_time(rq, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_PSI diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ea53dd8bbf7f9d118458fd6d2b4c71aee6ab9e0b..95c4265856686fe3cdaa87b253ff3d8b55d0dbb1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,9 @@ * Scheduler topology setup/handling methods */ #include "sched.h" +#ifdef CONFIG_SCHED_STEAL +#include "sparsemask.h" +#endif DEFINE_MUTEX(sched_domains_mutex); @@ -10,6 +13,18 @@ DEFINE_MUTEX(sched_domains_mutex); static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; +struct s_data; +#ifdef CONFIG_SCHED_STEAL +static int sd_llc_alloc(struct sched_domain *sd); +static void sd_llc_free(struct sched_domain *sd); +static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d); +static void sd_llc_free_all(const struct cpumask *cpu_map); +#else +static inline void sd_llc_free(struct sched_domain *sd) {} +static inline int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) { return 0; } +static inline void sd_llc_free_all(const struct cpumask *cpu_map) {} +#endif + #ifdef CONFIG_SCHED_DEBUG static int __init sched_debug_setup(char *str) @@ -584,8 +599,10 @@ static void destroy_sched_domain(struct sched_domain *sd) */ free_sched_groups(sd->groups, 1); - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) { + sd_llc_free(sd); kfree(sd->shared); + } kfree(sd); } @@ -626,6 +643,10 @@ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) { +#ifdef CONFIG_SCHED_STEAL + struct rq *rq = cpu_rq(cpu); + struct sparsemask *cfs_overload_cpus = NULL; +#endif struct sched_domain_shared *sds = NULL; struct sched_domain *sd; int id = cpu; @@ -636,8 +657,14 @@ static void update_top_cache_domain(int cpu) id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; +#ifdef CONFIG_SCHED_STEAL + cfs_overload_cpus = sds->cfs_overload_cpus; +#endif } +#ifdef CONFIG_SCHED_STEAL + rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus); +#endif rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; @@ -1229,6 +1256,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_percpu(d->sd); /* Fall through */ case sa_sd_storage: + 
sd_llc_free_all(cpu_map); __sdt_free(cpu_map); /* Fall through */ case sa_none: @@ -1557,6 +1585,33 @@ static void init_numa_topology_type(void) #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) +#ifdef CONFIG_SCHED_STEAL +DEFINE_STATIC_KEY_TRUE(sched_steal_allow); +static int sched_steal_node_limit; +#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2 + +static int __init steal_node_limit_setup(char *buf) +{ + get_option(&buf, &sched_steal_node_limit); + return 0; +} + +early_param("sched_steal_node_limit", steal_node_limit_setup); + +static void check_node_limit(void) +{ + int n = num_possible_nodes(); + + if (sched_steal_node_limit == 0) + sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT; + if (n > sched_steal_node_limit) { + static_branch_disable(&sched_steal_allow); + pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n); + } +} +#else +static inline void check_node_limit(void) { } +#endif /* CONFIG_SCHED_STEAL */ void sched_init_numa(void) { @@ -1700,6 +1755,7 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); + check_node_limit(); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1852,6 +1908,80 @@ static void __sdt_free(const struct cpumask *cpu_map) } } +#ifdef CONFIG_SCHED_STEAL +static int sd_llc_alloc(struct sched_domain *sd) +{ + struct sched_domain_shared *sds = sd->shared; + struct cpumask *span = sched_domain_span(sd); + int nid = cpu_to_node(cpumask_first(span)); + int flags = __GFP_ZERO | GFP_KERNEL; + struct sparsemask *mask; + + /* + * Allocate the bitmap if not already allocated. This is called for + * every CPU in the LLC but only allocates once per sd_llc_shared. + */ + if (!sds->cfs_overload_cpus) { + mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid); + if (!mask) + return 1; + sds->cfs_overload_cpus = mask; + } + + return 0; +} + +static void sd_llc_free(struct sched_domain *sd) +{ + struct sched_domain_shared *sds = sd->shared; + + if (!sds) + return; + + sparsemask_free(sds->cfs_overload_cpus); + sds->cfs_overload_cpus = NULL; +} + +static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) +{ + struct sched_domain *sd, *hsd; + int i; + + for_each_cpu(i, cpu_map) { + /* Find highest domain that shares resources */ + hsd = NULL; + for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) + break; + hsd = sd; + } + if (hsd && sd_llc_alloc(hsd)) + return 1; + } + + return 0; +} + +static void sd_llc_free_all(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + struct sched_domain *sd; + struct sd_data *sdd; + int j; + + for_each_sd_topology(tl) { + sdd = &tl->data; + if (!sdd || !sdd->sd) + continue; + for_each_cpu(j, cpu_map) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd) + sd_llc_free(sd); + } + } +} +#endif + static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int dflags, int cpu) @@ -2053,6 +2183,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Allocate shared sd data at last level cache. Must be done after + * domains are built above, but before the data is used in + * cpu_attach_domain and descendants below. + */ + if (sd_llc_alloc_all(cpu_map, &d)) + goto error; + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) {
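
Illustration only, not part of the diff: a minimal sketch of how the sparsemask API added in kernel/sched/sparsemask.h is used. The function below is hypothetical; the patch itself allocates one mask per LLC in sd_llc_alloc() with density 3 (2^3 = 8 elements per cacheline chunk), flips bits in overload_set()/overload_clear(), and walks the set bits starting from the destination CPU in try_steal().

static int sparsemask_example(int node, int this_cpu)
{
	struct sparsemask *mask;
	int cpu;

	/* Size the mask for all possible CPUs, as sd_llc_alloc() does. */
	mask = sparsemask_alloc_node(nr_cpu_ids, 3, GFP_KERNEL | __GFP_ZERO, node);
	if (!mask)
		return -ENOMEM;

	sparsemask_set_elem(mask, 1);	/* CPU 1 now has >= 2 CFS tasks */
	sparsemask_set_elem(mask, 5);	/* so does CPU 5 */
	sparsemask_clear_elem(mask, 1);	/* CPU 1 dropped below 2 again */

	/* Visit every set element, starting at this_cpu and wrapping around. */
	sparsemask_for_each(mask, this_cpu, cpu)
		pr_info("CPU %d is overloaded\n", cpu);

	sparsemask_free(mask);
	return 0;
}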
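
Also for illustration: every hunk above that changes rq->cfs.h_nr_running (enqueue, dequeue, throttle, unthrottle and the QoS variants) maintains the same invariant, namely that a CPU is present in cfs_overload_cpus exactly while it has two or more runnable CFS tasks. A condensed sketch of that rule, with a hypothetical helper name:

/*
 * Hypothetical helper, for illustration: apply the overload rule after
 * rq->cfs.h_nr_running has changed from prev_nr by task_delta.
 */
static void overload_update(struct rq *rq, unsigned int prev_nr, long task_delta)
{
	long new_nr = (long)prev_nr + task_delta;

	if (prev_nr < 2 && new_nr >= 2)
		overload_set(rq);	/* e.g. enqueue_task_fair() with prev_nr == 1 */
	else if (prev_nr >= 2 && new_nr < 2)
		overload_clear(rq);	/* e.g. dequeue_task_fair() with prev_nr == 2 */
}

Both helpers are no-ops unless steal_enabled() is true, i.e. the STEAL scheduler feature is set (default false in features.h, so it has to be turned on through the sched_features debugfs interface on CONFIG_SCHED_DEBUG kernels) and, on NUMA builds, the sched_steal_allow static key is still on; check_node_limit() turns that key off at boot when the machine has more nodes than sched_steal_node_limit (default 2, overridable via the sched_steal_node_limit= early parameter).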
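
Finally, a small sketch of the skid-compensated timing pattern added in kernel/sched/stats.h, which select_task_rq_fair() and pick_next_task_fair() use to charge search time into the new rq->find_time schedstat field. The wrapper function is hypothetical:

static void timed_section_example(struct rq *rq)
{
	/* 0 when schedstats are disabled, so the end macro becomes a no-op. */
	unsigned long time = schedstat_start_time();

	/* ... the section being measured, e.g. an idle-CPU search ... */

	/*
	 * Accumulates local_clock() - time - schedstat_skid into
	 * rq->find_time, where schedstat_skid is the average sampling
	 * overhead measured by compute_skid() when schedstats are enabled.
	 */
	schedstat_end_time(rq, time);
}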