diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index bc59db0a75d2c83f20f20a85f510af461d3f2feb..66171d28ea64b1403a2637cc8933522d88085ae6 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -264,7 +264,8 @@ Description:	Discover CPUs in the same CPU frequency coordination domain
		attribute is useful for user space DVFS controllers to get better
		power/performance results for platforms using acpi-cpufreq.

-		This file is only present if the acpi-cpufreq driver is in use.
+		This file is only present if the acpi-cpufreq or the cppc-cpufreq
+		driver is in use.

What:		/sys/devices/system/cpu/cpuX/cpufreq/auto_act_window
Date:		October 2024
diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst
index a697278ce19034e87a806912418ecb93379273d5..74fac797c3969bac4a27b2981356f536653c8e2f 100644
--- a/Documentation/cpu-freq/cpu-drivers.rst
+++ b/Documentation/cpu-freq/cpu-drivers.rst
@@ -71,9 +71,6 @@ And optionally
 .exit - A pointer to a per-policy cleanup function called during
 CPU_POST_DEAD phase of cpu hotplug process.

- .stop_cpu - A pointer to a per-policy stop function called during
- CPU_DOWN_PREPARE phase of cpu hotplug process.
-
 .suspend - A pointer to a per-policy suspend function which is called
 with interrupts disabled and _after_ the governor is stopped for the
 policy.
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
index a6fb986abe3cab2144d5801e9bbd83d2df05c893..8c9e784d684f5a0fb645ba23587d93744d431790 100644
--- a/Documentation/power/energy-model.rst
+++ b/Documentation/power/energy-model.rst
@@ -110,6 +110,10 @@ More details about the above APIs can be found in include/linux/energy_model.h.
 3. Example driver
 -----------------

+The CPUFreq framework supports a dedicated callback for registering
+the EM for the CPUs of a given 'policy' object: cpufreq_driver::register_em().
+That callback has to be implemented properly for a given driver,
+because the framework calls it at the right time during setup.
 This section provides a simple example of a CPUFreq driver registering a
 performance domain in the Energy Model framework using the (fake) 'foo'
 protocol. The driver implements an est_power() function to be provided to the
@@ -117,8 +121,8 @@ EM framework::

   -> drivers/cpufreq/foo_cpufreq.c

- 01	static int est_power(unsigned long *mW, unsigned long *KHz,
- 02			     struct device *dev)
+ 01	static int est_power(struct device *dev, unsigned long *mW,
+ 02			     unsigned long *KHz)
  03	{
  04		long freq, power;
  05
@@ -139,24 +143,21 @@ EM framework::
  20		return 0;
  21	}
  22
- 23	static int foo_cpufreq_init(struct cpufreq_policy *policy)
+ 23	static void foo_cpufreq_register_em(struct cpufreq_policy *policy)
  24	{
  25		struct em_data_callback em_cb = EM_DATA_CB(est_power);
  26		struct device *cpu_dev;
- 27		int nr_opp, ret;
+ 27		int nr_opp;
  28
  29		cpu_dev = get_cpu_device(cpumask_first(policy->cpus));
  30
- 31	/* Do the actual CPUFreq init work ...
*/ - 32 ret = do_foo_cpufreq_init(policy); - 33 if (ret) - 34 return ret; - 35 - 36 /* Find the number of OPPs for this policy */ - 37 nr_opp = foo_get_nr_opp(policy); + 31 /* Find the number of OPPs for this policy */ + 32 nr_opp = foo_get_nr_opp(policy); + 33 + 34 /* And register the new performance domain */ + 35 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); + 37 } 38 - 39 /* And register the new performance domain */ - 40 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); - 41 - 42 return 0; - 43 } + 39 static struct cpufreq_driver foo_cpufreq_driver = { + 40 .register_em = foo_cpufreq_register_em, + 41 }; diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst new file mode 100644 index 0000000000000000000000000000000000000000..9570e9c9e939765a3f2836eb2b4f219801f38f19 --- /dev/null +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -0,0 +1,256 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: :doc:`../../../cpu-freq/cpu-drivers` +:Translator: Yanteng Si + +.. _cn_cpu-drivers.rst: + + +======================================= +如何实现一个新的CPUFreq处理器驱动程序? +======================================= + +作者: + + + - Dominik Brodowski + - Rafael J. Wysocki + - Viresh Kumar + +.. Contents + + 1. 怎么做? + 1.1 初始化 + 1.2 Per-CPU 初始化 + 1.3 验证 + 1.4 target/target_index 或 setpolicy? + 1.5 target/target_index + 1.6 setpolicy + 1.7 get_intermediate 与 target_intermediate + 2. 频率表助手 + + + +1. 怎么做? +=========== + +如此,你刚刚得到了一个全新的CPU/芯片组及其数据手册,并希望为这个CPU/芯片组添加cpufreq +支持?很好,这里有一些至关重要的提示: + + +1.1 初始化 +---------- + +首先,在__initcall_level_7 (module_init())或更靠后的函数中检查这个内核是否 +运行在正确的CPU和正确的芯片组上。如果是,则使用cpufreq_register_driver()向 +CPUfreq核心层注册一个cpufreq_driver结构体。 + +结构体cpufreq_driver应该包含什么成员? + + .name - 驱动的名字。 + + .init - 一个指向per-policy初始化函数的指针。 + + .verify - 一个指向"verification"函数的指针。 + + .setpolicy 或 .fast_switch 或 .target 或 .target_index - 差异见 + 下文。 + +并且可选择 + + .flags - cpufreq核的提示。 + + .driver_data - cpufreq驱动程序的特定数据。 + + .resolve_freq - 返回最适合目标频率的频率。不过并不能改变频率。 + + .get_intermediate 和 target_intermediate - 用于在改变CPU频率时切换到稳定 + 的频率。 + + .get - 返回CPU的当前频率。 + + .bios_limit - 返回HW/BIOS对CPU的最大频率限制值。 + + .exit - 一个指向per-policy清理函数的指针,该函数在cpu热插拔过程的CPU_POST_DEAD + 阶段被调用。 + + .suspend - 一个指向per-policy暂停函数的指针,该函数在关中断且在该策略的调节器停止 + 后被调用。 + + .resume - 一个指向per-policy恢复函数的指针,该函数在关中断且在调节器再一次开始前被 + 调用。 + + .ready - 一个指向per-policy准备函数的指针,该函数在策略完全初始化之后被调用。 + + .attr - 一个指向NULL结尾的"struct freq_attr"列表的指针,该函数允许导出值到 + sysfs。 + + .boost_enabled - 如果设置,则启用提升(boost)频率。 + + .set_boost - 一个指向per-policy函数的指针,该函数用来开启/关闭提升(boost)频率功能。 + + +1.2 Per-CPU 初始化 +------------------ + +每当一个新的CPU被注册到设备模型中,或者在cpufreq驱动注册自己之后,如果此CPU的cpufreq策 +略不存在,则会调用per-policy的初始化函数cpufreq_driver.init。请注意,.init()和.exit()程序 +只对策略调用一次,而不是对策略管理的每个CPU调用一次。它需要一个 ``struct cpufreq_policy +*policy`` 作为参数。现在该怎么做呢? 
+ +如果有必要,请在你的CPU上激活CPUfreq功能支持。 + +然后,驱动程序必须填写以下数值: + ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.min_freq 和 | | +|policy->cpuinfo.max_freq | 该CPU支持的最低和最高频率(kHz) | +| | | +| | | ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.transition_latency | | +| | CPU在两个频率之间切换所需的时间,以 | +| | 纳秒为单位(如适用,否则指定 | +| | CPUFREQ_ETERNAL) | ++-----------------------------------+--------------------------------------+ +|policy->cur | 该CPU当前的工作频率(如适用) | +| | | ++-----------------------------------+--------------------------------------+ +|policy->min, | | +|policy->max, | | +|policy->policy and, if necessary, | | +|policy->governor | 必须包含该cpu的 “默认策略”。稍后 | +| | 会用这些值调用 | +| | cpufreq_driver.verify and either | +| | cpufreq_driver.setpolicy or | +| | cpufreq_driver.target/target_index | +| | | ++-----------------------------------+--------------------------------------+ +|policy->cpus | 用与这个CPU一起做DVFS的(在线+离线) | +| | CPU(即与它共享时钟/电压轨)的掩码更新 | +| | 这个 | +| | | ++-----------------------------------+--------------------------------------+ + +对于设置其中的一些值(cpuinfo.min[max]_freq, policy->min[max]),频率表助手可能会有帮 +助。关于它们的更多信息,请参见第2节。 + + +1.3 验证 +-------- + +当用户决定设置一个新的策略(由 “policy,governor,min,max组成”)时,必须对这个策略进行验证, +以便纠正不兼容的值。为了验证这些值,cpufreq_verify_within_limits(``struct cpufreq_policy +*policy``, ``unsigned int min_freq``, ``unsigned int max_freq``)函数可能会有帮助。 +关于频率表助手的详细内容请参见第2节。 + +您需要确保至少有一个有效频率(或工作范围)在 policy->min 和 policy->max 范围内。如果有必 +要,先增加policy->max,只有在没有办法的情况下,才减少policy->min。 + + +1.4 target 或 target_index 或 setpolicy 或 fast_switch? +------------------------------------------------------- + +大多数cpufreq驱动甚至大多数cpu频率升降算法只允许将CPU频率设置为预定义的固定值。对于这些,你 +可以使用->target(),->target_index()或->fast_switch()回调。 + +有些cpufreq功能的处理器可以自己在某些限制之间切换频率。这些应使用->setpolicy()回调。 + + +1.5. target/target_index +------------------------ + +target_index调用有两个参数:``struct cpufreq_policy * policy``和``unsigned int`` +索引(于列出的频率表)。 + +当调用这里时,CPUfreq驱动必须设置新的频率。实际频率必须由freq_table[index].frequency决定。 + +它应该总是在错误的情况下恢复到之前的频率(即policy->restore_freq),即使我们之前切换到中间频率。 + +已弃用 +---------- +目标调用有三个参数。``struct cpufreq_policy * policy``, unsigned int target_frequency, +unsigned int relation. + +CPUfreq驱动在调用这里时必须设置新的频率。实际的频率必须使用以下规则来确定。 + +- 紧跟 "目标频率"。 +- policy->min <= new_freq <= policy->max (这必须是有效的!!!) +- 如果 relation==CPUFREQ_REL_L,尝试选择一个高于或等于 target_freq 的 new_freq。("L代表 + 最低,但不能低于") +- 如果 relation==CPUFREQ_REL_H,尝试选择一个低于或等于 target_freq 的 new_freq。("H代表 + 最高,但不能高于") + +这里,频率表助手可能会帮助你--详见第2节。 + +1.6. fast_switch +---------------- + +这个函数用于从调度器的上下文进行频率切换。并非所有的驱动都要实现它,因为不允许在这个回调中睡眠。这 +个回调必须经过高度优化,以尽可能快地进行切换。 + +这个函数有两个参数: ``struct cpufreq_policy *policy`` 和 ``unsigned int target_frequency``。 + + +1.7 setpolicy +------------- + +setpolicy调用只需要一个``struct cpufreq_policy * policy``作为参数。需要将处理器内或芯片组内动态频 +率切换的下限设置为policy->min,上限设置为policy->max,如果支持的话,当policy->policy为 +CPUFREQ_POLICY_PERFORMANCE时选择面向性能的设置,当CPUFREQ_POLICY_POWERSAVE时选择面向省电的设置。 +也可以查看drivers/cpufreq/longrun.c中的参考实现。 + +1.8 get_intermediate 和 target_intermediate +-------------------------------------------- + +仅适用于 target_index() 和 CPUFREQ_ASYNC_NOTIFICATION 未设置的驱动。 + +get_intermediate应该返回一个平台想要切换到的稳定的中间频率,target_intermediate()应该将CPU设置为 +该频率,然后再跳转到'index'对应的频率。核心会负责发送通知,驱动不必在target_intermediate()或 +target_index()中处理。 + +在驱动程序不想因为某个目标频率切换到中间频率的情况下,它们可以从get_intermediate()中返回'0'。在这种情况 +下,核心将直接调用->target_index()。 + +注意:->target_index()应该在失败的情况下恢复到policy->restore_freq,因为core会为此发送通知。 + + +2. 
频率表助手 +============= + +由于大多数cpufreq处理器只允许被设置为几个特定的频率,因此,一个带有一些函数的 “频率表”可能会辅助处理器驱动 +程序的一些工作。这样的 "频率表" 由一个cpufreq_frequency_table条目构成的数组组成,"driver_data" 中包 +含了驱动程序的具体数值,"frequency" 中包含了相应的频率,并设置了标志。在表的最后,需要添加一个 +cpufreq_frequency_table条目,频率设置为CPUFREQ_TABLE_END。而如果想跳过表中的一个条目,则将频率设置为 +CPUFREQ_ENTRY_INVALID。这些条目不需要按照任何特定的顺序排序,但如果它们是cpufreq 核心会对它们进行快速的DVFS, +因为搜索最佳匹配会更快。 + +如果策略在其policy->freq_table字段中包含一个有效的指针,cpufreq表就会被核心自动验证。 + +cpufreq_frequency_table_verify()保证至少有一个有效的频率在policy->min和policy->max范围内,并且所有其他 +标准都被满足。这对->verify调用很有帮助。 + +cpufreq_frequency_table_target()是对应于->target阶段的频率表助手。只要把数值传递给这个函数,这个函数就会返 +回包含CPU要设置的频率的频率表条目。 + +以下宏可以作为cpufreq_frequency_table的迭代器。 + +cpufreq_for_each_entry(pos, table) - 遍历频率表的所有条目。 + +cpufreq_for_each_valid_entry(pos, table) - 该函数遍历所有条目,不包括CPUFREQ_ENTRY_INVALID频率。 +使用参数 "pos"-一个``cpufreq_frequency_table * `` 作为循环变量,使用参数 "table"-作为你想迭代 +的``cpufreq_frequency_table * `` 。 + +例如:: + + struct cpufreq_frequency_table *pos, *driver_freq_table; + + cpufreq_for_each_entry(pos, driver_freq_table) { + /* Do something with pos */ + pos->frequency = ... + } + +如果你需要在driver_freq_table中处理pos的位置,不要减去指针,因为它的代价相当高。相反,使用宏 +cpufreq_for_each_entry_idx() 和 cpufreq_for_each_valid_entry_idx() 。 diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 470299ee2fbaaf95e52f96001834e6d23c5c5101..cdc0f5e347b768ef828032b40284d345ff73bc22 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -13,6 +13,7 @@ #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref #endif /* Replace task scheduler's default cpu-invariant accounting */ @@ -24,6 +25,7 @@ /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #else diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9d58c2d8eb5e1475c6130a2f49cbe6033bf610e0..b5c7f7ec7b6c0bef80d4059725bc7dbc51329c01 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -669,6 +669,7 @@ CONFIG_CPU_FREQ_GOV_SEEP=m # # CONFIG_CPUFREQ_DT is not set CONFIG_ACPI_CPPC_CPUFREQ=y +CONFIG_ACPI_CPPC_CPUFREQ_FIE=y CONFIG_ARM_SCPI_CPUFREQ=m # CONFIG_ARM_QCOM_CPUFREQ_HW is not set # end of CPU Frequency scaling diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 479a1ee0e161df20b6e16e4a5ab88d70e19d894c..d6c730e2a9233713158bdf421fd90d2681fbc4e8 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -17,20 +17,13 @@ int pcibus_to_node(struct pci_bus *bus); #include void update_freq_counters_refs(void); -void topology_scale_freq_tick(void); - -#ifdef CONFIG_ARM64_AMU_EXTN -/* - * Replace task scheduler's default counter-based - * frequency-invariance scale factor setting. 
- */ -#define arch_scale_freq_tick topology_scale_freq_tick -#endif /* CONFIG_ARM64_AMU_EXTN */ /* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref #ifdef CONFIG_ACPI_CPPC_LIB #define arch_init_invariance_cppc topology_init_cpu_capacity_cppc @@ -45,6 +38,7 @@ void topology_scale_freq_tick(void); /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #include diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 23707812f87a624b604ac0d13b557e8603800a18..84704529569b7bfac7cb1476b1150d711145b25b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -574,6 +574,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 4c7f3687356d38538a8ed30dd89678e5745d0cfb..893d31fdd3747aa95bf3a66e14b4fe9579c68fdb 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -170,7 +169,12 @@ int __init parse_acpi_topology(void) #undef pr_fmt #define pr_fmt(fmt) "AMU: " fmt -static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale); +/* + * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until + * the CPU capacity and its associated frequency have been correctly + * initialized. 
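+ * The seed is deliberately larger than the ratios freq_inv_set_max_ratio()
+ * computes in practice, so until that function runs the scale derived in
+ * amu_scale_freq_tick() overshoots and is capped by its
+ * min_t(..., SCHED_CAPACITY_SCALE) clamp.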
+ */ +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); static DEFINE_PER_CPU(u64, arch_const_cycles_prev); static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; @@ -200,14 +204,14 @@ static inline bool freq_counters_valid(int cpu) return true; } -static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) +void freq_inv_set_max_ratio(int cpu, u64 max_rate) { - u64 ratio; + u64 ratio, ref_rate = arch_timer_get_rate(); if (unlikely(!max_rate || !ref_rate)) { - pr_debug("CPU%d: invalid maximum or reference frequency.\n", + WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n", cpu); - return -EINVAL; + return; } /* @@ -227,108 +231,16 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) ratio = div64_u64(ratio, max_rate); if (!ratio) { WARN_ONCE(1, "Reference frequency too low.\n"); - return -EINVAL; - } - - per_cpu(arch_max_freq_scale, cpu) = (unsigned long)ratio; - - return 0; -} - -static inline bool -enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) -{ - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) { - pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return false; - } - - if (cpumask_subset(policy->related_cpus, valid_cpus)) - cpumask_or(amu_fie_cpus, policy->related_cpus, - amu_fie_cpus); - - cpufreq_cpu_put(policy); - - return true; -} - -static DEFINE_STATIC_KEY_FALSE(amu_fie_key); -#define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) - -static int __init init_amu_fie(void) -{ - cpumask_var_t valid_cpus; - bool have_policy = false; - int ret = 0; - int cpu; - - if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { - ret = -ENOMEM; - goto free_valid_mask; - } - - for_each_present_cpu(cpu) { - if (!freq_counters_valid(cpu) || - freq_inv_set_max_ratio(cpu, - cpufreq_get_hw_max_freq(cpu) * 1000ULL, - arch_timer_get_rate())) - continue; - - cpumask_set_cpu(cpu, valid_cpus); - have_policy |= enable_policy_freq_counters(cpu, valid_cpus); - } - - /* - * If we are not restricted by cpufreq policies, we only enable - * the use of the AMU feature for FIE if all CPUs support AMU. - * Otherwise, enable_policy_freq_counters has already enabled - * policy cpus. - */ - if (!have_policy && cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_or(amu_fie_cpus, amu_fie_cpus, valid_cpus); - - if (!cpumask_empty(amu_fie_cpus)) { - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); - static_branch_enable(&amu_fie_key); + return; } - /* - * If the system is not fully invariant after AMU init, disable - * partial use of counters for frequency invariance. 
- */ - if (!topology_scale_freq_invariant()) - static_branch_disable(&amu_fie_key); - -free_valid_mask: - free_cpumask_var(valid_cpus); - - return ret; + WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio); } -late_initcall_sync(init_amu_fie); -bool arch_freq_counters_available(const struct cpumask *cpus) -{ - return amu_freq_invariant() && - cpumask_subset(cpus, amu_fie_cpus); -} - -void topology_scale_freq_tick(void) +static void amu_scale_freq_tick(void) { u64 prev_core_cnt, prev_const_cnt; u64 core_cnt, const_cnt, scale; - int cpu = smp_processor_id(); - - if (!amu_freq_invariant()) - return; - - if (!cpumask_test_cpu(cpu, amu_fie_cpus)) - return; prev_const_cnt = this_cpu_read(arch_const_cycles_prev); prev_core_cnt = this_cpu_read(arch_core_cycles_prev); @@ -340,7 +252,7 @@ void topology_scale_freq_tick(void) if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) - goto store_and_exit; + return; /* * /\core arch_max_freq_scale @@ -356,12 +268,55 @@ void topology_scale_freq_tick(void) const_cnt - prev_const_cnt); scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); - this_cpu_write(freq_scale, (unsigned long)scale); + this_cpu_write(arch_freq_scale, (unsigned long)scale); +} + +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; + +static void amu_fie_setup(unsigned int cpu) +{ + if (cpumask_test_cpu(cpu, amu_fie_cpus)) + return; + + if (!freq_counters_valid(cpu)) + return; -store_and_exit: - this_cpu_write(arch_core_cycles_prev, core_cnt); - this_cpu_write(arch_const_cycles_prev, const_cnt); + cpumask_set_cpu(cpu, amu_fie_cpus); + + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); + + pr_debug("CPUs[%u]: counters will be used for FIE.", cpu); +} + +static int cpuhp_topology_online(unsigned int cpu) +{ + amu_fie_setup(cpu); + + return 0; +} + +static int __init init_amu_fie(void) +{ + int ret; + + if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) + return -ENOMEM; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "arm64/topology:online", + cpuhp_topology_online, + NULL); + if (ret < 0) { + free_cpumask_var(amu_fie_cpus); + return ret; + } + + return 0; } +core_initcall(init_amu_fie); #ifdef CONFIG_SCHED_SOFT_QUOTA static DEFINE_PER_CPU(int, sibling_idle) = 1; diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h new file mode 100644 index 0000000000000000000000000000000000000000..76e6b9079e86a85d3f5c10ad384284e60ada6400 --- /dev/null +++ b/arch/riscv/include/asm/topology.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_TOPOLOGY_H +#define _ASM_RISCV_TOPOLOGY_H + +#include + +/* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick +#define arch_set_freq_scale topology_set_freq_scale +#define arch_scale_freq_capacity topology_get_freq_scale +#define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref + +/* Replace task scheduler's default cpu-invariant accounting */ +#define arch_scale_cpu_capacity topology_get_cpu_scale + +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + +#include +#endif /* _ASM_RISCV_TOPOLOGY_H */ diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index ad895e437df904aedaa85e30c9a2956a2fe29b4c..989ee23ec2b0dffb6bad274d6fa7fb3ac0a18b31 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ 
-33,13 +33,15 @@ #define pr_fmt(fmt) "ACPI CPPC: " fmt -#include #include #include #include #include #include #include +#include +#include +#include #include @@ -441,108 +443,72 @@ bool acpi_cpc_valid(void) EXPORT_SYMBOL_GPL(acpi_cpc_valid); /** - * acpi_get_psd_map - Map the CPUs in a common freq domain. - * @all_cpu_data: Ptrs to CPU specific CPPC data including PSD info. + * acpi_get_psd_map - Map the CPUs in the freq domain of a given cpu + * @cpu: Find all CPUs that share a domain with cpu. + * @cpu_data: Pointer to CPU specific CPPC data including PSD info. * * Return: 0 for success or negative value for err. */ -static int __acpi_get_psd_map(struct cppc_cpudata **all_cpu_data, struct cpc_desc **cpc_pptr) +static int __acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data, struct cpc_desc **cpc_pptr) { - int count_target; - int retval = 0; - unsigned int i, j; - cpumask_var_t covered_cpus; - struct cppc_cpudata *pr, *match_pr; - struct acpi_psd_package *pdomain; - struct acpi_psd_package *match_pdomain; struct cpc_desc *cpc_ptr, *match_cpc_ptr; - - if (!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)) - return -ENOMEM; + struct acpi_psd_package *match_pdomain; + struct acpi_psd_package *pdomain; + int count_target, i; /* * Now that we have _PSD data from all CPUs, let's setup P-state * domain info. */ - for_each_possible_cpu(i) { - if (cpumask_test_cpu(i, covered_cpus)) - continue; + cpc_ptr = cpc_pptr[cpu]; + if (!cpc_ptr) + return -EFAULT; - pr = all_cpu_data[i]; - cpc_ptr = cpc_pptr[i]; - if (!cpc_ptr) { - retval = -EFAULT; - goto err_ret; - } + pdomain = &(cpc_ptr->domain_info); + cpumask_set_cpu(cpu, cpu_data->shared_cpu_map); + if (pdomain->num_processors <= 1) + return 0; - pdomain = &(cpc_ptr->domain_info); - cpumask_set_cpu(i, pr->shared_cpu_map); - cpumask_set_cpu(i, covered_cpus); - if (pdomain->num_processors <= 1) - continue; + /* Validate the Domain info */ + count_target = pdomain->num_processors; + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_HW; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ANY; - /* Validate the Domain info */ - count_target = pdomain->num_processors; - if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) - pr->shared_type = CPUFREQ_SHARED_TYPE_ALL; - else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) - pr->shared_type = CPUFREQ_SHARED_TYPE_HW; - else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) - pr->shared_type = CPUFREQ_SHARED_TYPE_ANY; - - for_each_possible_cpu(j) { - if (i == j) - continue; - - match_cpc_ptr = cpc_pptr[j]; - if (!match_cpc_ptr) { - retval = -EFAULT; - goto err_ret; - } + for_each_possible_cpu(i) { + if (i == cpu) + continue; - match_pdomain = &(match_cpc_ptr->domain_info); - if (match_pdomain->domain != pdomain->domain) - continue; + match_cpc_ptr = cpc_pptr[i]; + if (!match_cpc_ptr) + goto err_fault; - /* Here i and j are in the same domain */ - if (match_pdomain->num_processors != count_target) { - retval = -EFAULT; - goto err_ret; - } + match_pdomain = &(match_cpc_ptr->domain_info); + if (match_pdomain->domain != pdomain->domain) + continue; - if (pdomain->coord_type != match_pdomain->coord_type) { - retval = -EFAULT; - goto err_ret; - } + /* Here i and cpu are in the same domain */ + if (match_pdomain->num_processors != count_target) + goto err_fault; - cpumask_set_cpu(j, 
covered_cpus); - cpumask_set_cpu(j, pr->shared_cpu_map); - } + if (pdomain->coord_type != match_pdomain->coord_type) + goto err_fault; - for_each_cpu(j, pr->shared_cpu_map) { - if (i == j) - continue; - - match_pr = all_cpu_data[j]; - match_pr->shared_type = pr->shared_type; - cpumask_copy(match_pr->shared_cpu_map, - pr->shared_cpu_map); - } + cpumask_set_cpu(i, cpu_data->shared_cpu_map); } - goto out; -err_ret: - for_each_possible_cpu(i) { - pr = all_cpu_data[i]; + return 0; - /* Assume no coordination on any error parsing domain info */ - cpumask_clear(pr->shared_cpu_map); - cpumask_set_cpu(i, pr->shared_cpu_map); - pr->shared_type = CPUFREQ_SHARED_TYPE_ALL; - } -out: - free_cpumask_var(covered_cpus); - return retval; +err_fault: + /* Assume no coordination on any error parsing domain info */ + cpumask_clear(cpu_data->shared_cpu_map); + cpumask_set_cpu(cpu, cpu_data->shared_cpu_map); + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ALL; + + return -EFAULT; } static acpi_status acpi_parse_cpc(acpi_handle handle, u32 lvl, void *data, @@ -586,7 +552,7 @@ static acpi_status acpi_parse_cpc(acpi_handle handle, u32 lvl, void *data, return AE_OK; } -int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) +int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data) { struct cpc_desc **cpc_pptr, *cpc_ptr; int parsed_core_num = 0; @@ -617,7 +583,7 @@ int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) goto out; } - ret = __acpi_get_psd_map(all_cpu_data, cpc_pptr); + ret = __acpi_get_psd_map(cpu, cpu_data, cpc_pptr); out: for_each_possible_cpu(i) { @@ -1441,6 +1407,48 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) } EXPORT_SYMBOL_GPL(cppc_get_perf_caps); +/** + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. + * + * CPPC has flexibility about how CPU performance counters are accessed. + * One of the choices is PCC regions, which can have a high access latency. This + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. + * + * Return: true if any of the counters are in PCC regions, false otherwise + */ +bool cppc_perf_ctrs_in_pcc(void) +{ + int cpu; + + for_each_present_cpu(cpu) { + struct cpc_register_resource *ref_perf_reg; + struct cpc_desc *cpc_desc; + + cpc_desc = per_cpu(cpc_desc_ptr, cpu); + + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) + return true; + + + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; + + /* + * If reference perf register is not supported then we should + * use the nominal perf value + */ + if (!CPC_SUPPORTED(ref_perf_reg)) + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; + + if (CPC_IN_PCC(ref_perf_reg)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); + /** * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. * @cpunum: CPU from which to read counters. 
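For illustration, here is a minimal sketch (not part of this patch) of how a
consumer can turn two feedback-counter snapshots into a delivered-performance
level. The cppc-cpufreq driver's cppc_perf_from_fbctrs(), referenced later in
this series, plays this role; the helper name below is hypothetical, the field
names come from struct cppc_perf_fb_ctrs in include/acpi/cppc_acpi.h, and the
formula (reference performance scaled by the delivered/reference cycle deltas)
is an assumption consistent with the counter semantics above::

	/*
	 * Illustrative only: derive a delivered-performance level from two
	 * cppc_get_perf_ctrs() snapshots taken at t0 and t1.
	 */
	static u64 perf_from_snapshots(u64 ref_perf,
				       struct cppc_perf_fb_ctrs *t0,
				       struct cppc_perf_fb_ctrs *t1)
	{
		u64 delta_ref = t1->reference - t0->reference;
		u64 delta_delivered = t1->delivered - t0->delivered;

		/* No elapsed reference cycles yet: report the reference level. */
		if (!delta_ref)
			return ref_perf;

		return div64_u64(ref_perf * delta_delivered, delta_ref);
	}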
@@ -1754,3 +1762,116 @@ unsigned int cppc_get_transition_latency(int cpu_num) return latency_ns; } EXPORT_SYMBOL_GPL(cppc_get_transition_latency); + +/* Minimum struct length needed for the DMI processor entry we want */ +#define DMI_ENTRY_PROCESSOR_MIN_LENGTH 48 + +/* Offset in the DMI processor structure for the max frequency */ +#define DMI_PROCESSOR_MAX_SPEED 0x14 + +/* Callback function used to retrieve the max frequency from DMI */ +static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) +{ + const u8 *dmi_data = (const u8 *)dm; + u16 *mhz = (u16 *)private; + + if (dm->type == DMI_ENTRY_PROCESSOR && + dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) { + u16 val = (u16)get_unaligned((const u16 *) + (dmi_data + DMI_PROCESSOR_MAX_SPEED)); + *mhz = val > *mhz ? val : *mhz; + } +} + +/* Look up the max frequency in DMI */ +static u64 cppc_get_dmi_max_khz(void) +{ + u16 mhz = 0; + + dmi_walk(cppc_find_dmi_mhz, &mhz); + + /* + * Real stupid fallback value, just in case there is no + * actual value set. + */ + mhz = mhz ? mhz : 1; + + return KHZ_PER_MHZ * mhz; +} + +/* + * If CPPC lowest_freq and nominal_freq registers are exposed then we can + * use them to convert perf to freq and vice versa. The conversion is + * extrapolated as an affine function passing by the 2 points: + * - (Low perf, Low freq) + * - (Nominal perf, Nominal freq) + */ +unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf) +{ + s64 retval, offset = 0; + static u64 max_khz; + u64 mul, div; + + if (caps->lowest_freq && caps->nominal_freq) { + /* Avoid the special case when nominal_freq is equal to lowest_freq */ + if (caps->nominal_freq == caps->lowest_freq) { + mul = caps->nominal_freq; + div = caps->nominal_perf; + } else { + mul = caps->nominal_freq - caps->lowest_freq; + div = caps->nominal_perf - caps->lowest_perf; + } + mul *= KHZ_PER_MHZ; + offset = caps->nominal_freq * KHZ_PER_MHZ - + div64_u64(caps->nominal_perf * mul, div); + } else { + if (!max_khz) + max_khz = cppc_get_dmi_max_khz(); + mul = max_khz; + div = caps->highest_perf; + } + + retval = offset + div64_u64(perf * mul, div); + if (retval >= 0) + return retval; + return 0; +} +EXPORT_SYMBOL_GPL(cppc_perf_to_khz); + +unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int freq) +{ + s64 retval, offset = 0; + static u64 max_khz; + u64 mul, div; + + if (caps->lowest_freq && caps->nominal_freq) { + /* Avoid special case when nominal_freq is equal to lowest_freq */ + if (caps->nominal_freq == caps->lowest_freq) { + mul = caps->nominal_perf; + div = caps->nominal_freq; + } else { + mul = caps->nominal_perf - caps->lowest_perf; + div = caps->nominal_freq - caps->lowest_freq; + } + /* + * We don't need to convert to kHz for computing offset and can + * directly use nominal_freq and lowest_freq as the div64_u64 + * will remove the frequency unit. + */ + offset = caps->nominal_perf - + div64_u64(caps->nominal_freq * mul, div); + /* But we need it for computing the perf level. 
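+	 * (freq is passed to this function in kHz while the CPPC
+	 * lowest_freq/nominal_freq capabilities are expressed in MHz, so the
+	 * divisor must carry the kHz unit before freq * mul is divided below.)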
*/ + div *= KHZ_PER_MHZ; + } else { + if (!max_khz) + max_khz = cppc_get_dmi_max_khz(); + mul = caps->highest_perf; + div = max_khz; + } + + retval = offset + div64_u64(freq * mul, div); + if (retval >= 0) + return retval; + return 0; +} +EXPORT_SYMBOL_GPL(cppc_khz_to_perf); diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index ba1c8e1ff0b63e73fc13adf4681450f30ff97b23..6050bff683fc1d6834227835f3ecb90def3e37f6 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -18,20 +18,116 @@ #include #include #include +#include #include #include +#include + + +static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); +static struct cpumask scale_freq_counters_mask; +static bool scale_freq_invariant; +DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1; +EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref); + +static bool supports_scale_freq_counters(const struct cpumask *cpus) +{ + return cpumask_subset(cpus, &scale_freq_counters_mask); +} bool topology_scale_freq_invariant(void) { return cpufreq_supports_freq_invariance() || - arch_freq_counters_available(cpu_online_mask); + supports_scale_freq_counters(cpu_online_mask); } -__weak bool arch_freq_counters_available(const struct cpumask *cpus) +static void update_scale_freq_invariant(bool status) { - return false; + if (scale_freq_invariant == status) + return; + + /* + * Task scheduler behavior depends on frequency invariance support, + * either cpufreq or counter driven. If the support status changes as + * a result of counter initialisation and use, retrigger the build of + * scheduling domains to ensure the information is propagated properly. + */ + if (topology_scale_freq_invariant() == status) { + scale_freq_invariant = status; + rebuild_sched_domains_energy(); + } +} + +void topology_set_scale_freq_source(struct scale_freq_data *data, + const struct cpumask *cpus) +{ + struct scale_freq_data *sfd; + int cpu; + + /* + * Avoid calling rebuild_sched_domains() unnecessarily if FIE is + * supported by cpufreq. + */ + if (cpumask_empty(&scale_freq_counters_mask)) + scale_freq_invariant = topology_scale_freq_invariant(); + + rcu_read_lock(); + + for_each_cpu(cpu, cpus) { + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); + + /* Use ARCH provided counters whenever possible */ + if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) { + rcu_assign_pointer(per_cpu(sft_data, cpu), data); + cpumask_set_cpu(cpu, &scale_freq_counters_mask); + } + } + + rcu_read_unlock(); + + update_scale_freq_invariant(true); +} +EXPORT_SYMBOL_GPL(topology_set_scale_freq_source); + +void topology_clear_scale_freq_source(enum scale_freq_source source, + const struct cpumask *cpus) +{ + struct scale_freq_data *sfd; + int cpu; + + rcu_read_lock(); + + for_each_cpu(cpu, cpus) { + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); + + if (sfd && sfd->source == source) { + rcu_assign_pointer(per_cpu(sft_data, cpu), NULL); + cpumask_clear_cpu(cpu, &scale_freq_counters_mask); + } + } + + rcu_read_unlock(); + + /* + * Make sure all references to previous sft_data are dropped to avoid + * use-after-free races. 
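+ * topology_scale_freq_tick() reads sft_data via rcu_dereference_sched(),
+ * i.e. from a preemption-disabled section, which the synchronize_rcu()
+ * below also waits for on kernels with the consolidated RCU flavors.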
+ */ + synchronize_rcu(); + + update_scale_freq_invariant(false); } -DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source); + +void topology_scale_freq_tick(void) +{ + struct scale_freq_data *sfd = rcu_dereference_sched(*this_cpu_ptr(&sft_data)); + + if (sfd) + sfd->set_freq_scale(); +} + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) @@ -47,16 +143,17 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, * want to update the scale factor with information from CPUFREQ. * Instead the scale factor will be updated from arch_scale_freq_tick. */ - if (arch_freq_counters_available(cpus)) + if (supports_scale_freq_counters(cpus)) return; scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; for_each_cpu(i, cpus) - per_cpu(freq_scale, i) = scale; + per_cpu(arch_freq_scale, i) = scale; } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale); void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) { @@ -74,6 +171,44 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); } +/** + * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * @cpus : The related CPUs for which capacity has been reduced + * @capped_freq : The maximum allowed frequency that CPUs can run at + * + * Update the value of thermal pressure for all @cpus in the mask. The + * cpumask should include all (online+offline) affected CPUs, to avoid + * operating on stale data when hot-plug is used for some CPUs. The + * @capped_freq reflects the currently allowed max CPUs frequency due to + * thermal capping. It might be also a boost frequency value, which is bigger + * than the internal 'capacity_freq_ref' max frequency. In such case the + * pressure value should simply be removed, since this is an indication that + * there is no thermal throttling. The @capped_freq must be provided in kHz. + */ +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq) +{ + unsigned long max_capacity, capacity; + u32 max_freq; + int cpu; + + cpu = cpumask_first(cpus); + max_capacity = arch_scale_cpu_capacity(cpu); + max_freq = arch_scale_freq_ref(cpu); + + /* + * Handle properly the boost frequencies, which should simply clean + * the thermal pressure value. 
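+ *
+ * Example (illustrative numbers only): with max_capacity = 1024,
+ * capacity_freq_ref = 2000000 (kHz) and capped_freq = 1500000 (kHz), the
+ * capped capacity is mult_frac(1024, 1500000, 2000000) = 768, so a thermal
+ * pressure of 1024 - 768 = 256 gets written for every CPU in @cpus.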
+ */ + if (max_freq <= capped_freq) + capacity = max_capacity; + else + capacity = mult_frac(max_capacity, capped_freq, max_freq); + + arch_set_thermal_pressure(cpus, max_capacity - capacity); +} +EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); + static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -131,7 +266,6 @@ static void update_topology_flags_workfn(struct work_struct *work) arch_rebuild_cpu_topology(); } -static DEFINE_PER_CPU(u32, freq_factor) = 1; static u32 *raw_capacity; static int free_raw_capacity(void) @@ -153,13 +287,13 @@ void topology_normalize_cpu_scale(void) capacity_scale = 1; for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity_scale = max(capacity, capacity_scale); } pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale); for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); @@ -195,15 +329,15 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) cpu_node, raw_capacity[cpu]); /* - * Update freq_factor for calculating early boot cpu capacities. + * Update capacity_freq_ref for calculating early boot CPU capacities. * For non-clk CPU DVFS mechanism, there's no way to get the * frequency value now, assuming they are running at the same - * frequency (by keeping the initial freq_factor value). + * frequency (by keeping the initial capacity_freq_ref value). */ cpu_clk = of_clk_get(cpu_node, 0); if (!PTR_ERR_OR_ZERO(cpu_clk)) { - per_cpu(freq_factor, cpu) = - clk_get_rate(cpu_clk) / 1000; + per_cpu(capacity_freq_ref, cpu) = + clk_get_rate(cpu_clk) / HZ_PER_KHZ; clk_put(cpu_clk); } } else { @@ -219,11 +353,16 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) return !ret; } +void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate) +{ +} + #ifdef CONFIG_ACPI_CPPC_LIB #include void topology_init_cpu_capacity_cppc(void) { + u64 capacity, capacity_scale = 0; struct cppc_perf_caps perf_caps; int cpu; @@ -240,6 +379,10 @@ void topology_init_cpu_capacity_cppc(void) (perf_caps.highest_perf >= perf_caps.nominal_perf) && (perf_caps.highest_perf >= perf_caps.lowest_perf)) { raw_capacity[cpu] = perf_caps.highest_perf; + capacity_scale = max_t(u64, capacity_scale, raw_capacity[cpu]); + + per_cpu(capacity_freq_ref, cpu) = cppc_perf_to_khz(&perf_caps, raw_capacity[cpu]); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%u (raw).\n", cpu, raw_capacity[cpu]); continue; @@ -250,7 +393,18 @@ void topology_init_cpu_capacity_cppc(void) goto exit; } - topology_normalize_cpu_scale(); + for_each_possible_cpu(cpu) { + freq_inv_set_max_ratio(cpu, + per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); + + capacity = raw_capacity[cpu]; + capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, + capacity_scale); + topology_set_cpu_scale(cpu, capacity); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", + cpu, topology_get_cpu_scale(cpu)); + } + schedule_work(&update_topology_flags_work); pr_debug("cpu_capacity: cpu_capacity initialization done\n"); @@ -272,9 +426,6 @@ init_cpu_capacity_callback(struct notifier_block *nb, struct cpufreq_policy *policy = data; int cpu; - if (!raw_capacity) - return 0; - if (val != CPUFREQ_CREATE_POLICY) return 0; @@ -284,13 +435,18 @@ 
init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); - for_each_cpu(cpu, policy->related_cpus) - per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000; + for_each_cpu(cpu, policy->related_cpus) { + per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq; + freq_inv_set_max_ratio(cpu, + per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); + } if (cpumask_empty(cpus_to_visit)) { - topology_normalize_cpu_scale(); - schedule_work(&update_topology_flags_work); - free_raw_capacity(); + if (raw_capacity) { + topology_normalize_cpu_scale(); + schedule_work(&update_topology_flags_work); + free_raw_capacity(); + } pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); } @@ -310,7 +466,7 @@ static int __init register_cpufreq_notifier(void) * On ACPI-based systems skip registering cpufreq notifier as cpufreq * information is not needed for cpu capacity initialization. */ - if (!acpi_disabled || !raw_capacity) + if (!acpi_disabled) return -EINVAL; if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 67e574d0d48148193764d05f827254b0f61d00df..1280a1ec86d62637d4b76ac3ab1f27e64d3abd90 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ If in doubt, say N. +config ACPI_CPPC_CPUFREQ_FIE + bool "Frequency Invariance support for CPPC cpufreq driver" + depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY + default y + help + This extends frequency invariance support in the CPPC cpufreq driver, + by using CPPC delivered and reference performance counters. + + If in doubt, say N. + config CPPC_CPUFREQ_SYSFS_INTERFACE bool "Enable CPPC CPUFreq sysfs tuning interfaces" depends on ACPI_CPPC_CPUFREQ diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 3f91509091265f15d1fcbc218c6abf73a871c8fc..1d5d0af10ffd04886e9bceef60bbc1b9a27948f9 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -534,15 +534,6 @@ static void free_acpi_perf_data(void) free_percpu(acpi_perf_data); } -static int cpufreq_boost_online(unsigned int cpu) -{ - /* - * On the CPU_UP path we simply keep the boost-disable flag - * in sync with the current global state. 
- */ - return boost_set_msr(acpi_cpufreq_driver.boost_enabled); -} - static int cpufreq_boost_down_prep(unsigned int cpu) { /* @@ -889,6 +880,14 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->fast_switch_possible = !acpi_pstate_strict && !(policy_is_shared(policy) && policy->shared_type != CPUFREQ_SHARED_TYPE_ANY); + if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) + pr_warn(FW_WARN "P-state 0 is not max freq\n"); + + if (acpi_cpufreq_driver.set_boost) { + set_boost(policy, acpi_cpufreq_driver.boost_enabled); + policy->boost_enabled = acpi_cpufreq_driver.boost_enabled; + } + return result; err_unreg: @@ -908,6 +907,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) pr_debug("%s\n", __func__); + cpufreq_boost_down_prep(policy->cpu); policy->fast_switch_possible = false; policy->driver_data = NULL; acpi_processor_unregister_performance(data->acpi_perf_cpu); @@ -918,16 +918,6 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) return 0; } -static void acpi_cpufreq_cpu_ready(struct cpufreq_policy *policy) -{ - struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data, - policy->cpu); - unsigned int freq = policy->freq_table[0].frequency; - - if (perf->states[0].core_frequency * 1000 != freq) - pr_warn(FW_WARN "P-state 0 is not max freq\n"); -} - static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = policy->driver_data; @@ -955,18 +945,13 @@ static struct cpufreq_driver acpi_cpufreq_driver = { .bios_limit = acpi_processor_get_bios_limit, .init = acpi_cpufreq_cpu_init, .exit = acpi_cpufreq_cpu_exit, - .ready = acpi_cpufreq_cpu_ready, .resume = acpi_cpufreq_resume, .name = "acpi-cpufreq", .attr = acpi_cpufreq_attr, }; -static enum cpuhp_state acpi_cpufreq_online; - static void __init acpi_cpufreq_boost_init(void) { - int ret; - if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA))) { pr_debug("Boost capabilities not present in the processor\n"); return; @@ -974,24 +959,6 @@ static void __init acpi_cpufreq_boost_init(void) acpi_cpufreq_driver.set_boost = set_boost; acpi_cpufreq_driver.boost_enabled = boost_state(0); - - /* - * This calls the online callback on all online cpu and forces all - * MSRs to the same value. 
- */ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online", - cpufreq_boost_online, cpufreq_boost_down_prep); - if (ret < 0) { - pr_err("acpi_cpufreq: failed to register hotplug callbacks\n"); - return; - } - acpi_cpufreq_online = ret; -} - -static void acpi_cpufreq_boost_exit(void) -{ - if (acpi_cpufreq_online > 0) - cpuhp_remove_state_nocalls(acpi_cpufreq_online); } static int __init acpi_cpufreq_init(void) @@ -1035,7 +1002,6 @@ static int __init acpi_cpufreq_init(void) ret = cpufreq_register_driver(&acpi_cpufreq_driver); if (ret) { free_acpi_perf_data(); - acpi_cpufreq_boost_exit(); } return ret; } @@ -1044,8 +1010,6 @@ static void __exit acpi_cpufreq_exit(void) { pr_debug("%s\n", __func__); - acpi_cpufreq_boost_exit(); - cpufreq_unregister_driver(&acpi_cpufreq_driver); free_acpi_perf_data(); diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 01e7cbd910b9e1e9f64c20f10be83da2a797592e..39436127cb2d8bbbbbc7d8514f8ffce63be8ff25 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -10,33 +10,22 @@ #define pr_fmt(fmt) "CPPC Cpufreq:" fmt +#include #include #include #include #include #include -#include +#include +#include #include #include +#include #include #include -/* Minimum struct length needed for the DMI processor entry we want */ -#define DMI_ENTRY_PROCESSOR_MIN_LENGTH 48 - -/* Offest in the DMI processor structure for the max frequency */ -#define DMI_PROCESSOR_MAX_SPEED 0x14 - -/* - * These structs contain information parsed from per CPU - * ACPI _CPC structures. - * e.g. For each CPU the highest, lowest supported - * performance capabilities, desired performance level - * requested etc. - */ -static struct cppc_cpudata **all_cpu_data; static bool boost_supported; struct cppc_workaround_oem_info { @@ -63,116 +52,252 @@ struct fb_ctr_pair { struct cppc_perf_fb_ctrs fb_ctrs_t1; }; -/* Callback function used to retrieve the max frequency from DMI */ -static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) +static enum { + FIE_UNSET = -1, + FIE_ENABLED, + FIE_DISABLED +} fie_disabled = FIE_UNSET; + +static struct cpufreq_driver cppc_cpufreq_driver; + +#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE +module_param(fie_disabled, int, 0444); +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); + +/* Frequency invariance support */ +struct cppc_freq_invariance { + int cpu; + struct irq_work irq_work; + struct kthread_work work; + struct cppc_perf_fb_ctrs prev_perf_fb_ctrs; + struct cppc_cpudata *cpu_data; +}; + +static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); +static struct kthread_worker *kworker_fie; + +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1); + +/** + * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance + * @work: The work item. + * + * The CPPC driver register itself with the topology core to provide its own + * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which + * gets called by the scheduler on every tick. + * + * Note that the arch specific counters have higher priority than CPPC counters, + * if available, though the CPPC driver doesn't need to have any special + * handling for that. 
+ * + * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we + * reach here from hard-irq context), which then schedules a normal work item + * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable + * based on the counter updates since the last tick. + */ +static void cppc_scale_freq_workfn(struct kthread_work *work) { - const u8 *dmi_data = (const u8 *)dm; - u16 *mhz = (u16 *)private; + struct cppc_freq_invariance *cppc_fi; + struct cppc_perf_fb_ctrs fb_ctrs = {0}; + struct cppc_cpudata *cpu_data; + unsigned long local_freq_scale; + u64 perf; + int ret; + + cppc_fi = container_of(work, struct cppc_freq_invariance, work); + cpu_data = cppc_fi->cpu_data; - if (dm->type == DMI_ENTRY_PROCESSOR && - dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) { - u16 val = (u16)get_unaligned((const u16 *) - (dmi_data + DMI_PROCESSOR_MAX_SPEED)); - *mhz = val > *mhz ? val : *mhz; + ret = cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs); + /* + * Perf counters could be 0 if the cpu is in a low-power idle state. + * Just try it again next time. + */ + if (ret == -EFAULT) + return; + + if (ret) { + pr_warn("%s: failed to read perf counters\n", __func__); + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, + cpu_data->shared_cpu_map); + return; } + + perf = cppc_perf_from_fbctrs(cpu_data, &cppc_fi->prev_perf_fb_ctrs, + &fb_ctrs); + cppc_fi->prev_perf_fb_ctrs = fb_ctrs; + + perf <<= SCHED_CAPACITY_SHIFT; + local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf); + + /* This can happen due to counter's overflow */ + if (unlikely(local_freq_scale > 1024)) + local_freq_scale = 1024; + + per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale; } -/* Look up the max frequency in DMI */ -static u64 cppc_get_dmi_max_khz(void) +static void cppc_irq_work(struct irq_work *irq_work) { - u16 mhz = 0; + struct cppc_freq_invariance *cppc_fi; + + cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work); + kthread_queue_work(kworker_fie, &cppc_fi->work); +} - dmi_walk(cppc_find_dmi_mhz, &mhz); +static void cppc_scale_freq_tick(void) +{ + struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id()); /* - * Real stupid fallback value, just in case there is no - * actual value set. + * cppc_get_perf_ctrs() can potentially sleep, call that from the right + * context. */ - mhz = mhz ? mhz : 1; + irq_work_queue(&cppc_fi->irq_work); +} + +static struct scale_freq_data cppc_sftd = { + .source = SCALE_FREQ_SOURCE_CPPC, + .set_freq_scale = cppc_scale_freq_tick, +}; + +static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ + struct cppc_freq_invariance *cppc_fi; + int cpu, ret; + + if (fie_disabled) + return; - return (1000 * mhz); + for_each_cpu(cpu, policy->cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + cppc_fi->cpu = cpu; + cppc_fi->cpu_data = policy->driver_data; + kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn); + init_irq_work(&cppc_fi->irq_work, cppc_irq_work); + + ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); + if (ret && cpu_online(cpu)) { + /* + * Don't abort if the CPU was offline while the driver + * was getting registered. + */ + pr_debug("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + return; + } + } + + /* Register for freq-invariance */ + topology_set_scale_freq_source(&cppc_sftd, policy->cpus); } /* - * If CPPC lowest_freq and nominal_freq registers are exposed then we can - * use them to convert perf to freq and vice versa. 
The conversion is - * extrapolated as an affine function passing by the 2 points: - * - (Low perf, Low freq) - * - (Nominal perf, Nominal freq) + * We free all the resources on policy's removal and not on CPU removal as the + * irq-work are per-cpu and the hotplug core takes care of flushing the pending + * irq-works (hint: smpcfd_dying_cpu()) on CPU hotplug. Even if the kthread-work + * fires on another CPU after the concerned CPU is removed, it won't harm. + * + * We just need to make sure to remove them all on policy->exit(). */ -static unsigned int cppc_cpufreq_perf_to_khz(struct cppc_cpudata *cpu_data, - unsigned int perf) +static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) { - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - s64 retval, offset = 0; - static u64 max_khz; - u64 mul, div; - - if (caps->lowest_freq && caps->nominal_freq) { - /* Avoid the special case when nominal_freq is equal to lowest_freq */ - if (caps->nominal_freq == caps->lowest_freq) { - mul = caps->nominal_freq; - div = caps->nominal_perf; - } else { - mul = caps->nominal_freq - caps->lowest_freq; - div = caps->nominal_perf - caps->lowest_perf; - } - offset = caps->nominal_freq - div64_u64(caps->nominal_perf * mul, div); - } else { - if (!max_khz) - max_khz = cppc_get_dmi_max_khz(); - mul = max_khz; - div = caps->highest_perf; - } + struct cppc_freq_invariance *cppc_fi; + int cpu; - retval = offset + div64_u64(perf * mul, div); - if (retval >= 0) - return retval; - return 0; + if (fie_disabled) + return; + + /* policy->cpus will be empty here, use related_cpus instead */ + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, policy->related_cpus); + + for_each_cpu(cpu, policy->related_cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + irq_work_sync(&cppc_fi->irq_work); + kthread_cancel_work_sync(&cppc_fi->work); + } } -static unsigned int cppc_cpufreq_khz_to_perf(struct cppc_cpudata *cpu_data, - unsigned int freq) +static void __init cppc_freq_invariance_init(void) { - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - s64 retval, offset = 0; - static u64 max_khz; - u64 mul, div; - - if (caps->lowest_freq && caps->nominal_freq) { - /* Avoid special case when nominal_freq is equal to lowest_freq */ - if (caps->nominal_freq == caps->lowest_freq) { - mul = caps->nominal_perf; - div = caps->nominal_freq; - } else { - mul = caps->nominal_perf - caps->lowest_perf; - div = caps->nominal_freq - caps->lowest_freq; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. 
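+ * The values below request 1 ms of runtime per 10 ms period/deadline;
+ * sched_attr times are expressed in nanoseconds.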
+ */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; + int ret; + + if (fie_disabled != FIE_ENABLED && fie_disabled != FIE_DISABLED) { + fie_disabled = FIE_ENABLED; + if (cppc_perf_ctrs_in_pcc()) { + pr_info("FIE not enabled on systems with registers in PCC\n"); + fie_disabled = FIE_DISABLED; } - offset = caps->nominal_perf - div64_u64(caps->nominal_freq * mul, div); - } else { - if (!max_khz) - max_khz = cppc_get_dmi_max_khz(); - mul = caps->highest_perf; - div = max_khz; } - retval = offset + div64_u64(freq * mul, div); - if (retval >= 0) - return retval; - return 0; + if (fie_disabled) + return; + + kworker_fie = kthread_create_worker(0, "cppc_fie"); + if (IS_ERR(kworker_fie)) + return; + + ret = sched_setattr_nocheck(kworker_fie->task, &attr); + if (ret) { + pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__, + ret); + kthread_destroy_worker(kworker_fie); + return; + } +} + +static void cppc_freq_invariance_exit(void) +{ + if (fie_disabled) + return; + + kthread_destroy_worker(kworker_fie); + kworker_fie = NULL; } +#else +static inline void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_freq_invariance_init(void) +{ +} + +static inline void cppc_freq_invariance_exit(void) +{ +} +#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */ + static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) + unsigned int target_freq, + unsigned int relation) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; unsigned int cpu = policy->cpu; struct cpufreq_freqs freqs; u32 desired_perf; int ret = 0; - desired_perf = cppc_cpufreq_khz_to_perf(cpu_data, target_freq); + desired_perf = cppc_khz_to_perf(&cpu_data->perf_caps, target_freq); /* Return if it is exactly the same perf */ if (desired_perf == cpu_data->perf_ctrls.desired_perf) return ret; @@ -198,20 +323,6 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) return 0; } -static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) -{ - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; - unsigned int cpu = policy->cpu; - int ret; - - cpu_data->perf_ctrls.desired_perf = cpu_data->perf_caps.lowest_perf; - - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) - pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - cpu_data->perf_caps.lowest_perf, cpu, ret); -} - /* * The PCC subspace describes the rate at which platform can accept commands * on the shared PCC channel (including READs which do not count towards freq @@ -246,92 +357,347 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return delay_us; } - #else - static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; } #endif -static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) +#if defined(CONFIG_ARM64) && defined(CONFIG_ENERGY_MODEL) + +static DEFINE_PER_CPU(unsigned int, efficiency_class); + +/* Create an artificial performance state every CPPC_EM_CAP_STEP capacity unit. */ +#define CPPC_EM_CAP_STEP (20) +/* Increase the cost value by CPPC_EM_COST_STEP every performance state. */ +#define CPPC_EM_COST_STEP (1) +/* Add a cost gap correspnding to the energy of 4 CPUs. 
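+ * Assuming SCHED_CAPACITY_SCALE is 1024, the expression below evaluates
+ * to 4 * 1024 * 1 / 20 = 204 cost units per efficiency class.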
*/ +#define CPPC_EM_COST_GAP (4 * SCHED_CAPACITY_SCALE * CPPC_EM_COST_STEP \ + / CPPC_EM_CAP_STEP) + +static unsigned int get_perf_level_count(struct cpufreq_policy *policy) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; - unsigned int cpu = policy->cpu; - int ret = 0; + struct cppc_perf_caps *perf_caps; + unsigned int min_cap, max_cap; + struct cppc_cpudata *cpu_data; + int cpu = policy->cpu; + + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, perf_caps->highest_perf); + if ((min_cap == 0) || (max_cap < min_cap)) + return 0; + return 1 + max_cap / CPPC_EM_CAP_STEP - min_cap / CPPC_EM_CAP_STEP; +} + +/* + * The cost is defined as: + * cost = power * max_frequency / frequency + */ +static inline unsigned long compute_cost(int cpu, int step) +{ + return CPPC_EM_COST_GAP * per_cpu(efficiency_class, cpu) + + step * CPPC_EM_COST_STEP; +} + +static int cppc_get_cpu_power(struct device *cpu_dev, + unsigned long *power, unsigned long *KHz) +{ + unsigned long perf_step, perf_prev, perf, perf_check; + unsigned int min_step, max_step, step, step_check; + unsigned long prev_freq = *KHz; + unsigned int min_cap, max_cap; + struct cpufreq_policy *policy; + + struct cppc_perf_caps *perf_caps; + struct cppc_cpudata *cpu_data; + + policy = cpufreq_cpu_get_raw(cpu_dev->id); + if (!policy) + return -EINVAL; + + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, + perf_caps->highest_perf); + + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + min_step = min_cap / CPPC_EM_CAP_STEP; + max_step = max_cap / CPPC_EM_CAP_STEP; + + perf_prev = cppc_khz_to_perf(perf_caps, *KHz); + step = perf_prev / perf_step; + + if (step > max_step) + return -EINVAL; + + if (min_step == max_step) { + step = max_step; + perf = perf_caps->highest_perf; + } else if (step < min_step) { + step = min_step; + perf = perf_caps->lowest_perf; + } else { + step++; + if (step == max_step) + perf = perf_caps->highest_perf; + else + perf = step * perf_step; + } + + *KHz = cppc_perf_to_khz(perf_caps, perf); + perf_check = cppc_khz_to_perf(perf_caps, *KHz); + step_check = perf_check / perf_step; + + /* + * To avoid bad integer approximation, check that new frequency value + * increased and that the new frequency will be converted to the + * desired step value. + */ + while ((*KHz == prev_freq) || (step_check != step)) { + perf++; + *KHz = cppc_perf_to_khz(perf_caps, perf); + perf_check = cppc_khz_to_perf(perf_caps, *KHz); + step_check = perf_check / perf_step; + } - cpu_data->cpu = cpu; - ret = cppc_get_perf_caps(policy->cpu, &cpu_data->perf_caps); + /* + * With an artificial EM, only the cost value is used. Still the power + * is populated such as 0 < power < EM_MAX_POWER. This allows to add + * more sense to the artificial performance states. 
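/*
 * Worked example of the state/cost construction above, with made-up
 * numbers: highest_perf = 300, lowest_perf = 60 and
 * arch_scale_cpu_capacity() = 1024 give
 *      min_cap = 1024 * 60 / 300         = 204
 *      states  = 1 + 1024/20 - 204/20    = 1 + 51 - 10 = 42
 * and, for a CPU in (squeezed) efficiency class 1,
 *      CPPC_EM_COST_GAP = 4 * 1024 * 1 / 20 = 204
 *      cost(step)       = 204 + step
 * i.e. the 204 gap is about four times the ~51 cost a class-0 CPU
 * reaches at its highest step, matching the "energy of 4 CPUs" gap
 * described in the comment above.
 */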
+ */ + *power = compute_cost(cpu_dev->id, step); + + return 0; +} + +static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, + unsigned long *cost) +{ + unsigned long perf_step, perf_prev; + struct cppc_perf_caps *perf_caps; + struct cpufreq_policy *policy; + struct cppc_cpudata *cpu_data; + unsigned int max_cap; + int step; + policy = cpufreq_cpu_get_raw(cpu_dev->id); + if (!policy) + return -EINVAL; + + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + + perf_prev = cppc_khz_to_perf(perf_caps, KHz); + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + step = perf_prev / perf_step; + + *cost = compute_cost(cpu_dev->id, step); + + return 0; +} + +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data; + struct em_data_callback em_cb = + EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); + + cpu_data = policy->driver_data; + em_dev_register_perf_domain(get_cpu_device(policy->cpu), + get_perf_level_count(policy), &em_cb, + cpu_data->shared_cpu_map); +} + +static void populate_efficiency_class(void) +{ + struct acpi_madt_generic_interrupt *gicc; + DECLARE_BITMAP(used_classes, 256) = {}; + int class, cpu, index; + + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + class = gicc->efficiency_class; + bitmap_set(used_classes, class, 1); + } + + if (bitmap_weight(used_classes, 256) <= 1) { + pr_debug("Efficiency classes are all equal (=%d). " + "No EM registered", class); + return; + } + + /* + * Squeeze efficiency class values on [0:#efficiency_class-1]. + * Values are per spec in [0:255]. + */ + index = 0; + for_each_set_bit(class, used_classes, 256) { + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + if (gicc->efficiency_class == class) + per_cpu(efficiency_class, cpu) = index; + } + index++; + } + cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; +} + +#else +static void populate_efficiency_class(void) +{ +} +#endif + +static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) +{ + struct cppc_cpudata *cpu_data; + int ret; + + cpu_data = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL); + if (!cpu_data) + goto out; + + if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL)) + goto free_cpu; + + ret = acpi_get_psd_map(cpu, cpu_data); if (ret) { - pr_debug("Err reading CPU%d perf capabilities. 
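/*
 * Example of the squeeze performed by populate_efficiency_class()
 * above, with hypothetical MADT contents: GICC efficiency_class
 * values { 3, 3, 1, 1 } set two bits in used_classes, so
 *      class 1 -> index 0
 *      class 3 -> index 1
 * and per_cpu(efficiency_class) becomes { 1, 1, 0, 0 }.  With a
 * single distinct class the driver skips EM registration entirely,
 * since an artificial EM would then carry no useful information.
 */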
ret:%d\n", - cpu, ret); - return ret; + pr_debug("Err parsing CPU%d PSD data: ret:%d\n", cpu, ret); + goto free_mask; } - /* Convert the lowest and nominal freq from MHz to KHz */ - cpu_data->perf_caps.lowest_freq *= 1000; - cpu_data->perf_caps.nominal_freq *= 1000; + ret = cppc_get_perf_caps(cpu, &cpu_data->perf_caps); + if (ret) { + pr_debug("Err reading CPU%d perf caps: ret:%d\n", cpu, ret); + goto free_mask; + } + + return cpu_data; + +free_mask: + free_cpumask_var(cpu_data->shared_cpu_map); +free_cpu: + kfree(cpu_data); +out: + return NULL; +} + +static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + + free_cpumask_var(cpu_data->shared_cpu_map); + kfree(cpu_data); + policy->driver_data = NULL; +} + +static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct cppc_cpudata *cpu_data; + struct cppc_perf_caps *caps; + int ret; + + cpu_data = cppc_cpufreq_get_cpu_data(cpu); + if (!cpu_data) { + pr_err("Error in acquiring _CPC/_PSD data for CPU%d.\n", cpu); + return -ENODEV; + } + caps = &cpu_data->perf_caps; + policy->driver_data = cpu_data; /* * Set min to lowest nonlinear perf to avoid any efficiency penalty (see * Section 8.4.7.1.1.5 of ACPI 6.1 spec) */ - policy->min = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.lowest_nonlinear_perf); - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.nominal_perf); + policy->min = cppc_perf_to_khz(caps, + caps->lowest_nonlinear_perf); + policy->max = cppc_perf_to_khz(caps, policy->boost_enabled ? + caps->highest_perf : caps->nominal_perf); /* * Set cpuinfo.min_freq to Lowest to make the full range of performance * available if userspace wants to use any perf between lowest & lowest * nonlinear perf */ - policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.lowest_perf); - policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.nominal_perf); + policy->cpuinfo.min_freq = cppc_perf_to_khz(caps, + caps->lowest_perf); + policy->cpuinfo.max_freq = policy->max; policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - int i; - + switch (policy->shared_type) { + case CPUFREQ_SHARED_TYPE_HW: + case CPUFREQ_SHARED_TYPE_NONE: + /* Nothing to be done - we'll have a policy for each CPU */ + break; + case CPUFREQ_SHARED_TYPE_ANY: + /* + * All CPUs in the domain will share a policy and all cpufreq + * operations will use a single cppc_cpudata structure stored + * in policy->driver_data. + */ cpumask_copy(policy->cpus, cpu_data->shared_cpu_map); - - for_each_cpu(i, policy->cpus) { - if (unlikely(i == cpu)) - continue; - - memcpy(&all_cpu_data[i]->perf_caps, &cpu_data->perf_caps, - sizeof(cpu_data->perf_caps)); - } - } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) { - /* Support only SW_ANY for now. */ - pr_debug("Unsupported CPU co-ord type\n"); - return -EFAULT; + break; + default: + pr_debug("Unsupported CPU co-ord type: %d\n", + policy->shared_type); + ret = -EFAULT; + goto out; } - cpu_data->cur_policy = policy; - /* * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost * is supported. */ - if (cpu_data->perf_caps.highest_perf > cpu_data->perf_caps.nominal_perf) + if (caps->highest_perf > caps->nominal_perf) boost_supported = true; /* Set policy->cur to norm now. 
*/ - policy->cur = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.nominal_perf); - cpu_data->perf_ctrls.desired_perf = cpu_data->perf_caps.nominal_perf; + policy->cur = cppc_perf_to_khz(caps, caps->nominal_perf); + cpu_data->perf_ctrls.desired_perf = caps->nominal_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) + if (ret) { pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - cpu_data->perf_caps.nominal_perf, cpu, ret); + caps->nominal_perf, cpu, ret); + goto out; + } + + cppc_cpufreq_cpu_fie_init(policy); + return 0; +out: + cppc_cpufreq_put_cpu_data(policy); return ret; } +static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; + unsigned int cpu = policy->cpu; + int ret; + + cppc_cpufreq_cpu_fie_exit(policy); + + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; + + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); + if (ret) + pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", + caps->lowest_perf, cpu, ret); + + cppc_cpufreq_put_cpu_data(policy); + return 0; +} + static inline u64 get_delta(u64 t1, u64 t0) { if (t1 > t0 || t0 > ~(u32)0) @@ -340,28 +706,25 @@ static inline u64 get_delta(u64 t1, u64 t0) return (u32)t1 - (u32)t0; } -static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs fb_ctrs_t0, - struct cppc_perf_fb_ctrs fb_ctrs_t1) +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; - u64 reference_perf, delivered_perf; + u64 reference_perf; - reference_perf = fb_ctrs_t0.reference_perf; + reference_perf = fb_ctrs_t0->reference_perf; - delta_reference = get_delta(fb_ctrs_t1.reference, - fb_ctrs_t0.reference); - delta_delivered = get_delta(fb_ctrs_t1.delivered, - fb_ctrs_t0.delivered); + delta_reference = get_delta(fb_ctrs_t1->reference, + fb_ctrs_t0->reference); + delta_delivered = get_delta(fb_ctrs_t1->delivered, + fb_ctrs_t0->delivered); - /* Check to avoid divide-by zero */ - if (delta_reference || delta_delivered) - delivered_perf = (reference_perf * delta_delivered) / - delta_reference; - else - delivered_perf = cpu_data->perf_ctrls.desired_perf; + /* Check to avoid divide-by zero and invalid delivered_perf */ + if (!delta_reference || !delta_delivered) + return cpu_data->perf_ctrls.desired_perf; - return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); + return (reference_perf * delta_delivered) / delta_reference; } static int cppc_get_perf_ctrs_sample(void *val) @@ -396,9 +759,18 @@ static int cppc_get_perf_ctrs_sample(void *val) static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) { struct fb_ctr_pair fb_ctrs = { .cpu = cpu, }; - struct cppc_cpudata *cpu_data = all_cpu_data[cpu]; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cppc_cpudata *cpu_data; + u64 delivered_perf; int ret; + if (!policy) + return 0; + + cpu_data = policy->driver_data; + + cpufreq_cpu_put(policy); + if (cpu_has_amu_feat(cpu)) ret = smp_call_on_cpu(cpu, cppc_get_perf_ctrs_sample, &fb_ctrs, false); @@ -408,14 +780,17 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) if (ret) return ret; - return cppc_get_rate_from_fbctrs(cpu_data, - fb_ctrs.fb_ctrs_t0, - fb_ctrs.fb_ctrs_t1); + delivered_perf = cppc_perf_from_fbctrs(cpu_data, + &fb_ctrs.fb_ctrs_t0, + &fb_ctrs.fb_ctrs_t1); + + return cppc_perf_to_khz(&cpu_data->perf_caps, 
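/*
 * Worked example for cppc_perf_from_fbctrs() above, with made-up
 * counter snapshots:
 *      reference_perf  = 100
 *      delta_reference = t1.reference - t0.reference = 1000
 *      delta_delivered = t1.delivered - t0.delivered = 1500
 * Delivered perf over the window is 100 * 1500 / 1000 = 150, which
 * cppc_cpufreq_get_rate() then converts with cppc_perf_to_khz().  If
 * either delta is zero (too short a window, or stopped counters), the
 * last requested desired_perf is reported instead of a bogus value.
 */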
delivered_perf); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; int ret; if (!boost_supported) { @@ -424,11 +799,9 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) } if (state) - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.highest_perf); + policy->max = cppc_perf_to_khz(caps, caps->highest_perf); else - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.nominal_perf); + policy->max = cppc_perf_to_khz(caps, caps->nominal_perf); policy->cpuinfo.max_freq = policy->max; ret = freq_qos_update_request(policy->max_freq_req, policy->max); @@ -537,14 +910,24 @@ static ssize_t store_energy_perf(struct cpufreq_policy *policy, cpufreq_freq_attr_rw(auto_act_window); cpufreq_freq_attr_rw(energy_perf); +#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE + +static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + + return cpufreq_show_cpus(cpu_data->shared_cpu_map, buf); +} +cpufreq_freq_attr_ro(freqdomain_cpus); static struct freq_attr *cppc_cpufreq_attr[] = { + &freqdomain_cpus, +#ifdef CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE &auto_act_window, &energy_perf, +#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE NULL, }; -#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE - static struct cpufreq_driver cppc_cpufreq_driver = { .flags = CPUFREQ_CONST_LOOPS, @@ -552,11 +935,9 @@ static struct cpufreq_driver cppc_cpufreq_driver = { .target = cppc_cpufreq_set_target, .get = cppc_cpufreq_get_rate, .init = cppc_cpufreq_cpu_init, - .stop_cpu = cppc_cpufreq_stop_cpu, + .exit = cppc_cpufreq_cpu_exit, .set_boost = cppc_cpufreq_set_boost, -#ifdef CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE .attr = cppc_cpufreq_attr, -#endif .name = "cppc_cpufreq", }; @@ -568,15 +949,23 @@ static struct cpufreq_driver cppc_cpufreq_driver = { */ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) { - struct cppc_cpudata *cpu_data = all_cpu_data[cpu]; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cppc_cpudata *cpu_data; u64 desired_perf; int ret; + if (!policy) + return 0; + + cpu_data = policy->driver_data; + + cpufreq_cpu_put(policy); + ret = cppc_get_desired_perf(cpu, &desired_perf); if (ret < 0) - return -EIO; + return 0; - return cppc_cpufreq_perf_to_khz(cpu_data, desired_perf); + return cppc_perf_to_khz(&cpu_data->perf_caps, desired_perf); } static void cppc_check_hisi_workaround(void) @@ -595,6 +984,7 @@ static void cppc_check_hisi_workaround(void) wa_info[i].oem_revision == tbl->oem_revision) { /* Overwrite the get() callback */ cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; + fie_disabled = FIE_DISABLED; break; } } @@ -604,68 +994,26 @@ static void cppc_check_hisi_workaround(void) static int __init cppc_cpufreq_init(void) { - int i, ret = 0; - struct cppc_cpudata *cpu_data; + int ret; - if (acpi_disabled) + if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; - all_cpu_data = kcalloc(num_possible_cpus(), sizeof(void *), - GFP_KERNEL); - if (!all_cpu_data) - return -ENOMEM; - - for_each_possible_cpu(i) { - all_cpu_data[i] = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL); - if (!all_cpu_data[i]) - goto out; - - cpu_data = all_cpu_data[i]; - if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL)) - goto out; - } - - ret = 
acpi_get_psd_map(all_cpu_data); - if (ret) { - pr_debug("Error parsing PSD data. Aborting cpufreq registration.\n"); - goto out; - } - cppc_check_hisi_workaround(); + cppc_freq_invariance_init(); + populate_efficiency_class(); ret = cpufreq_register_driver(&cppc_cpufreq_driver); if (ret) - goto out; + cppc_freq_invariance_exit(); return ret; - -out: - for_each_possible_cpu(i) { - cpu_data = all_cpu_data[i]; - if (!cpu_data) - break; - free_cpumask_var(cpu_data->shared_cpu_map); - kfree(cpu_data); - } - - kfree(all_cpu_data); - return -ENODEV; } static void __exit cppc_cpufreq_exit(void) { - struct cppc_cpudata *cpu_data; - int i; - cpufreq_unregister_driver(&cppc_cpufreq_driver); - - for_each_possible_cpu(i) { - cpu_data = all_cpu_data[i]; - free_cpumask_var(cpu_data->shared_cpu_map); - kfree(cpu_data); - } - - kfree(all_cpu_data); + cppc_freq_invariance_exit(); } module_exit(cppc_cpufreq_exit); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 950e00dfd75d8391227e00f535e420e010f313fb..35a0c8e62d464140886378e787c2d3a3445d43dd 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -83,6 +83,7 @@ static void cpufreq_governor_limits(struct cpufreq_policy *policy); static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_governor *new_gov, unsigned int new_pol); +static bool cpufreq_boost_supported(void); /* * Two notifier lists: the "policy" list is involved in the @@ -448,7 +449,7 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy, arch_set_freq_scale(policy->related_cpus, policy->cur, - policy->cpuinfo.max_freq); + arch_scale_freq_ref(policy->cpu)); spin_lock(&policy->transition_lock); policy->transition_ongoing = false; @@ -617,6 +618,42 @@ static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr, } define_one_global_rw(boost); +static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf) +{ + return sysfs_emit(buf, "%d\n", policy->boost_enabled); +} + +static ssize_t store_local_boost(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + int ret, enable; + + ret = kstrtoint(buf, 10, &enable); + if (ret || enable < 0 || enable > 1) + return -EINVAL; + + if (!cpufreq_driver->boost_enabled) + return -EINVAL; + + if (policy->boost_enabled == enable) + return count; + + policy->boost_enabled = enable; + + cpus_read_lock(); + ret = cpufreq_driver->set_boost(policy, enable); + cpus_read_unlock(); + + if (ret) { + policy->boost_enabled = !policy->boost_enabled; + return ret; + } + + return count; +} + +static struct freq_attr local_boost = __ATTR(boost, 0644, show_local_boost, store_local_boost); + static struct cpufreq_governor *find_governor(const char *str_governor) { struct cpufreq_governor *t; @@ -930,6 +967,7 @@ static struct attribute *default_attrs[] = { &cpuinfo_min_freq.attr, &cpuinfo_max_freq.attr, &cpuinfo_transition_latency.attr, + &scaling_cur_freq.attr, &scaling_min_freq.attr, &scaling_max_freq.attr, &affected_cpus.attr, @@ -1048,16 +1086,18 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } - ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); - if (ret) - return ret; - if (cpufreq_driver->bios_limit) { ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); if (ret) return ret; } + if (cpufreq_boost_supported()) { + ret = sysfs_create_file(&policy->kobj, &local_boost.attr); + if (ret) + return ret; + } + return 0; } @@ -1372,6 +1412,10 @@ static int cpufreq_online(unsigned int cpu) goto out_free_policy; } + /* 
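/*
 * Usage note for the per-policy knob added above: it appears next to
 * the existing global one, at
 *      /sys/devices/system/cpu/cpufreq/policy<N>/boost
 * Writes are rejected unless the driver advertises boost support
 * (cpufreq_driver->boost_enabled), and a failing ->set_boost() flips
 * policy->boost_enabled back so sysfs never disagrees with the actual
 * hardware state.
 */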
Let the per-policy boost flag mirror the cpufreq_driver boost during init */ + if (cpufreq_boost_enabled() && policy_has_boost_freq(policy)) + policy->boost_enabled = true; + /* * The initialization has succeeded and the policy is online. * If there is a problem with its frequency table, take it @@ -1432,6 +1476,10 @@ static int cpufreq_online(unsigned int cpu) blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); + } else { + ret = freq_qos_update_request(policy->max_freq_req, policy->max); + if (ret < 0) + goto out_destroy_policy; } if (cpufreq_driver->get && has_target()) { @@ -1491,6 +1539,19 @@ static int cpufreq_online(unsigned int cpu) write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); write_unlock_irqrestore(&cpufreq_driver_lock, flags); + + /* + * Register with the energy model before + * sched_cpufreq_governor_change() is called, which will result + * in rebuilding of the sched domains, which should only be done + * once the energy model is properly initialized for the policy + * first. + * + * Also, this should be called before the policy is registered + * with cooling framework. + */ + if (cpufreq_driver->register_em) + cpufreq_driver->register_em(policy); } ret = cpufreq_init_policy(policy); @@ -1508,9 +1569,23 @@ static int cpufreq_online(unsigned int cpu) if (cpufreq_driver->ready) cpufreq_driver->ready(policy); - if (cpufreq_thermal_control_enabled(cpufreq_driver)) + /* Register cpufreq cooling only for a new policy */ + if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); + /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ + if (cpufreq_driver->set_boost && + policy->boost_enabled != cpufreq_boost_enabled()) { + policy->boost_enabled = cpufreq_boost_enabled(); + ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); + if (ret) { + /* If the set_boost fails, the online operation is not affected */ + pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu, + policy->boost_enabled ? "enable" : "disable"); + policy->boost_enabled = !policy->boost_enabled; + } + } + pr_debug("initialization complete\n"); return 0; @@ -1587,23 +1662,14 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) return; } - if (has_target()) + if (has_target()) { strncpy(policy->last_governor, policy->governor->name, CPUFREQ_NAME_LEN); - else + cpufreq_exit_governor(policy); + } else { policy->last_policy = policy->policy; - - if (cpufreq_thermal_control_enabled(cpufreq_driver)) { - cpufreq_cooling_unregister(policy->cdev); - policy->cdev = NULL; } - if (cpufreq_driver->stop_cpu) - cpufreq_driver->stop_cpu(policy); - - if (has_target()) - cpufreq_exit_governor(policy); - /* * Perform the ->offline() during light-weight tear-down, as * that allows fast recovery when the CPU comes back. @@ -1665,6 +1731,15 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) return; } + /* + * Unregister cpufreq cooling once all the CPUs of the policy are + * removed. 
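/*
 * Driver-side view of the ->register_em() hook invoked above (sketch;
 * the "example" names are hypothetical).  OPP-based drivers can point
 * the hook at the cpufreq_register_em_with_opp() helper that this
 * series adds to include/linux/cpufreq.h:
 */
static struct cpufreq_driver example_cpufreq_driver = {
        .name           = "example",
        .init           = example_init,
        .target_index   = example_target_index,
        .register_em    = cpufreq_register_em_with_opp,
};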
+ */ + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { + cpufreq_cooling_unregister(policy->cdev); + policy->cdev = NULL; + } + /* We did light-weight exit earlier, do full tear down now */ if (cpufreq_driver->offline && cpufreq_driver->exit) cpufreq_driver->exit(policy); @@ -1702,6 +1777,9 @@ static unsigned int cpufreq_verify_current_freq(struct cpufreq_policy *policy, b { unsigned int new_freq; + if (!cpufreq_driver->get) + return 0; + new_freq = cpufreq_driver->get(policy->cpu); if (!new_freq) return 0; @@ -1816,8 +1894,7 @@ unsigned int cpufreq_get(unsigned int cpu) if (policy) { down_read(&policy->rwsem); - if (cpufreq_driver->get) - ret_freq = __cpufreq_get(policy); + ret_freq = __cpufreq_get(policy); up_read(&policy->rwsem); cpufreq_cpu_put(policy); @@ -2111,7 +2188,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, policy->cur = freq; arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); + arch_scale_freq_ref(policy->cpu)); cpufreq_stats_record_transition(policy, freq); if (trace_cpu_frequency_enabled()) { @@ -2339,8 +2416,7 @@ int cpufreq_start_governor(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - if (cpufreq_driver->get) - cpufreq_verify_current_freq(policy, false); + cpufreq_verify_current_freq(policy, false); if (policy->governor->start) { ret = policy->governor->start(policy); @@ -2548,10 +2624,12 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, pr_debug("starting governor %s failed\n", policy->governor->name); if (old_gov) { policy->governor = old_gov; - if (cpufreq_init_governor(policy)) + if (cpufreq_init_governor(policy)) { policy->governor = NULL; - else - cpufreq_start_governor(policy); + } else if (cpufreq_start_governor(policy)) { + cpufreq_exit_governor(policy); + policy->governor = NULL; + } } return ret; @@ -2642,9 +2720,12 @@ int cpufreq_boost_trigger_state(int state) get_online_cpus(); for_each_active_policy(policy) { + policy->boost_enabled = state; ret = cpufreq_driver->set_boost(policy, state); - if (ret) + if (ret) { + policy->boost_enabled = !policy->boost_enabled; goto err_reset_state; + } } put_online_cpus(); @@ -2688,13 +2769,17 @@ static void remove_boost_sysfs_file(void) int cpufreq_enable_boost_support(void) { + unsigned long flags; + if (!cpufreq_driver) return -EINVAL; if (cpufreq_boost_supported()) return 0; + write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver->set_boost = cpufreq_boost_set_sw; + write_unlock_irqrestore(&cpufreq_driver_lock, flags); /* This will get removed on driver unregister */ return create_boost_sysfs_file(); @@ -3007,15 +3092,6 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) cpufreq_driver = driver_data; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). - */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; @@ -3025,6 +3101,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_null_driver; } + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). 
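/*
 * Resulting fallback behaviour of cpufreq_set_policy() above when the
 * new governor fails to start and the old one is reinstated:
 *
 *      old_gov init fails   -> policy->governor = NULL
 *      old_gov start fails  -> cpufreq_exit_governor(), then
 *                              policy->governor = NULL
 *
 * Previously a start failure left an initialized-but-stopped governor
 * attached to the policy.
 */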
+ */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("cpufreq: supports frequency invariance\n"); + } + ret = subsys_interface_register(&cpufreq_interface); if (ret) goto err_boost_unreg; @@ -3053,6 +3138,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) err_if_unreg: subsys_interface_unregister(&cpufreq_interface); err_boost_unreg: + if (!cpufreq_driver->setpolicy) + static_branch_disable_cpuslocked(&cpufreq_freq_invariance); remove_boost_sysfs_file(); err_null_driver: write_lock_irqsave(&cpufreq_driver_lock, flags); diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 55c80319d26843fdffcf6472310dc64335593191..5981e3ef9ce0eea5f59617b78b14fb5197f4ea12 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -145,7 +145,23 @@ unsigned int dbs_update(struct cpufreq_policy *policy) time_elapsed = update_time - j_cdbs->prev_update_time; j_cdbs->prev_update_time = update_time; - idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + /* + * cur_idle_time could be smaller than j_cdbs->prev_cpu_idle if + * it's obtained from get_cpu_idle_time_jiffy() when NOHZ is + * off, where idle_time is calculated by the difference between + * time elapsed in jiffies and "busy time" obtained from CPU + * statistics. If a CPU is 100% busy, the time elapsed and busy + * time should grow with the same amount in two consecutive + * samples, but in practice there could be a tiny difference, + * making the accumulated idle time decrease sometimes. Hence, + * in this case, idle_time should be regarded as 0 in order to + * make the further process correct. + */ + if (cur_idle_time > j_cdbs->prev_cpu_idle) + idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + else + idle_time = 0; + j_cdbs->prev_cpu_idle = cur_idle_time; if (ignore_nice) { @@ -162,7 +178,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) * calls, so the previous load value can be used then. */ load = j_cdbs->prev_load; - } else if (unlikely((int)idle_time > 2 * sampling_rate && + } else if (unlikely(idle_time > 2 * sampling_rate && j_cdbs->prev_load)) { /* * If the CPU had gone completely idle and a task has @@ -189,30 +205,15 @@ unsigned int dbs_update(struct cpufreq_policy *policy) load = j_cdbs->prev_load; j_cdbs->prev_load = 0; } else { - if (time_elapsed >= idle_time) { + if (time_elapsed > idle_time) load = 100 * (time_elapsed - idle_time) / time_elapsed; - } else { - /* - * That can happen if idle_time is returned by - * get_cpu_idle_time_jiffy(). In that case - * idle_time is roughly equal to the difference - * between time_elapsed and "busy time" obtained - * from CPU statistics. Then, the "busy time" - * can end up being greater than time_elapsed - * (for example, if jiffies_64 and the CPU - * statistics are updated by different CPUs), - * so idle_time may in fact be negative. That - * means, though, that the CPU was busy all - * the time (on the rough average) during the - * last sampling interval and 100 can be - * returned as the load. - */ - load = (int)idle_time < 0 ? 
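/*
 * Worked example of the clamp introduced above: with NOHZ off,
 * get_cpu_idle_time_jiffy() derives idle time as elapsed - busy, so
 * two samples of a 100%-busy CPU can yield, e.g.,
 *      prev_cpu_idle = 1000, cur_idle_time = 998
 * The old code let idle_time underflow to a huge unsigned value and
 * relied on (int) casts downstream; clamping to 0 here keeps the
 * later "time_elapsed > idle_time" computation correct (load = 100)
 * without any casts.
 */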
100 : 0; - } + else + load = 0; + j_cdbs->prev_load = load; } - if (unlikely((int)idle_time > 2 * sampling_rate)) { + if (unlikely(idle_time > 2 * sampling_rate)) { unsigned int periods = idle_time / sampling_rate; if (periods < idle_periods) diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index d3f756f7b5a0594c9543d76c902f9f422fd571e1..fe9c087afe01d8d3ee15f2ac82187858d1157f40 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -40,7 +40,7 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, cpufreq_for_each_valid_entry(pos, table) { freq = pos->frequency; - if (!cpufreq_boost_enabled() + if ((!cpufreq_boost_enabled() || !policy->boost_enabled) && (pos->flags & CPUFREQ_BOOST_FREQ)) continue; diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0cec5148b69f18f6b93befcee4a2fb8c74bd4852..36ce05e4c98323c25d88fe8bace3a8a2b920905b 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2490,7 +2490,7 @@ static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy) return 0; } -static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) +static int intel_cpufreq_cpu_offline(struct cpufreq_policy *policy) { struct cpudata *cpu = all_cpu_data[policy->cpu]; @@ -2535,11 +2535,11 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy) return 0; } -static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) +static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) { - pr_debug("CPU %d stopping\n", policy->cpu); - intel_pstate_clear_update_util_hook(policy->cpu); + + return intel_cpufreq_cpu_offline(policy); } static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) @@ -2612,7 +2612,6 @@ static struct cpufreq_driver intel_pstate = { .resume = intel_pstate_resume, .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, - .stop_cpu = intel_pstate_stop_cpu, .offline = intel_pstate_cpu_offline, .online = intel_pstate_cpu_online, .update_limits = intel_pstate_update_limits, @@ -2870,7 +2869,7 @@ static struct cpufreq_driver intel_cpufreq = { .fast_switch = intel_cpufreq_fast_switch, .init = intel_cpufreq_cpu_init, .exit = intel_cpufreq_cpu_exit, - .offline = intel_pstate_cpu_offline, + .offline = intel_cpufreq_cpu_offline, .online = intel_pstate_cpu_online, .suspend = intel_pstate_suspend, .resume = intel_pstate_resume, diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 8977e4de591570a728958f4000acd0f43d51edc6..6fbb46b2f6dacbf6198f0d5caa23ae039fb37a4f 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -876,7 +876,15 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - /* timer is deleted in cpufreq_cpu_stop() */ + struct powernv_smp_call_data freq_data; + struct global_pstate_info *gpstates = policy->driver_data; + + freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min); + freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min); + smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); + if (gpstates) + del_timer_sync(&gpstates->timer); + kfree(policy->driver_data); return 0; @@ -1008,18 +1016,6 @@ static struct notifier_block powernv_cpufreq_opal_nb = { .priority = 0, }; -static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) -{ - struct powernv_smp_call_data freq_data; - struct global_pstate_info *gpstates = 
policy->driver_data; - - freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min); - freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min); - smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); - if (gpstates) - del_timer_sync(&gpstates->timer); -} - static unsigned int powernv_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { @@ -1043,7 +1039,6 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .target_index = powernv_cpufreq_target_index, .fast_switch = powernv_fast_switch, .get = powernv_cpufreq_get, - .stop_cpu = powernv_cpufreq_stop_cpu, .attr = powernv_cpu_freq_attr, }; diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index bb1389f276d7ab25a05764829c0ff16dabcc2954..4b86b074a874ecd77e654c5fc0113d05a6da16a1 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -96,8 +96,8 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) } static int __maybe_unused -scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, - struct device *cpu_dev) +scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power, + unsigned long *KHz) { unsigned long Hz; int ret, domain; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 3d7adc0de12889f77fe3647156a7066139d789c6..6873085e11e00a0ba04de1d38acdd6d33d5934cf 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1240,8 +1240,8 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); * Returns -EINVAL if the power calculation failed because of missing * parameters, 0 otherwise. */ -static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz, - struct device *dev) +static int __maybe_unused _get_power(struct device *dev, unsigned long *mW, + unsigned long *kHz) { struct dev_pm_opp *opp; struct device_node *np; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 95c7d27cd7888fc210f31eddb483a88730c383fc..bb21c00f0d5a568cd76680d363c8d2cb84961be0 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -11,6 +11,7 @@ #define _CPPC_ACPI_H #include +#include #include #include @@ -124,15 +125,14 @@ struct cppc_perf_fb_ctrs { /* Per CPU container for runtime CPPC management. 
*/ struct cppc_cpudata { - int cpu; struct cppc_perf_caps perf_caps; struct cppc_perf_ctrls perf_ctrls; struct cppc_perf_fb_ctrs perf_fb_ctrs; - struct cpufreq_policy *cur_policy; unsigned int shared_type; cpumask_var_t shared_cpu_map; }; +#ifdef CONFIG_ACPI_CPPC_LIB extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); @@ -143,11 +143,56 @@ extern int cppc_set_auto_sel(int cpu, bool enable); extern int cppc_set_epp(int cpu, u64 epp_val); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); -extern int acpi_get_psd_map(struct cppc_cpudata **); +extern bool cppc_perf_ctrs_in_pcc(void); +extern unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf); +extern unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int freq); extern bool acpi_cpc_valid(void); +extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); +#else /* !CONFIG_ACPI_CPPC_LIB */ +static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) +{ + return -ENOTSUPP; +} +static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) +{ + return -ENOTSUPP; +} +static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +{ + return -ENOTSUPP; +} +static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) +{ + return -ENOTSUPP; +} +static inline bool cppc_perf_ctrs_in_pcc(void) +{ + return false; +} +static inline bool acpi_cpc_valid(void) +{ + return false; +} +static inline unsigned int cppc_get_transition_latency(int cpu) +{ + return CPUFREQ_ETERNAL; +} +static inline bool cpc_ffh_supported(void) +{ + return false; +} +static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + return -ENOTSUPP; +} +static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + return -ENOTSUPP; +} +#endif /* !CONFIG_ACPI_CPPC_LIB */ #endif /* _CPPC_ACPI_H*/ diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 64a05d38a45fbd69b28f1cf390b049780c61e990..984a20559a4d70d9327bad4ce346d009043615f0 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,18 +27,38 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); -DECLARE_PER_CPU(unsigned long, freq_scale); +DECLARE_PER_CPU(unsigned long, capacity_freq_ref); + +static inline unsigned long topology_get_freq_ref(int cpu) +{ + return per_cpu(capacity_freq_ref, cpu); +} + +DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) { - return per_cpu(freq_scale, cpu); + return per_cpu(arch_freq_scale, cpu); } void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq); bool topology_scale_freq_invariant(void); -bool arch_freq_counters_available(const struct cpumask *cpus); +enum scale_freq_source { + SCALE_FREQ_SOURCE_CPUFREQ = 0, + SCALE_FREQ_SOURCE_ARCH, + SCALE_FREQ_SOURCE_CPPC, +}; + +struct scale_freq_data { + enum scale_freq_source source; + void 
(*set_freq_scale)(void); +}; + +void topology_scale_freq_tick(void); +void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); +void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); DECLARE_PER_CPU(unsigned long, thermal_pressure); @@ -50,6 +70,9 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) void topology_set_thermal_pressure(const struct cpumask *cpus, unsigned long th_pressure); +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq); + struct cpu_topology { int thread_id; int core_id; @@ -80,6 +103,7 @@ void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); +void freq_inv_set_max_ratio(int cpu, u64 max_rate); #ifdef CONFIG_HOTPLUG_SMT bool topology_is_primary_thread(unsigned int cpu); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index af9e116fa59dce638eb74886fd9e121110420d76..be1a36076941b6a70fddc36dca38573b4e257fb1 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -9,10 +9,12 @@ #define _LINUX_CPUFREQ_H #include +#include #include #include #include #include +#include #include #include #include @@ -134,6 +136,9 @@ struct cpufreq_policy { */ bool dvfs_possible_from_any_cpu; + /* Per policy boost enabled flag. */ + bool boost_enabled; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; unsigned int cached_resolved_idx; @@ -370,7 +375,6 @@ struct cpufreq_driver { int (*online)(struct cpufreq_policy *policy); int (*offline)(struct cpufreq_policy *policy); int (*exit)(struct cpufreq_policy *policy); - void (*stop_cpu)(struct cpufreq_policy *policy); int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); @@ -382,6 +386,12 @@ struct cpufreq_driver { /* platform specific boost support code */ bool boost_enabled; int (*set_boost)(struct cpufreq_policy *policy, int state); + + /* + * Set by drivers that want to register with the energy model after the + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); }; /* flags */ @@ -1069,4 +1079,10 @@ unsigned int cpufreq_generic_get(unsigned int cpu); void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy) +{ + dev_pm_opp_of_register_em(get_cpu_device(policy->cpu), + policy->related_cpus); +} #endif /* _LINUX_CPUFREQ_H */ diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 5f04a2b35e80b7bc5f55707ef93dbb21a767dd00..257dab79caa865db02edb0de7593492a51a0a6c8 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -71,11 +71,11 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device + * @dev : Device for which we do this operation (can be a CPU) * @power : Active power at the performance state in mW * (modified) * @freq : Frequency at the performance state in kHz * (modified) - * @dev : Device for which we do this operation (can be a CPU) * * active_power() must find the lowest performance state of 'dev' above * 'freq' and update 'power' and 'freq' to the matching active power @@ -87,10 +87,31 @@ struct em_data_callback { * * Return 0 on success. 
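/*
 * Sketch of a scale-factor provider using the interface above (names
 * hypothetical; the CPPC FIE code earlier in this series is a real
 * user):
 */
static void example_set_freq_scale(void)
{
        /* Sample this CPU's counters and update its arch_freq_scale. */
}

static struct scale_freq_data example_sfd = {
        .source         = SCALE_FREQ_SOURCE_ARCH,
        .set_freq_scale = example_set_freq_scale,
};

static void example_cpu_up(unsigned int cpu)
{
        topology_set_scale_freq_source(&example_sfd, cpumask_of(cpu));
}

static void example_cpu_down(unsigned int cpu)
{
        /* Tear down on unplug/exit. */
        topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_ARCH,
                                         cpumask_of(cpu));
}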
*/ - int (*active_power)(unsigned long *power, unsigned long *freq, - struct device *dev); + int (*active_power)(struct device *dev, unsigned long *power, + unsigned long *freq); + + /** + * get_cost() - Provide the cost at the given performance state of + * a device + * @dev : Device for which we do this operation (can be a CPU) + * @freq : Frequency at the performance state in kHz + * @cost : The cost value for the performance state + * (modified) + * + * In case of CPUs, the cost is the one of a single CPU in the domain. + * It is expected to fit in the [0, EM_MAX_POWER] range due to internal + * usage in EAS calculation. + * + * Return 0 on success, or appropriate error value in case of failure. + */ + int (*get_cost)(struct device *dev, unsigned long freq, + unsigned long *cost); }; -#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) \ + { .active_power = _active_power_cb, \ + .get_cost = _cost_cb } +#define EM_DATA_CB(_active_power_cb) \ + EM_ADV_DATA_CB(_active_power_cb, NULL) struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); @@ -115,7 +136,7 @@ void em_dev_unregister_perf_domain(struct device *dev); static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util) { - unsigned long freq, scale_cpu; + unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; int i, cpu; @@ -126,8 +147,8 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ps = &pd->table[pd->nr_perf_states - 1]; - freq = map_util_freq(max_util, ps->frequency, scale_cpu); + ref_freq = arch_scale_freq_ref(cpu); + freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the @@ -198,6 +219,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) #else struct em_data_callback {}; +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } #define EM_DATA_CB(_active_power_cb) { } static inline diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 31b9ccf21d5edc6ec32669e440be3f845dac3343..7a8efe57b119471fa88c770507cb983580ea0d8b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -272,6 +272,14 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + #ifndef arch_scale_cpu_capacity /** * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. 
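/*
 * Sketch of registering an EM through the new advanced callback pair
 * (the cppc-cpufreq changes above are an in-tree user; the "example"
 * names here are hypothetical):
 */
static int example_get_power(struct device *dev, unsigned long *power,
                             unsigned long *freq)
{
        /* Round *freq up to a real performance state, report its power. */
        return 0;
}

static int example_get_cost(struct device *dev, unsigned long freq,
                            unsigned long *cost)
{
        /* Abstract cost in [0, EM_MAX_POWER] for the state at freq. */
        return 0;
}

static struct em_data_callback example_em_cb =
        EM_ADV_DATA_CB(example_get_power, example_get_cost);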
@@ -305,6 +313,21 @@ void arch_set_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_update_thermal_pressure +static __always_inline +void arch_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_frequency) +{ } +#endif + +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 334173fe6940ebc61356ebb71e8459b1c9bbdb5c..0dab668f8c9e2cb09cbff09d9d1b0b228367b19f 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -107,7 +107,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5d2937935b47b911b207263a98ff01167b6fcf5e..74423f98ddf972dd64259311e5ef7cba7879db7f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6639,6 +6639,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5e39da0ae0868756819d096717422e67ccda7998..175db52f16e00660b8118e629a4e0dd3834428c4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -134,6 +134,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, } } +/** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) +{ + unsigned int freq = arch_scale_freq_ref(policy->cpu); + + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + return policy->cur; +} + /** * get_next_freq - Compute a new frequency for a given cpufreq policy. * @sg_policy: schedutil policy object to compute the new frequency for. @@ -160,9 +182,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? 
- policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) @@ -905,16 +927,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3a8673a1a3fc5d1747204836cc6b7c36fe08ccb3..62c658d8d022b52428ab4a81ab54c2cbf6900cd6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -226,6 +226,15 @@ unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} + #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -238,13 +247,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret;
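/*
 * Net effect of get_capacity_ref_freq() above, as a worked example:
 * if the architecture provides a reference frequency of 2000000 kHz
 * for the policy's CPU, then util = 512 out of scale_cpu = 1024 maps,
 * through map_util_freq()'s ~25% headroom, to roughly
 *      2000000 * 1.25 * 512 / 1024 = 1250000 kHz
 * regardless of the current policy clamps.  Platforms without a
 * reference frequency fall back to cpuinfo.max_freq when running
 * frequency-invariant, or to policy->cur otherwise -- exactly what the
 * removed inline ternary did.
 */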