From e496f9a0bbc2892076af7c8e848f86ce5a031c80 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Tue, 22 Aug 2023 20:48:37 +0800 Subject: [PATCH 01/78] cpufreq: Support per-policy performance boost mainline inclusion from mainline-v6.6-rc1 commit 218a06a79d9a98a96ef46bb003d4d8adb0962056 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=218a06a79d9a98a96ef46bb003d4d8adb0962056 ---------------------------------------------------------------------- The boost control currently applies to the whole system. However, users may prefer to boost a subset of cores in order to provide prioritized performance to workloads running on the boosted cores. Enable per-policy boost by adding a 'boost' sysfs interface under each policy path. This can be found at: /sys/devices/system/cpu/cpufreq/policy<*>/boost Same to the global boost switch, writing 1/0 to the per-policy 'boost' enables/disables boost on a cpufreq policy respectively. The user view of global and per-policy boost controls should be: 1. Enabling global boost initially enables boost on all policies, and per-policy boost can then be enabled or disabled individually, given that the platform does support so. 2. Disabling global boost makes the per-policy boost interface illegal. Signed-off-by: Jie Zhan Reviewed-by: Wei Xu Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 43 +++++++++++++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 46 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 950e00dfd75d..a32652b395c1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -83,6 +83,7 @@ static void cpufreq_governor_limits(struct cpufreq_policy *policy); static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_governor *new_gov, unsigned int new_pol); +static bool cpufreq_boost_supported(void); /* * Two notifier lists: the "policy" list is involved in the @@ -617,6 +618,40 @@ static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr, } define_one_global_rw(boost); +static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf) +{ + return sysfs_emit(buf, "%d\n", policy->boost_enabled); +} + +static ssize_t store_local_boost(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + int ret, enable; + + ret = kstrtoint(buf, 10, &enable); + if (ret || enable < 0 || enable > 1) + return -EINVAL; + + if (!cpufreq_driver->boost_enabled) + return -EINVAL; + + if (policy->boost_enabled == enable) + return count; + + cpus_read_lock(); + ret = cpufreq_driver->set_boost(policy, enable); + cpus_read_unlock(); + + if (ret) + return ret; + + policy->boost_enabled = enable; + + return count; +} + +static struct freq_attr local_boost = __ATTR(boost, 0644, show_local_boost, store_local_boost); + static struct cpufreq_governor *find_governor(const char *str_governor) { struct cpufreq_governor *t; @@ -1058,6 +1093,12 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } + if (cpufreq_boost_supported()) { + ret = sysfs_create_file(&policy->kobj, &local_boost.attr); + if (ret) + return ret; + } + return 0; } @@ -2645,6 +2686,8 @@ int cpufreq_boost_trigger_state(int state) ret = cpufreq_driver->set_boost(policy, state); if (ret) goto err_reset_state; + + policy->boost_enabled = state; } put_online_cpus(); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index af9e116fa59d..d4a0374617aa 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -134,6 +134,9 @@ struct cpufreq_policy { */ bool dvfs_possible_from_any_cpu; + /* Per policy boost enabled flag. */ + bool boost_enabled; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; unsigned int cached_resolved_idx; -- Gitee From e99fccc6c0160b66ac9d1bf800eeb708d3c155eb Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Tue, 11 Mar 2025 11:24:03 +0800 Subject: [PATCH 02/78] cpufreq: Fix re-boost issue after hotplugging a CPU mainline inclusion from mainline-v6.7-rc5 commit 1608f0230510489d74a2e24e47054233b7e4678a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1608f0230510489d74a2e24e47054233b7e4678a --------------------------------------------------------------------- It turns out that CPUX will stay on the base frequency after performing these operations: 1. boost all CPUs: echo 1 > /sys/devices/system/cpu/cpufreq/boost 2. offline one CPU: echo 0 > /sys/devices/system/cpu/cpuX/online 3. deboost all CPUs: echo 0 > /sys/devices/system/cpu/cpufreq/boost 4. online CPUX: echo 1 > /sys/devices/system/cpu/cpuX/online 5. boost all CPUs again: echo 1 > /sys/devices/system/cpu/cpufreq/boost This is because max_freq_req of the policy is not updated during the online process, and the value of max_freq_req before the last offline is retained. When the CPU is boosted again, freq_qos_update_request() will do nothing because the old value is the same as the new one. This causes the CPU to stay at the base frequency. Updating max_freq_req in cpufreq_online() will solve this problem. Signed-off-by: Lifeng Zheng Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250117101457.1530653-2-zhenglifeng1@huawei.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a32652b395c1..761289e31531 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1473,6 +1473,10 @@ static int cpufreq_online(unsigned int cpu) blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); + } else { + ret = freq_qos_update_request(policy->max_freq_req, policy->max); + if (ret < 0) + goto out_destroy_policy; } if (cpufreq_driver->get && has_target()) { -- Gitee From ecdfe460111fb0ef05e00ad483482cbc65d7a030 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Tue, 11 Mar 2025 11:24:04 +0800 Subject: [PATCH 03/78] cpufreq: Introduce a more generic way to set default per-policy boost flag mainline inclusion from mainline-v6.7-rc5 commit dd016f379ebc2d43a9405742d1a6066577509bd7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dd016f379ebc2d43a9405742d1a6066577509bd7 --------------------------------------------------------------------- In cpufreq_online() of cpufreq.c, the per-policy boost flag is already set to mirror the cpufreq_driver boost during init but using freq_table to judge if the policy has boost frequency. There are two drawbacks to this approach: 1. It doesn't work for the cpufreq drivers that do not use a frequency table. For now, acpi-cpufreq and amd-pstate have to enable boost in policy initialization. And cppc_cpufreq never set policy to boost when going online no matter what the cpufreq_driver boost flag is. 2. If the CPU goes offline when cpufreq_driver boost is enabled and then goes online when cpufreq_driver boost is disabled, the per-policy boost flag will incorrectly remain true. Running set_boost at the end of the online process is a more generic way for all cpufreq drivers. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250117101457.1530653-3-zhenglifeng1@huawei.com Acked-by: Viresh Kumar [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 761289e31531..d06eed6cb05f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1556,6 +1556,18 @@ static int cpufreq_online(unsigned int cpu) if (cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); + /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ + if (policy->boost_enabled != cpufreq_boost_enabled()) { + policy->boost_enabled = cpufreq_boost_enabled(); + ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); + if (ret) { + /* If the set_boost fails, the online operation is not affected */ + pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu, + policy->boost_enabled ? "enable" : "disable"); + policy->boost_enabled = !policy->boost_enabled; + } + } + pr_debug("initialization complete\n"); return 0; -- Gitee From 82394386dd92e57e66993434681b7db596b75726 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Thu, 5 Nov 2020 12:55:19 +0000 Subject: [PATCH 04/78] cppc_cpufreq: simplify use of performance capabilities mainline inclusion from mainline-v5.11-rc1 commit bb025fb6c276ac874b718b9d884b7ee1099b2c22 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=bb025fb6c276ac874b718b9d884b7ee1099b2c22 ---------------------------------------------------------------------- The CPPC performance capabilities are used significantly throughout the driver. Simplify the use of them by introducing a local pointer "caps" to point to cpu_data->perf_caps, in functions that access performance capabilities often. Signed-off-by: Ionela Voinescu Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 40 +++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 01e7cbd910b9..15285b65fd3a 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -201,15 +201,16 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) { struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; int ret; - cpu_data->perf_ctrls.desired_perf = cpu_data->perf_caps.lowest_perf; + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); if (ret) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - cpu_data->perf_caps.lowest_perf, cpu, ret); + caps->lowest_perf, cpu, ret); } /* @@ -258,11 +259,12 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; int ret = 0; cpu_data->cpu = cpu; - ret = cppc_get_perf_caps(policy->cpu, &cpu_data->perf_caps); + ret = cppc_get_perf_caps(cpu, caps); if (ret) { pr_debug("Err reading CPU%d perf capabilities. ret:%d\n", @@ -271,23 +273,27 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) } /* Convert the lowest and nominal freq from MHz to KHz */ - cpu_data->perf_caps.lowest_freq *= 1000; - cpu_data->perf_caps.nominal_freq *= 1000; + caps->lowest_freq *= 1000; + caps->nominal_freq *= 1000; /* * Set min to lowest nonlinear perf to avoid any efficiency penalty (see * Section 8.4.7.1.1.5 of ACPI 6.1 spec) */ - policy->min = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.lowest_nonlinear_perf); - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.nominal_perf); + policy->min = cppc_cpufreq_perf_to_khz(cpu_data, + caps->lowest_nonlinear_perf); + policy->max = cppc_cpufreq_perf_to_khz(cpu_data, + caps->nominal_perf); /* * Set cpuinfo.min_freq to Lowest to make the full range of performance * available if userspace wants to use any perf between lowest & lowest * nonlinear perf */ - policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.lowest_perf); - policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.nominal_perf); + policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, + caps->lowest_perf); + policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu_data, + caps->nominal_perf); policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; @@ -301,7 +307,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) if (unlikely(i == cpu)) continue; - memcpy(&all_cpu_data[i]->perf_caps, &cpu_data->perf_caps, + memcpy(&all_cpu_data[i]->perf_caps, caps, sizeof(cpu_data->perf_caps)); } } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) { @@ -316,18 +322,17 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost * is supported. */ - if (cpu_data->perf_caps.highest_perf > cpu_data->perf_caps.nominal_perf) + if (caps->highest_perf > caps->nominal_perf) boost_supported = true; /* Set policy->cur to norm now. */ - policy->cur = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.nominal_perf); - cpu_data->perf_ctrls.desired_perf = cpu_data->perf_caps.nominal_perf; + policy->cur = cppc_cpufreq_perf_to_khz(cpu_data, caps->nominal_perf); + cpu_data->perf_ctrls.desired_perf = caps->nominal_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); if (ret) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - cpu_data->perf_caps.nominal_perf, cpu, ret); + caps->nominal_perf, cpu, ret); return ret; } @@ -416,6 +421,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) { struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; int ret; if (!boost_supported) { @@ -425,10 +431,10 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) if (state) policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.highest_perf); + caps->highest_perf); else policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.nominal_perf); + caps->nominal_perf); policy->cpuinfo.max_freq = policy->max; ret = freq_qos_update_request(policy->max_freq_req, policy->max); -- Gitee From 88f2baa6d34c0a6692c1ae3ca4e0f78297ec87f9 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Tue, 11 Mar 2025 11:24:05 +0800 Subject: [PATCH 05/78] cpufreq: CPPC: Fix wrong max_freq in policy initialization mainline inclusion from mainline-v6.7-rc5 commit 03d8b4e76266e11662c5e544854b737843173e2d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=03d8b4e76266e11662c5e544854b737843173e2d --------------------------------------------------------------------- In policy initialization, policy->max and policy->cpuinfo.max_freq are always set to the value calculated from caps->nominal_perf. This will cause the frequency stay on base frequency even if the policy is already boosted when a CPU is going online. Fix this by using policy->boost_enabled to determine which value should be set. Signed-off-by: Lifeng Zheng Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250117101457.1530653-4-zhenglifeng1@huawei.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 15285b65fd3a..83c6139ec163 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -282,8 +282,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) */ policy->min = cppc_cpufreq_perf_to_khz(cpu_data, caps->lowest_nonlinear_perf); - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - caps->nominal_perf); + policy->max = cppc_cpufreq_perf_to_khz(cpu_data, policy->boost_enabled ? + caps->highest_perf : caps->nominal_perf); /* * Set cpuinfo.min_freq to Lowest to make the full range of performance @@ -292,8 +292,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) */ policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, caps->lowest_perf); - policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu_data, - caps->nominal_perf); + policy->cpuinfo.max_freq = policy->max; policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; -- Gitee From 5597d11fdf3d033194baefd2a9c661826825ca88 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Tue, 11 Mar 2025 11:24:07 +0800 Subject: [PATCH 06/78] cpufreq: prevent NULL dereference in cpufreq_online() mainline inclusion from mainline-v6.7-rc5 commit 0813fd2e14ca6ecd4e6ba005a9766f08e26020d7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0813fd2e14ca6ecd4e6ba005a9766f08e26020d7 --------------------------------------------------------------------- Ensure cpufreq_driver->set_boost is non-NULL before using it in cpufreq_online() to prevent a potential NULL pointer dereference. Reported-by: Gautam Menghani Closes: https://lore.kernel.org/all/c9e56c5f54cc33338762c94e9bed7b5a0d5de812.camel@linux.ibm.com/ Fixes: da59223d340c ("cpufreq: Introduce a more generic way to set default per-policy boost flag") Suggested-by: Viresh Kumar Signed-off-by: Aboorva Devarajan Link: https://patch.msgid.link/20250205181347.2079272-1-aboorvad@linux.ibm.com [ rjw: Minor edits in the subject and changelog ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d06eed6cb05f..20cd91468d92 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1557,7 +1557,8 @@ static int cpufreq_online(unsigned int cpu) policy->cdev = of_cpufreq_cooling_register(policy); /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - if (policy->boost_enabled != cpufreq_boost_enabled()) { + if (cpufreq_driver->set_boost && + policy->boost_enabled != cpufreq_boost_enabled()) { policy->boost_enabled = cpufreq_boost_enabled(); ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); if (ret) { -- Gitee From cfaa53a19ccd291ab13de18b53ad1caf93573151 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Mon, 21 Apr 2025 20:34:35 +0800 Subject: [PATCH 07/78] cpufreq: governor: Fix negative 'idle_time' handling in dbs_update() mainline inclusion from mainline-v6.14-rc3 commit 3698dd6b139dc37b35a9ad83d9330c1f99666c02 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3698dd6b139dc37b35a9ad83d9330c1f99666c02 --------------------------------------------------------------------- We observed an issue that the CPU frequency can't raise up with a 100% CPU load when NOHZ is off and the 'conservative' governor is selected. 'idle_time' can be negative if it's obtained from get_cpu_idle_time_jiffy() when NOHZ is off. This was found and explained in commit 9485e4ca0b48 ("cpufreq: governor: Fix handling of special cases in dbs_update()"). However, commit 7592019634f8 ("cpufreq: governors: Fix long idle detection logic in load calculation") introduced a comparison between 'idle_time' and 'samling_rate' to detect a long idle interval. While 'idle_time' is converted to int before comparison, it's actually promoted to unsigned again when compared with an unsigned 'sampling_rate'. Hence, this leads to wrong idle interval detection when it's in fact 100% busy and sets policy_dbs->idle_periods to a very large value. 'conservative' adjusts the frequency to minimum because of the large 'idle_periods', such that the frequency can't raise up. 'Ondemand' doesn't use policy_dbs->idle_periods so it fortunately avoids the issue. Correct negative 'idle_time' to 0 before any use of it in dbs_update(). Fixes: 7592019634f8 ("cpufreq: governors: Fix long idle detection logic in load calculation") Signed-off-by: Jie Zhan Reviewed-by: Chen Yu Link: https://patch.msgid.link/20250213035510.2402076-1-zhanjie9@hisilicon.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq_governor.c | 45 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 55c80319d268..5981e3ef9ce0 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -145,7 +145,23 @@ unsigned int dbs_update(struct cpufreq_policy *policy) time_elapsed = update_time - j_cdbs->prev_update_time; j_cdbs->prev_update_time = update_time; - idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + /* + * cur_idle_time could be smaller than j_cdbs->prev_cpu_idle if + * it's obtained from get_cpu_idle_time_jiffy() when NOHZ is + * off, where idle_time is calculated by the difference between + * time elapsed in jiffies and "busy time" obtained from CPU + * statistics. If a CPU is 100% busy, the time elapsed and busy + * time should grow with the same amount in two consecutive + * samples, but in practice there could be a tiny difference, + * making the accumulated idle time decrease sometimes. Hence, + * in this case, idle_time should be regarded as 0 in order to + * make the further process correct. + */ + if (cur_idle_time > j_cdbs->prev_cpu_idle) + idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + else + idle_time = 0; + j_cdbs->prev_cpu_idle = cur_idle_time; if (ignore_nice) { @@ -162,7 +178,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) * calls, so the previous load value can be used then. */ load = j_cdbs->prev_load; - } else if (unlikely((int)idle_time > 2 * sampling_rate && + } else if (unlikely(idle_time > 2 * sampling_rate && j_cdbs->prev_load)) { /* * If the CPU had gone completely idle and a task has @@ -189,30 +205,15 @@ unsigned int dbs_update(struct cpufreq_policy *policy) load = j_cdbs->prev_load; j_cdbs->prev_load = 0; } else { - if (time_elapsed >= idle_time) { + if (time_elapsed > idle_time) load = 100 * (time_elapsed - idle_time) / time_elapsed; - } else { - /* - * That can happen if idle_time is returned by - * get_cpu_idle_time_jiffy(). In that case - * idle_time is roughly equal to the difference - * between time_elapsed and "busy time" obtained - * from CPU statistics. Then, the "busy time" - * can end up being greater than time_elapsed - * (for example, if jiffies_64 and the CPU - * statistics are updated by different CPUs), - * so idle_time may in fact be negative. That - * means, though, that the CPU was busy all - * the time (on the rough average) during the - * last sampling interval and 100 can be - * returned as the load. - */ - load = (int)idle_time < 0 ? 100 : 0; - } + else + load = 0; + j_cdbs->prev_load = load; } - if (unlikely((int)idle_time > 2 * sampling_rate)) { + if (unlikely(idle_time > 2 * sampling_rate)) { unsigned int periods = idle_time / sampling_rate; if (periods < idle_periods) -- Gitee From 198cd80a78fc374cce5c7f1c75201cb237622723 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:51 +0530 Subject: [PATCH 08/78] arm64: topology: Avoid the have_policy check mainline inclusion from mainline-v5.12-rc1 commit 384e5699e101b1ee184590a39ee60f10ec0d36b6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=384e5699e101b1ee184590a39ee60f10ec0d36b6 ---------------------------------------------------------------------- Every time I have stumbled upon this routine, I get confused with the way 'have_policy' is used and I have to dig in to understand why is it so. Here is an attempt to make it easier to understand, and hopefully it is an improvement. The 'have_policy' check was just an optimization to avoid writing to amu_fie_cpus in case we don't have to, but that optimization itself is creating more confusion than the real work. Lets just do that if all the CPUs support AMUs. It is much cleaner that way. Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/c125766c4be93461772015ac7c9a6ae45d5756f6.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 4c7f3687356d..203c93c69f46 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -235,14 +235,14 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline bool +static inline void enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); if (!policy) { pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return false; + return; } if (cpumask_subset(policy->related_cpus, valid_cpus)) @@ -250,8 +250,6 @@ enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) amu_fie_cpus); cpufreq_cpu_put(policy); - - return true; } static DEFINE_STATIC_KEY_FALSE(amu_fie_key); @@ -260,7 +258,6 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key); static int __init init_amu_fie(void) { cpumask_var_t valid_cpus; - bool have_policy = false; int ret = 0; int cpu; @@ -280,17 +277,12 @@ static int __init init_amu_fie(void) continue; cpumask_set_cpu(cpu, valid_cpus); - have_policy |= enable_policy_freq_counters(cpu, valid_cpus); + enable_policy_freq_counters(cpu, valid_cpus); } - /* - * If we are not restricted by cpufreq policies, we only enable - * the use of the AMU feature for FIE if all CPUs support AMU. - * Otherwise, enable_policy_freq_counters has already enabled - * policy cpus. - */ - if (!have_policy && cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_or(amu_fie_cpus, amu_fie_cpus, valid_cpus); + /* Overwrite amu_fie_cpus if all CPUs support AMU */ + if (cpumask_equal(valid_cpus, cpu_present_mask)) + cpumask_copy(amu_fie_cpus, cpu_present_mask); if (!cpumask_empty(amu_fie_cpus)) { pr_info("CPUs[%*pbl]: counters will be used for FIE.", -- Gitee From c7e8da0e90e982125983a82c13895e9067dbb086 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:52 +0530 Subject: [PATCH 09/78] arm64: topology: Reorder init_amu_fie() a bit mainline inclusion from mainline-v5.12-rc1 commit 47b10b737c0794c2fa584d7c8103b485e274ed51 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=47b10b737c0794c2fa584d7c8103b485e274ed51 ---------------------------------------------------------------------- This patch does a couple of optimizations in init_amu_fie(), like early exits from paths where we don't need to continue any further, avoid the enable/disable dance, moving the calls to topology_scale_freq_invariant() just when we need them, instead of at the top of the routine, and avoiding calling it for the third time. Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/a732e71ab9ec28c354eb28dd898c9b47d490863f.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 203c93c69f46..497063f0ba73 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -258,6 +258,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key); static int __init init_amu_fie(void) { cpumask_var_t valid_cpus; + bool invariant; int ret = 0; int cpu; @@ -284,18 +285,19 @@ static int __init init_amu_fie(void) if (cpumask_equal(valid_cpus, cpu_present_mask)) cpumask_copy(amu_fie_cpus, cpu_present_mask); - if (!cpumask_empty(amu_fie_cpus)) { - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); - static_branch_enable(&amu_fie_key); - } + if (cpumask_empty(amu_fie_cpus)) + goto free_valid_mask; - /* - * If the system is not fully invariant after AMU init, disable - * partial use of counters for frequency invariance. - */ - if (!topology_scale_freq_invariant()) - static_branch_disable(&amu_fie_key); + invariant = topology_scale_freq_invariant(); + + /* We aren't fully invariant yet */ + if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) + goto free_valid_mask; + + static_branch_enable(&amu_fie_key); + + pr_info("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(amu_fie_cpus)); free_valid_mask: free_cpumask_var(valid_cpus); -- Gitee From be72994c4b732f8d05e51fac07ffaeb6945a826c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:53 +0530 Subject: [PATCH 10/78] arm64: topology: Make AMUs work with modular cpufreq drivers mainline inclusion from mainline-v5.12-rc1 commit a5f1b187cd24f7da35eb7a48fc50839c554d52a2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=a5f1b187cd24f7da35eb7a48fc50839c554d52a2 ---------------------------------------------------------------------- The AMU counters won't get used today if the cpufreq driver is built as a module as the amu core requires everything to be ready by late init. Fix that properly by registering for cpufreq policy notifier. Note that the amu core don't have any cpufreq dependency after the first time CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for the same reason we check if the CPUs are already parsed in the beginning of amu_fie_setup() and skip if that is true. Alternatively we can shoot a work from there to unregister the notifier instead, but that seemed too much instead of this simple check. While at it, convert the print message to pr_debug instead of pr_info. Signed-off-by: Viresh Kumar Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/89c1921334443e133c9c8791b4693607d65ed9f5.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 92 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 497063f0ba73..fc1293f4fcd1 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -235,76 +235,80 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline void -enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) -{ - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) { - pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return; - } - - if (cpumask_subset(policy->related_cpus, valid_cpus)) - cpumask_or(amu_fie_cpus, policy->related_cpus, - amu_fie_cpus); - - cpufreq_cpu_put(policy); -} - static DEFINE_STATIC_KEY_FALSE(amu_fie_key); #define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) -static int __init init_amu_fie(void) +static void amu_fie_setup(const struct cpumask *cpus) { - cpumask_var_t valid_cpus; bool invariant; - int ret = 0; int cpu; - if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { - ret = -ENOMEM; - goto free_valid_mask; - } + /* We are already set since the last insmod of cpufreq driver */ + if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + return; - for_each_present_cpu(cpu) { + for_each_cpu(cpu, cpus) { if (!freq_counters_valid(cpu) || freq_inv_set_max_ratio(cpu, cpufreq_get_hw_max_freq(cpu) * 1000ULL, arch_timer_get_rate())) - continue; - - cpumask_set_cpu(cpu, valid_cpus); - enable_policy_freq_counters(cpu, valid_cpus); + return; } - /* Overwrite amu_fie_cpus if all CPUs support AMU */ - if (cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_copy(amu_fie_cpus, cpu_present_mask); - - if (cpumask_empty(amu_fie_cpus)) - goto free_valid_mask; + cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); invariant = topology_scale_freq_invariant(); /* We aren't fully invariant yet */ if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - goto free_valid_mask; + return; static_branch_enable(&amu_fie_key); - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); + pr_debug("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(cpus)); +} -free_valid_mask: - free_cpumask_var(valid_cpus); +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + + if (val == CPUFREQ_CREATE_POLICY) + amu_fie_setup(policy->related_cpus); + + /* + * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU + * counters don't have any dependency on cpufreq driver once we have + * initialized AMU support and enabled invariance. The AMU counters will + * keep on working just fine in the absence of the cpufreq driver, and + * for the CPUs for which there are no counters available, the last set + * value of freq_scale will remain valid as that is the frequency those + * CPUs are running at. + */ + + return 0; +} + +static struct notifier_block init_amu_fie_notifier = { + .notifier_call = init_amu_fie_callback, +}; + +static int __init init_amu_fie(void) +{ + int ret; + + if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) + return -ENOMEM; + + ret = cpufreq_register_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); + if (ret) + free_cpumask_var(amu_fie_cpus); return ret; } -late_initcall_sync(init_amu_fie); +core_initcall(init_amu_fie); bool arch_freq_counters_available(const struct cpumask *cpus) { -- Gitee From ca27bd65304bccda470f0e2b4ac4dd0a8126298c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 10 Dec 2020 11:17:40 +0530 Subject: [PATCH 11/78] arm64: topology: Drop the useless update to per-cpu cycles mainline inclusion from mainline-v5.11-rc1 commit 51550a483606e35c379f78d28a7827f50e8fc09c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=51550a483606e35c379f78d28a7827f50e8fc09c ---------------------------------------------------------------------- The previous call to update_freq_counters_refs() has already updated the per-cpu variables, don't overwrite them with the same value again. Fixes: 4b9cf23c179a ("arm64: wrap and generalise counter read functions") Signed-off-by: Viresh Kumar Reviewed-by: Ionela Voinescu Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/7a171f710cdc0f808a2bfbd7db839c0d265527e7.1607579234.git.viresh.kumar@linaro.org Signed-off-by: Catalin Marinas Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fc1293f4fcd1..6aa3e66291aa 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -338,7 +338,7 @@ void topology_scale_freq_tick(void) if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) - goto store_and_exit; + return; /* * /\core arch_max_freq_scale @@ -355,10 +355,6 @@ void topology_scale_freq_tick(void) scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); this_cpu_write(freq_scale, (unsigned long)scale); - -store_and_exit: - this_cpu_write(arch_core_cycles_prev, core_cnt); - this_cpu_write(arch_const_cycles_prev, const_cnt); } #ifdef CONFIG_SCHED_SOFT_QUOTA -- Gitee From f72d197e6a65689c27e6294b7e39501ab8cc855a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Mar 2021 08:16:40 +0530 Subject: [PATCH 12/78] arch_topology: Rename freq_scale as arch_freq_scale mainline inclusion from mainline-v5.13-rc1 commit eec73529a9321616ed13cf732cd21a17eb1a2836 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=eec73529a9321616ed13cf732cd21a17eb1a2836 ---------------------------------------------------------------------- Rename freq_scale to a less generic name, as it will get exported soon for modules. Since x86 already names its own implementation of this as arch_freq_scale, lets stick to that. Suggested-by: Will Deacon Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 6 +++--- drivers/base/arch_topology.c | 4 ++-- include/linux/arch_topology.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 6aa3e66291aa..d22592bc2d8a 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -283,8 +283,8 @@ static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, * initialized AMU support and enabled invariance. The AMU counters will * keep on working just fine in the absence of the cpufreq driver, and * for the CPUs for which there are no counters available, the last set - * value of freq_scale will remain valid as that is the frequency those - * CPUs are running at. + * value of arch_freq_scale will remain valid as that is the frequency + * those CPUs are running at. */ return 0; @@ -354,7 +354,7 @@ void topology_scale_freq_tick(void) const_cnt - prev_const_cnt); scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); - this_cpu_write(freq_scale, (unsigned long)scale); + this_cpu_write(arch_freq_scale, (unsigned long)scale); } #ifdef CONFIG_SCHED_SOFT_QUOTA diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index ba1c8e1ff0b6..3e3ca9814801 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -31,7 +31,7 @@ __weak bool arch_freq_counters_available(const struct cpumask *cpus) { return false; } -DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) @@ -53,7 +53,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; for_each_cpu(i, cpus) - per_cpu(freq_scale, i) = scale; + per_cpu(arch_freq_scale, i) = scale; } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 64a05d38a45f..c81f97a4a9a4 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,11 +27,11 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); -DECLARE_PER_CPU(unsigned long, freq_scale); +DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) { - return per_cpu(freq_scale, cpu); + return per_cpu(arch_freq_scale, cpu); } void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, -- Gitee From 400bd16125c5c6ea209062894df20f467c95068e Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Oct 2020 18:07:11 +0000 Subject: [PATCH 13/78] sched/topology,schedutil: Wrap sched domains rebuild mainline inclusion from mainline-v5.11-rc1 commit 31f6a8c0a471be7d7d05c93eac50fcb729e79b9d category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=31f6a8c0a471be7d7d05c93eac50fcb729e79b9d ---------------------------------------------------------------------- Add the rebuild_sched_domains_energy() function to wrap the functionality that rebuilds the scheduling domains if any of the Energy Aware Scheduling (EAS) initialisation conditions change. This functionality is used when schedutil is added or removed or when EAS is enabled or disabled through the sched_energy_aware sysctl. Therefore, create a single function that is used in both these cases and that can be later reused. Signed-off-by: Ionela Voinescu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Quentin Perret Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20201027180713.7642-2-ionela.voinescu@arm.com Signed-off-by: huwentao --- include/linux/sched/topology.h | 8 ++++++++ kernel/sched/cpufreq_schedutil.c | 9 +-------- kernel/sched/topology.c | 18 +++++++++++------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 31b9ccf21d5e..431ff2c6bf67 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -272,6 +272,14 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + #ifndef arch_scale_cpu_capacity /** * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5e39da0ae086..405577e99cb2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -905,16 +905,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3a8673a1a3fc..62c658d8d022 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -226,6 +226,15 @@ unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} + #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -238,13 +247,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret; -- Gitee From 4544b1790b89cf72f5e55c1a8c0486f2ca5fc291 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Mar 2021 08:21:04 +0530 Subject: [PATCH 14/78] arch_topology: Allow multiple entities to provide sched_freq_tick() callback mainline inclusion from mainline-v5.13-rc1 commit 01e055c120a46e78650b5f903088badbbdaae9ad category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=01e055c120a46e78650b5f903088badbbdaae9ad ---------------------------------------------------------------------- This patch attempts to make it generic enough so other parts of the kernel can also provide their own implementation of scale_freq_tick() callback, which is called by the scheduler periodically to update the per-cpu arch_freq_scale variable. The implementations now need to provide 'struct scale_freq_data' for the CPUs for which they have hardware counters available, and a callback gets registered for each possible CPU in a per-cpu variable. The arch specific (or ARM AMU) counters are updated to adapt to this and they take the highest priority if they are available, i.e. they will be used instead of CPPC based counters for example. The special code to rebuild the sched domains, in case invariance status change for the system, is moved out of arm64 specific code and is added to arch_topology.c. Note that this also defines SCALE_FREQ_SOURCE_CPUFREQ but doesn't use it and it is added to show that cpufreq is also acts as source of information for FIE and will be used by default if no other counters are supported for a platform. Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Acked-by: Will Deacon # for arm64 Tested-by: Vincent Guittot Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- arch/arm64/include/asm/topology.h | 10 +--- arch/arm64/kernel/topology.c | 96 +++++++++++++------------------ drivers/base/arch_topology.c | 82 ++++++++++++++++++++++++-- include/linux/arch_topology.h | 14 ++++- 4 files changed, 131 insertions(+), 71 deletions(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 479a1ee0e161..5ebe51ef6d6c 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -17,17 +17,9 @@ int pcibus_to_node(struct pci_bus *bus); #include void update_freq_counters_refs(void); -void topology_scale_freq_tick(void); - -#ifdef CONFIG_ARM64_AMU_EXTN -/* - * Replace task scheduler's default counter-based - * frequency-invariance scale factor setting. - */ -#define arch_scale_freq_tick topology_scale_freq_tick -#endif /* CONFIG_ARM64_AMU_EXTN */ /* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index d22592bc2d8a..a320b03fcced 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -235,12 +235,47 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static DEFINE_STATIC_KEY_FALSE(amu_fie_key); -#define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) +static void amu_scale_freq_tick(void) +{ + u64 prev_core_cnt, prev_const_cnt; + u64 core_cnt, const_cnt, scale; + + prev_const_cnt = this_cpu_read(arch_const_cycles_prev); + prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + + update_freq_counters_refs(); + + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); + + if (unlikely(core_cnt <= prev_core_cnt || + const_cnt <= prev_const_cnt)) + return; + + /* + * /\core arch_max_freq_scale + * scale = ------- * -------------------- + * /\const SCHED_CAPACITY_SCALE + * + * See validate_cpu_freq_invariance_counters() for details on + * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. + */ + scale = core_cnt - prev_core_cnt; + scale *= this_cpu_read(arch_max_freq_scale); + scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, + const_cnt - prev_const_cnt); + + scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); + this_cpu_write(arch_freq_scale, (unsigned long)scale); +} + +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; static void amu_fie_setup(const struct cpumask *cpus) { - bool invariant; int cpu; /* We are already set since the last insmod of cpufreq driver */ @@ -257,13 +292,7 @@ static void amu_fie_setup(const struct cpumask *cpus) cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); - invariant = topology_scale_freq_invariant(); - - /* We aren't fully invariant yet */ - if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - return; - - static_branch_enable(&amu_fie_key); + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); pr_debug("CPUs[%*pbl]: counters will be used for FIE.", cpumask_pr_args(cpus)); @@ -310,53 +339,6 @@ static int __init init_amu_fie(void) } core_initcall(init_amu_fie); -bool arch_freq_counters_available(const struct cpumask *cpus) -{ - return amu_freq_invariant() && - cpumask_subset(cpus, amu_fie_cpus); -} - -void topology_scale_freq_tick(void) -{ - u64 prev_core_cnt, prev_const_cnt; - u64 core_cnt, const_cnt, scale; - int cpu = smp_processor_id(); - - if (!amu_freq_invariant()) - return; - - if (!cpumask_test_cpu(cpu, amu_fie_cpus)) - return; - - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); - - update_freq_counters_refs(); - - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); - - if (unlikely(core_cnt <= prev_core_cnt || - const_cnt <= prev_const_cnt)) - return; - - /* - * /\core arch_max_freq_scale - * scale = ------- * -------------------- - * /\const SCHED_CAPACITY_SCALE - * - * See validate_cpu_freq_invariance_counters() for details on - * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. - */ - scale = core_cnt - prev_core_cnt; - scale *= this_cpu_read(arch_max_freq_scale); - scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, - const_cnt - prev_const_cnt); - - scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); - this_cpu_write(arch_freq_scale, (unsigned long)scale); -} - #ifdef CONFIG_SCHED_SOFT_QUOTA static DEFINE_PER_CPU(int, sibling_idle) = 1; diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3e3ca9814801..be39683e5b77 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -21,16 +21,90 @@ #include #include +static DEFINE_PER_CPU(struct scale_freq_data *, sft_data); +static struct cpumask scale_freq_counters_mask; +static bool scale_freq_invariant; + +static bool supports_scale_freq_counters(const struct cpumask *cpus) +{ + return cpumask_subset(cpus, &scale_freq_counters_mask); +} + bool topology_scale_freq_invariant(void) { return cpufreq_supports_freq_invariance() || - arch_freq_counters_available(cpu_online_mask); + supports_scale_freq_counters(cpu_online_mask); } -__weak bool arch_freq_counters_available(const struct cpumask *cpus) +static void update_scale_freq_invariant(bool status) { - return false; + if (scale_freq_invariant == status) + return; + + /* + * Task scheduler behavior depends on frequency invariance support, + * either cpufreq or counter driven. If the support status changes as + * a result of counter initialisation and use, retrigger the build of + * scheduling domains to ensure the information is propagated properly. + */ + if (topology_scale_freq_invariant() == status) { + scale_freq_invariant = status; + rebuild_sched_domains_energy(); + } +} + +void topology_set_scale_freq_source(struct scale_freq_data *data, + const struct cpumask *cpus) +{ + struct scale_freq_data *sfd; + int cpu; + + /* + * Avoid calling rebuild_sched_domains() unnecessarily if FIE is + * supported by cpufreq. + */ + if (cpumask_empty(&scale_freq_counters_mask)) + scale_freq_invariant = topology_scale_freq_invariant(); + + for_each_cpu(cpu, cpus) { + sfd = per_cpu(sft_data, cpu); + + /* Use ARCH provided counters whenever possible */ + if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) { + per_cpu(sft_data, cpu) = data; + cpumask_set_cpu(cpu, &scale_freq_counters_mask); + } + } + + update_scale_freq_invariant(true); +} + +void topology_clear_scale_freq_source(enum scale_freq_source source, + const struct cpumask *cpus) +{ + struct scale_freq_data *sfd; + int cpu; + + for_each_cpu(cpu, cpus) { + sfd = per_cpu(sft_data, cpu); + + if (sfd && sfd->source == source) { + per_cpu(sft_data, cpu) = NULL; + cpumask_clear_cpu(cpu, &scale_freq_counters_mask); + } + } + + update_scale_freq_invariant(false); } + +void topology_scale_freq_tick(void) +{ + struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data); + + if (sfd) + sfd->set_freq_scale(); +} + DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, @@ -47,7 +121,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, * want to update the scale factor with information from CPUFREQ. * Instead the scale factor will be updated from arch_scale_freq_tick. */ - if (arch_freq_counters_available(cpus)) + if (supports_scale_freq_counters(cpus)) return; scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index c81f97a4a9a4..9dc94841803c 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -38,7 +38,19 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq); bool topology_scale_freq_invariant(void); -bool arch_freq_counters_available(const struct cpumask *cpus); +enum scale_freq_source { + SCALE_FREQ_SOURCE_CPUFREQ = 0, + SCALE_FREQ_SOURCE_ARCH, +}; + +struct scale_freq_data { + enum scale_freq_source source; + void (*set_freq_scale)(void); +}; + +void topology_scale_freq_tick(void); +void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); +void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); DECLARE_PER_CPU(unsigned long, thermal_pressure); -- Gitee From 3269802c33aa7bbe8d74323fee790752a2d7b6ba Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Mon, 14 Dec 2020 12:38:21 +0000 Subject: [PATCH 15/78] cppc_cpufreq: clarify support for coordination types mainline inclusion from mainline-v5.11-rc1 commit bf76bb208f2b653306f2fc8f9c2a22f9890702bd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=bf76bb208f2b653306f2fc8f9c2a22f9890702bd ---------------------------------------------------------------------- The previous coordination type handling in the cppc_cpufreq init code created some confusion: the comment mentioned "Support only SW_ANY for now" while only the SW_ALL/ALL case resulted in a failure. The other coordination types (HW_ALL/HW, NONE) were silently supported. Clarify support for coordination types while describing in comments the intended behavior. Signed-off-by: Ionela Voinescu Acked-by: Viresh Kumar Tested-by: Mian Yousaf Kaukab Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 83c6139ec163..fbee2b3f9492 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -261,7 +261,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; - int ret = 0; + int i, ret = 0; cpu_data->cpu = cpu; ret = cppc_get_perf_caps(cpu, caps); @@ -297,9 +297,13 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - int i; - + switch (policy->shared_type) { + case CPUFREQ_SHARED_TYPE_HW: + case CPUFREQ_SHARED_TYPE_NONE: + /* Nothing to be done - we'll have a policy for each CPU */ + break; + case CPUFREQ_SHARED_TYPE_ANY: + /* All CPUs in the domain will share a policy */ cpumask_copy(policy->cpus, cpu_data->shared_cpu_map); for_each_cpu(i, policy->cpus) { @@ -309,9 +313,10 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) memcpy(&all_cpu_data[i]->perf_caps, caps, sizeof(cpu_data->perf_caps)); } - } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) { - /* Support only SW_ANY for now. */ - pr_debug("Unsupported CPU co-ord type\n"); + break; + default: + pr_debug("Unsupported CPU co-ord type: %d\n", + policy->shared_type); return -EFAULT; } -- Gitee From 78b58248e3b4c03429d0a6be741b4d08b99a6431 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Mon, 14 Dec 2020 12:38:22 +0000 Subject: [PATCH 16/78] cppc_cpufreq: expose information on frequency domains mainline inclusion from mainline-v5.11-rc1 commit cfdc589f4b5f94bf1a975b4a67d8163d533f6e9b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=cfdc589f4b5f94bf1a975b4a67d8163d533f6e9b ---------------------------------------------------------------------- Use the existing sysfs attribute "freqdomain_cpus" to expose information to userspace about CPUs in the same frequency domain. Signed-off-by: Ionela Voinescu Acked-by: Viresh Kumar Tested-by: Mian Yousaf Kaukab Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- .../ABI/testing/sysfs-devices-system-cpu | 3 ++- drivers/cpufreq/cppc_cpufreq.c | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index bc59db0a75d2..66171d28ea64 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -264,7 +264,8 @@ Description: Discover CPUs in the same CPU frequency coordination domain attribute is useful for user space DVFS controllers to get better power/performance results for platforms using acpi-cpufreq. - This file is only present if the acpi-cpufreq driver is in use. + This file is only present if the acpi-cpufreq or the cppc-cpufreq + drivers are in use. What: /sys/devices/system/cpu/cpuX/cpufreq/auto_act_window Date: October 2024 diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index fbee2b3f9492..f09154e8a5f5 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -547,14 +547,24 @@ static ssize_t store_energy_perf(struct cpufreq_policy *policy, cpufreq_freq_attr_rw(auto_act_window); cpufreq_freq_attr_rw(energy_perf); +#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE + +static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf) +{ + unsigned int cpu = policy->cpu; + + return cpufreq_show_cpus(all_cpu_data[cpu]->shared_cpu_map, buf); +} +cpufreq_freq_attr_ro(freqdomain_cpus); static struct freq_attr *cppc_cpufreq_attr[] = { + &freqdomain_cpus, +#ifdef CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE &auto_act_window, &energy_perf, +#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE NULL, }; -#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE - static struct cpufreq_driver cppc_cpufreq_driver = { .flags = CPUFREQ_CONST_LOOPS, @@ -564,9 +574,7 @@ static struct cpufreq_driver cppc_cpufreq_driver = { .init = cppc_cpufreq_cpu_init, .stop_cpu = cppc_cpufreq_stop_cpu, .set_boost = cppc_cpufreq_set_boost, -#ifdef CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE .attr = cppc_cpufreq_attr, -#endif .name = "cppc_cpufreq", }; -- Gitee From 1a5a2231259ffcf9251912301a94226f6951fc6d Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Mon, 14 Dec 2020 12:38:23 +0000 Subject: [PATCH 17/78] cppc_cpufreq: replace per-cpu data array with a list mainline inclusion from mainline-v5.11-rc1 commit a28b2bfc099c6b9caa6ef697660408e076a32019 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=a28b2bfc099c6b9caa6ef697660408e076a32019 ---------------------------------------------------------------------- The cppc_cpudata per-cpu storage was inefficient (1) additional to causing functional issues (2) when CPUs are hotplugged out, due to per-cpu data being improperly initialised. (1) The amount of information needed for CPPC performance control in its cpufreq driver depends on the domain (PSD) coordination type: ANY: One set of CPPC control and capability data (e.g desired performance, highest/lowest performance, etc) applies to all CPUs in the domain. ALL: Same as ANY. To be noted that this type is not currently supported. When supported, information about which CPUs belong to a domain is needed in order for frequency change requests to be sent to each of them. HW: It's necessary to store CPPC control and capability information for all the CPUs. HW will then coordinate the performance state based on their limitations and requests. NONE: Same as HW. No HW coordination is expected. Despite this, the previous initialisation code would indiscriminately allocate memory for all CPUs (all_cpu_data) and unnecessarily duplicate performance capabilities and the domain sharing mask and type for each possible CPU. (2) With the current per-cpu structure, when having ANY coordination, the cppc_cpudata cpu information is not initialised (will remain 0) for all CPUs in a policy, other than policy->cpu. When policy->cpu is hotplugged out, the driver will incorrectly use the uninitialised (0) value of the other CPUs when making frequency changes. Additionally, the previous values stored in the perf_ctrls.desired_perf will be lost when policy->cpu changes. Therefore replace the array of per cpu data with a list. The memory for each structure is allocated at policy init, where a single structure can be allocated per policy, not per cpu. In order to accommodate the struct list_head node in the cppc_cpudata structure, the now unused cpu and cur_policy variables are removed. For example, on a arm64 Juno platform with 6 CPUs: (0, 1, 2, 3) in PSD1, (4, 5) in PSD2 - ANY coordination, the memory allocation comparison shows: Before patch: - ANY coordination: total slack req alloc/free caller 0 0 0 0/1 _kernel_size_le_hi32+0x0xffff800008ff7810 0 0 0 0/6 _kernel_size_le_hi32+0x0xffff800008ff7808 128 80 48 1/0 _kernel_size_le_hi32+0x0xffff800008ffc070 768 0 768 6/0 _kernel_size_le_hi32+0x0xffff800008ffc0e4 After patch: - ANY coordination: total slack req alloc/free caller 256 0 256 2/0 _kernel_size_le_hi32+0x0xffff800008fed410 0 0 0 0/2 _kernel_size_le_hi32+0x0xffff800008fed274 Additional notes: - A pointer to the policy's cppc_cpudata is stored in policy->driver_data - Driver registration is skipped if _CPC entries are not present. Signed-off-by: Ionela Voinescu Tested-by: Mian Yousaf Kaukab Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/acpi/cppc_acpi.c | 130 +++++++++---------------- drivers/cpufreq/cppc_cpufreq.c | 173 +++++++++++++++++---------------- include/acpi/cppc_acpi.h | 5 +- 3 files changed, 139 insertions(+), 169 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index ad895e437df9..0ba9838ce64b 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -441,108 +441,72 @@ bool acpi_cpc_valid(void) EXPORT_SYMBOL_GPL(acpi_cpc_valid); /** - * acpi_get_psd_map - Map the CPUs in a common freq domain. - * @all_cpu_data: Ptrs to CPU specific CPPC data including PSD info. + * acpi_get_psd_map - Map the CPUs in the freq domain of a given cpu + * @cpu: Find all CPUs that share a domain with cpu. + * @cpu_data: Pointer to CPU specific CPPC data including PSD info. * * Return: 0 for success or negative value for err. */ -static int __acpi_get_psd_map(struct cppc_cpudata **all_cpu_data, struct cpc_desc **cpc_pptr) +static int __acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data, struct cpc_desc **cpc_pptr) { - int count_target; - int retval = 0; - unsigned int i, j; - cpumask_var_t covered_cpus; - struct cppc_cpudata *pr, *match_pr; - struct acpi_psd_package *pdomain; - struct acpi_psd_package *match_pdomain; struct cpc_desc *cpc_ptr, *match_cpc_ptr; - - if (!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)) - return -ENOMEM; + struct acpi_psd_package *match_pdomain; + struct acpi_psd_package *pdomain; + int count_target, i; /* * Now that we have _PSD data from all CPUs, let's setup P-state * domain info. */ - for_each_possible_cpu(i) { - if (cpumask_test_cpu(i, covered_cpus)) - continue; - - pr = all_cpu_data[i]; - cpc_ptr = cpc_pptr[i]; - if (!cpc_ptr) { - retval = -EFAULT; - goto err_ret; - } + cpc_ptr = cpc_pptr[cpu]; + if (!cpc_ptr) + return -EFAULT; - pdomain = &(cpc_ptr->domain_info); - cpumask_set_cpu(i, pr->shared_cpu_map); - cpumask_set_cpu(i, covered_cpus); - if (pdomain->num_processors <= 1) - continue; + pdomain = &(cpc_ptr->domain_info); + cpumask_set_cpu(cpu, cpu_data->shared_cpu_map); + if (pdomain->num_processors <= 1) + return 0; - /* Validate the Domain info */ - count_target = pdomain->num_processors; - if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) - pr->shared_type = CPUFREQ_SHARED_TYPE_ALL; - else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) - pr->shared_type = CPUFREQ_SHARED_TYPE_HW; - else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) - pr->shared_type = CPUFREQ_SHARED_TYPE_ANY; - - for_each_possible_cpu(j) { - if (i == j) - continue; - - match_cpc_ptr = cpc_pptr[j]; - if (!match_cpc_ptr) { - retval = -EFAULT; - goto err_ret; - } + /* Validate the Domain info */ + count_target = pdomain->num_processors; + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_HW; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ANY; - match_pdomain = &(match_cpc_ptr->domain_info); - if (match_pdomain->domain != pdomain->domain) - continue; + for_each_possible_cpu(i) { + if (i == cpu) + continue; - /* Here i and j are in the same domain */ - if (match_pdomain->num_processors != count_target) { - retval = -EFAULT; - goto err_ret; - } + match_cpc_ptr = cpc_pptr[i]; + if (!match_cpc_ptr) + goto err_fault; - if (pdomain->coord_type != match_pdomain->coord_type) { - retval = -EFAULT; - goto err_ret; - } + match_pdomain = &(match_cpc_ptr->domain_info); + if (match_pdomain->domain != pdomain->domain) + continue; - cpumask_set_cpu(j, covered_cpus); - cpumask_set_cpu(j, pr->shared_cpu_map); - } + /* Here i and cpu are in the same domain */ + if (match_pdomain->num_processors != count_target) + goto err_fault; - for_each_cpu(j, pr->shared_cpu_map) { - if (i == j) - continue; + if (pdomain->coord_type != match_pdomain->coord_type) + goto err_fault; - match_pr = all_cpu_data[j]; - match_pr->shared_type = pr->shared_type; - cpumask_copy(match_pr->shared_cpu_map, - pr->shared_cpu_map); - } + cpumask_set_cpu(i, cpu_data->shared_cpu_map); } - goto out; -err_ret: - for_each_possible_cpu(i) { - pr = all_cpu_data[i]; + return 0; - /* Assume no coordination on any error parsing domain info */ - cpumask_clear(pr->shared_cpu_map); - cpumask_set_cpu(i, pr->shared_cpu_map); - pr->shared_type = CPUFREQ_SHARED_TYPE_ALL; - } -out: - free_cpumask_var(covered_cpus); - return retval; +err_fault: + /* Assume no coordination on any error parsing domain info */ + cpumask_clear(cpu_data->shared_cpu_map); + cpumask_set_cpu(cpu, cpu_data->shared_cpu_map); + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ALL; + + return -EFAULT; } static acpi_status acpi_parse_cpc(acpi_handle handle, u32 lvl, void *data, @@ -586,7 +550,7 @@ static acpi_status acpi_parse_cpc(acpi_handle handle, u32 lvl, void *data, return AE_OK; } -int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) +int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data) { struct cpc_desc **cpc_pptr, *cpc_ptr; int parsed_core_num = 0; @@ -617,7 +581,7 @@ int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) goto out; } - ret = __acpi_get_psd_map(all_cpu_data, cpc_pptr); + ret = __acpi_get_psd_map(cpu, cpu_data, cpc_pptr); out: for_each_possible_cpu(i) { diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index f09154e8a5f5..d57fb570a006 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -30,13 +30,13 @@ #define DMI_PROCESSOR_MAX_SPEED 0x14 /* - * These structs contain information parsed from per CPU - * ACPI _CPC structures. - * e.g. For each CPU the highest, lowest supported - * performance capabilities, desired performance level - * requested etc. + * This list contains information parsed from per CPU ACPI _CPC and _PSD + * structures: e.g. the highest and lowest supported performance, capabilities, + * desired performance, level requested etc. Depending on the share_type, not + * all CPUs will have an entry in the list. */ -static struct cppc_cpudata **all_cpu_data; +static LIST_HEAD(cpu_data_list); + static bool boost_supported; struct cppc_workaround_oem_info { @@ -166,7 +166,7 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; unsigned int cpu = policy->cpu; struct cpufreq_freqs freqs; u32 desired_perf; @@ -200,7 +200,7 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; int ret; @@ -211,6 +211,12 @@ static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) if (ret) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", caps->lowest_perf, cpu, ret); + + /* Remove CPU node from list and free driver data for policy */ + free_cpumask_var(cpu_data->shared_cpu_map); + list_del(&cpu_data->node); + kfree(policy->driver_data); + policy->driver_data = NULL; } /* @@ -256,25 +262,61 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) } #endif -static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) + +static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - unsigned int cpu = policy->cpu; - int i, ret = 0; + struct cppc_cpudata *cpu_data; + int ret; - cpu_data->cpu = cpu; - ret = cppc_get_perf_caps(cpu, caps); + cpu_data = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL); + if (!cpu_data) + goto out; + + if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL)) + goto free_cpu; + ret = acpi_get_psd_map(cpu, cpu_data); if (ret) { - pr_debug("Err reading CPU%d perf capabilities. ret:%d\n", - cpu, ret); - return ret; + pr_debug("Err parsing CPU%d PSD data: ret:%d\n", cpu, ret); + goto free_mask; + } + + ret = cppc_get_perf_caps(cpu, &cpu_data->perf_caps); + if (ret) { + pr_debug("Err reading CPU%d perf caps: ret:%d\n", cpu, ret); + goto free_mask; } /* Convert the lowest and nominal freq from MHz to KHz */ - caps->lowest_freq *= 1000; - caps->nominal_freq *= 1000; + cpu_data->perf_caps.lowest_freq *= 1000; + cpu_data->perf_caps.nominal_freq *= 1000; + + list_add(&cpu_data->node, &cpu_data_list); + + return cpu_data; + +free_mask: + free_cpumask_var(cpu_data->shared_cpu_map); +free_cpu: + kfree(cpu_data); +out: + return NULL; +} + +static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct cppc_cpudata *cpu_data; + struct cppc_perf_caps *caps; + int ret; + + cpu_data = cppc_cpufreq_get_cpu_data(cpu); + if (!cpu_data) { + pr_err("Error in acquiring _CPC/_PSD data for CPU%d.\n", cpu); + return -ENODEV; + } + caps = &cpu_data->perf_caps; + policy->driver_data = cpu_data; /* * Set min to lowest nonlinear perf to avoid any efficiency penalty (see @@ -303,16 +345,12 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) /* Nothing to be done - we'll have a policy for each CPU */ break; case CPUFREQ_SHARED_TYPE_ANY: - /* All CPUs in the domain will share a policy */ + /* + * All CPUs in the domain will share a policy and all cpufreq + * operations will use a single cppc_cpudata structure stored + * in policy->driver_data. + */ cpumask_copy(policy->cpus, cpu_data->shared_cpu_map); - - for_each_cpu(i, policy->cpus) { - if (unlikely(i == cpu)) - continue; - - memcpy(&all_cpu_data[i]->perf_caps, caps, - sizeof(cpu_data->perf_caps)); - } break; default: pr_debug("Unsupported CPU co-ord type: %d\n", @@ -320,8 +358,6 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) return -EFAULT; } - cpu_data->cur_policy = policy; - /* * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost * is supported. @@ -405,9 +441,12 @@ static int cppc_get_perf_ctrs_sample(void *val) static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) { struct fb_ctr_pair fb_ctrs = { .cpu = cpu, }; - struct cppc_cpudata *cpu_data = all_cpu_data[cpu]; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cppc_cpudata *cpu_data = policy->driver_data; int ret; + cpufreq_cpu_put(policy); + if (cpu_has_amu_feat(cpu)) ret = smp_call_on_cpu(cpu, cppc_get_perf_ctrs_sample, &fb_ctrs, false); @@ -424,7 +463,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; struct cppc_perf_caps *caps = &cpu_data->perf_caps; int ret; @@ -551,9 +590,9 @@ cpufreq_freq_attr_rw(energy_perf); static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf) { - unsigned int cpu = policy->cpu; + struct cppc_cpudata *cpu_data = policy->driver_data; - return cpufreq_show_cpus(all_cpu_data[cpu]->shared_cpu_map, buf); + return cpufreq_show_cpus(cpu_data->shared_cpu_map, buf); } cpufreq_freq_attr_ro(freqdomain_cpus); @@ -586,10 +625,13 @@ static struct cpufreq_driver cppc_cpufreq_driver = { */ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) { - struct cppc_cpudata *cpu_data = all_cpu_data[cpu]; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cppc_cpudata *cpu_data = policy->driver_data; u64 desired_perf; int ret; + cpufreq_cpu_put(policy); + ret = cppc_get_desired_perf(cpu, &desired_perf); if (ret < 0) return -EIO; @@ -622,68 +664,33 @@ static void cppc_check_hisi_workaround(void) static int __init cppc_cpufreq_init(void) { - int i, ret = 0; - struct cppc_cpudata *cpu_data; - - if (acpi_disabled) + if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; - all_cpu_data = kcalloc(num_possible_cpus(), sizeof(void *), - GFP_KERNEL); - if (!all_cpu_data) - return -ENOMEM; - - for_each_possible_cpu(i) { - all_cpu_data[i] = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL); - if (!all_cpu_data[i]) - goto out; - - cpu_data = all_cpu_data[i]; - if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL)) - goto out; - } - - ret = acpi_get_psd_map(all_cpu_data); - if (ret) { - pr_debug("Error parsing PSD data. Aborting cpufreq registration.\n"); - goto out; - } + INIT_LIST_HEAD(&cpu_data_list); cppc_check_hisi_workaround(); - ret = cpufreq_register_driver(&cppc_cpufreq_driver); - if (ret) - goto out; + return cpufreq_register_driver(&cppc_cpufreq_driver); +} - return ret; +static inline void free_cpu_data(void) +{ + struct cppc_cpudata *iter, *tmp; -out: - for_each_possible_cpu(i) { - cpu_data = all_cpu_data[i]; - if (!cpu_data) - break; - free_cpumask_var(cpu_data->shared_cpu_map); - kfree(cpu_data); + list_for_each_entry_safe(iter, tmp, &cpu_data_list, node) { + free_cpumask_var(iter->shared_cpu_map); + list_del(&iter->node); + kfree(iter); } - kfree(all_cpu_data); - return -ENODEV; } static void __exit cppc_cpufreq_exit(void) { - struct cppc_cpudata *cpu_data; - int i; - cpufreq_unregister_driver(&cppc_cpufreq_driver); - for_each_possible_cpu(i) { - cpu_data = all_cpu_data[i]; - free_cpumask_var(cpu_data->shared_cpu_map); - kfree(cpu_data); - } - - kfree(all_cpu_data); + free_cpu_data(); } module_exit(cppc_cpufreq_exit); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 95c7d27cd788..e20a92ee5493 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -124,11 +124,10 @@ struct cppc_perf_fb_ctrs { /* Per CPU container for runtime CPPC management. */ struct cppc_cpudata { - int cpu; + struct list_head node; struct cppc_perf_caps perf_caps; struct cppc_perf_ctrls perf_ctrls; struct cppc_perf_fb_ctrs perf_fb_ctrs; - struct cpufreq_policy *cur_policy; unsigned int shared_type; cpumask_var_t shared_cpu_map; }; @@ -143,8 +142,8 @@ extern int cppc_set_auto_sel(int cpu, bool enable); extern int cppc_set_epp(int cpu, u64 epp_val); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); -extern int acpi_get_psd_map(struct cppc_cpudata **); extern bool acpi_cpc_valid(void); +extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); -- Gitee From f248dcb7476cd3c2638af7b1a67280449f6fcce7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Jun 2021 09:54:39 +0530 Subject: [PATCH 18/78] cpufreq: CPPC: Migrate to ->exit() callback instead of ->stop_cpu() mainline inclusion from mainline-v5.14-rc1 commit 9357a380f90a89a168d505561d11f68272e0e768 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=9357a380f90a89a168d505561d11f68272e0e768 ---------------------------------------------------------------------- Commit 367dc4aa932b ("cpufreq: Add stop CPU callback to cpufreq_driver interface") added the ->stop_cpu() callback to allow the drivers to do clean up before the CPU is completely down and its state can't be modified. At that time the CPU hotplug framework used to call the cpufreq core's registered notifier for different events like CPU_DOWN_PREPARE and CPU_POST_DEAD. The ->stop_cpu() callback was called during the CPU_DOWN_PREPARE event. This is no longer the case, cpuhp_cpufreq_offline() is called only once by the CPU hotplug core now and we don't really need two separate callbacks for cpufreq drivers, i.e. ->stop_cpu() and -exit() callback itself. Migrate to using the ->exit() callback instead of ->stop_cpu(). Signed-off-by: Viresh Kumar [ rjw: Minor edits in the changelog and subject ] Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 46 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index d57fb570a006..81c7dcb369fd 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -198,27 +198,6 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) return 0; } -static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) -{ - struct cppc_cpudata *cpu_data = policy->driver_data; - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - unsigned int cpu = policy->cpu; - int ret; - - cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; - - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) - pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - caps->lowest_perf, cpu, ret); - - /* Remove CPU node from list and free driver data for policy */ - free_cpumask_var(cpu_data->shared_cpu_map); - list_del(&cpu_data->node); - kfree(policy->driver_data); - policy->driver_data = NULL; -} - /* * The PCC subspace describes the rate at which platform can accept commands * on the shared PCC channel (including READs which do not count towards freq @@ -377,6 +356,29 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) return ret; } +static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; + unsigned int cpu = policy->cpu; + int ret; + + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; + + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); + if (ret) + pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", + caps->lowest_perf, cpu, ret); + + /* Remove CPU node from list and free driver data for policy */ + free_cpumask_var(cpu_data->shared_cpu_map); + list_del(&cpu_data->node); + kfree(policy->driver_data); + policy->driver_data = NULL; + + return 0; +} + static inline u64 get_delta(u64 t1, u64 t0) { if (t1 > t0 || t0 > ~(u32)0) @@ -611,7 +613,7 @@ static struct cpufreq_driver cppc_cpufreq_driver = { .target = cppc_cpufreq_set_target, .get = cppc_cpufreq_get_rate, .init = cppc_cpufreq_cpu_init, - .stop_cpu = cppc_cpufreq_stop_cpu, + .exit = cppc_cpufreq_cpu_exit, .set_boost = cppc_cpufreq_set_boost, .attr = cppc_cpufreq_attr, .name = "cppc_cpufreq", -- Gitee From 3083467c1057dc11db75b65c7ccf84ca069eb2ef Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Jun 2021 13:31:27 +0530 Subject: [PATCH 19/78] cpufreq: CPPC: Fix potential memleak in cppc_cpufreq_cpu_init mainline inclusion from mainline-v5.14-rc1 commit fe2535a44904a77615a3af8e8fd7dafb98fb0e1b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=fe2535a44904a77615a3af8e8fd7dafb98fb0e1b ---------------------------------------------------------------------- It's a classic example of memleak, we allocate something, we fail and never free the resources. Make sure we free all resources on policy ->init() failures. Fixes: a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list") Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 81c7dcb369fd..f3dfe54c0dfd 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -282,6 +282,16 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) return NULL; } +static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + + list_del(&cpu_data->node); + free_cpumask_var(cpu_data->shared_cpu_map); + kfree(cpu_data); + policy->driver_data = NULL; +} + static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; @@ -334,7 +344,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) default: pr_debug("Unsupported CPU co-ord type: %d\n", policy->shared_type); - return -EFAULT; + ret = -EFAULT; + goto out; } /* @@ -349,10 +360,16 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) cpu_data->perf_ctrls.desired_perf = caps->nominal_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) + if (ret) { pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", caps->nominal_perf, cpu, ret); + goto out; + } + + return 0; +out: + cppc_cpufreq_put_cpu_data(policy); return ret; } @@ -370,12 +387,7 @@ static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", caps->lowest_perf, cpu, ret); - /* Remove CPU node from list and free driver data for policy */ - free_cpumask_var(cpu_data->shared_cpu_map); - list_del(&cpu_data->node); - kfree(policy->driver_data); - policy->driver_data = NULL; - + cppc_cpufreq_put_cpu_data(policy); return 0; } -- Gitee From 42e886f75e0189a0fc427505cb72804a29602309 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Jun 2021 13:42:23 +0530 Subject: [PATCH 20/78] cpufreq: CPPC: Pass structure instance by reference mainline inclusion from mainline-v5.14-rc1 commit eead1840cbd31e553bf8ccdefbd5b065bf596b71 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=eead1840cbd31e553bf8ccdefbd5b065bf596b71 ---------------------------------------------------------------------- Don't pass structure instance by value, pass it by reference instead. Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index f3dfe54c0dfd..36627a0a3c3c 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -400,18 +400,18 @@ static inline u64 get_delta(u64 t1, u64 t0) } static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs fb_ctrs_t0, - struct cppc_perf_fb_ctrs fb_ctrs_t1) + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; u64 reference_perf, delivered_perf; - reference_perf = fb_ctrs_t0.reference_perf; + reference_perf = fb_ctrs_t0->reference_perf; - delta_reference = get_delta(fb_ctrs_t1.reference, - fb_ctrs_t0.reference); - delta_delivered = get_delta(fb_ctrs_t1.delivered, - fb_ctrs_t0.delivered); + delta_reference = get_delta(fb_ctrs_t1->reference, + fb_ctrs_t0->reference); + delta_delivered = get_delta(fb_ctrs_t1->delivered, + fb_ctrs_t0->delivered); /* Check to avoid divide-by zero */ if (delta_reference || delta_delivered) @@ -471,8 +471,8 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) return ret; return cppc_get_rate_from_fbctrs(cpu_data, - fb_ctrs.fb_ctrs_t0, - fb_ctrs.fb_ctrs_t1); + &fb_ctrs.fb_ctrs_t0, + &fb_ctrs.fb_ctrs_t1); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) -- Gitee From a50dd12c3fa31a2945e230a63ff23413cb052627 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Mar 2021 08:25:27 +0530 Subject: [PATCH 21/78] arch_topology: Export arch_freq_scale and helpers mainline inclusion from mainline-v5.13-rc1 commit 2f5339582e7b540851bfb37e184d4dee0ab9e387 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=2f5339582e7b540851bfb37e184d4dee0ab9e387 ---------------------------------------------------------------------- It is possible now for other parts of the kernel to provide their own implementation of sched_freq_tick() and they can very well be modules themselves (like CPPC cpufreq driver, which is going to use these in a later commit). Export arch_freq_scale and topology_{set|clear}_scale_freq_source(). Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Tested-by: Vincent Guittot Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/base/arch_topology.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index be39683e5b77..3e7d15e25c22 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -78,6 +78,7 @@ void topology_set_scale_freq_source(struct scale_freq_data *data, update_scale_freq_invariant(true); } +EXPORT_SYMBOL_GPL(topology_set_scale_freq_source); void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus) @@ -96,6 +97,7 @@ void topology_clear_scale_freq_source(enum scale_freq_source source, update_scale_freq_invariant(false); } +EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source); void topology_scale_freq_tick(void) { @@ -106,6 +108,7 @@ void topology_scale_freq_tick(void) } DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) -- Gitee From d84eb2ad2679d52d1cb027b90e28240fc49571fb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 23 Jun 2020 15:49:40 +0530 Subject: [PATCH 22/78] cpufreq: CPPC: Add support for frequency invariance mainline inclusion from mainline-v5.14-rc1 commit 1eb5dde674f57b1a1918dab33f09e35cdd64eb07 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=1eb5dde674f57b1a1918dab33f09e35cdd64eb07 ---------------------------------------------------------------------- The Frequency Invariance Engine (FIE) is providing a frequency scaling correction factor that helps achieve more accurate load-tracking. Normally, this scaling factor can be obtained directly with the help of the cpufreq drivers as they know the exact frequency the hardware is running at. But that isn't the case for CPPC cpufreq driver. Another way of obtaining that is using the arch specific counter support, which is already present in kernel, but that hardware is optional for platforms. This patch updates the CPPC driver to register itself with the topology core to provide its own implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which gets called by the scheduler on every tick. Note that the arch specific counters have higher priority than CPPC counters, if available, though the CPPC driver doesn't need to have any special handling for that. On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we reach here from hard-irq context), which then schedules a normal work item and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable based on the counter updates since the last tick. To allow platforms to disable this CPPC counter-based frequency invariance support, this is all done under CONFIG_ACPI_CPPC_CPUFREQ_FIE, which is enabled by default. This also exports sched_setattr_nocheck() as the CPPC driver can be built as a module. Cc: linux-acpi@vger.kernel.org Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/Kconfig.arm | 10 ++ drivers/cpufreq/cppc_cpufreq.c | 255 +++++++++++++++++++++++++++++++-- include/linux/arch_topology.h | 1 + kernel/sched/core.c | 1 + 4 files changed, 252 insertions(+), 15 deletions(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 67e574d0d481..1280a1ec86d6 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ If in doubt, say N. +config ACPI_CPPC_CPUFREQ_FIE + bool "Frequency Invariance support for CPPC cpufreq driver" + depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY + default y + help + This extends frequency invariance support in the CPPC cpufreq driver, + by using CPPC delivered and reference performance counters. + + If in doubt, say N. + config CPPC_CPUFREQ_SYSFS_INTERFACE bool "Enable CPPC CPUFreq sysfs tuning interfaces" depends on ACPI_CPPC_CPUFREQ diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 36627a0a3c3c..e551a8ce3232 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -10,14 +10,18 @@ #define pr_fmt(fmt) "CPPC Cpufreq:" fmt +#include #include #include #include #include #include #include +#include +#include #include #include +#include #include @@ -63,6 +67,216 @@ struct fb_ctr_pair { struct cppc_perf_fb_ctrs fb_ctrs_t1; }; +#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE + +/* Frequency invariance support */ +struct cppc_freq_invariance { + int cpu; + struct irq_work irq_work; + struct kthread_work work; + struct cppc_perf_fb_ctrs prev_perf_fb_ctrs; + struct cppc_cpudata *cpu_data; +}; + +static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); +static struct kthread_worker *kworker_fie; + +static struct cpufreq_driver cppc_cpufreq_driver; +static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu); +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1); + +/** + * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance + * @work: The work item. + * + * The CPPC driver register itself with the topology core to provide its own + * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which + * gets called by the scheduler on every tick. + * + * Note that the arch specific counters have higher priority than CPPC counters, + * if available, though the CPPC driver doesn't need to have any special + * handling for that. + * + * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we + * reach here from hard-irq context), which then schedules a normal work item + * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable + * based on the counter updates since the last tick. + */ +static void cppc_scale_freq_workfn(struct kthread_work *work) +{ + struct cppc_freq_invariance *cppc_fi; + struct cppc_perf_fb_ctrs fb_ctrs = {0}; + struct cppc_cpudata *cpu_data; + unsigned long local_freq_scale; + u64 perf; + + cppc_fi = container_of(work, struct cppc_freq_invariance, work); + cpu_data = cppc_fi->cpu_data; + + if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) { + pr_warn("%s: failed to read perf counters\n", __func__); + return; + } + + perf = cppc_perf_from_fbctrs(cpu_data, &cppc_fi->prev_perf_fb_ctrs, + &fb_ctrs); + cppc_fi->prev_perf_fb_ctrs = fb_ctrs; + + perf <<= SCHED_CAPACITY_SHIFT; + local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf); + + /* This can happen due to counter's overflow */ + if (unlikely(local_freq_scale > 1024)) + local_freq_scale = 1024; + + per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale; +} + +static void cppc_irq_work(struct irq_work *irq_work) +{ + struct cppc_freq_invariance *cppc_fi; + + cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work); + kthread_queue_work(kworker_fie, &cppc_fi->work); +} + +static void cppc_scale_freq_tick(void) +{ + struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id()); + + /* + * cppc_get_perf_ctrs() can potentially sleep, call that from the right + * context. + */ + irq_work_queue(&cppc_fi->irq_work); +} + +static struct scale_freq_data cppc_sftd = { + .source = SCALE_FREQ_SOURCE_CPPC, + .set_freq_scale = cppc_scale_freq_tick, +}; + +static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ + struct cppc_freq_invariance *cppc_fi; + int cpu, ret; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + for_each_cpu(cpu, policy->cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + cppc_fi->cpu = cpu; + cppc_fi->cpu_data = policy->driver_data; + kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn); + init_irq_work(&cppc_fi->irq_work, cppc_irq_work); + + ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); + if (ret) { + pr_warn("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + + /* + * Don't abort if the CPU was offline while the driver + * was getting registered. + */ + if (cpu_online(cpu)) + return; + } + } + + /* Register for freq-invariance */ + topology_set_scale_freq_source(&cppc_sftd, policy->cpus); +} + +/* + * We free all the resources on policy's removal and not on CPU removal as the + * irq-work are per-cpu and the hotplug core takes care of flushing the pending + * irq-works (hint: smpcfd_dying_cpu()) on CPU hotplug. Even if the kthread-work + * fires on another CPU after the concerned CPU is removed, it won't harm. + * + * We just need to make sure to remove them all on policy->exit(). + */ +static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ + struct cppc_freq_invariance *cppc_fi; + int cpu; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + /* policy->cpus will be empty here, use related_cpus instead */ + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, policy->related_cpus); + + for_each_cpu(cpu, policy->related_cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + irq_work_sync(&cppc_fi->irq_work); + kthread_cancel_work_sync(&cppc_fi->work); + } +} + +static void __init cppc_freq_invariance_init(void) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; + int ret; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kworker_fie = kthread_create_worker(0, "cppc_fie"); + if (IS_ERR(kworker_fie)) + return; + + ret = sched_setattr_nocheck(kworker_fie->task, &attr); + if (ret) { + pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__, + ret); + kthread_destroy_worker(kworker_fie); + return; + } +} + +static void cppc_freq_invariance_exit(void) +{ + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kthread_destroy_worker(kworker_fie); + kworker_fie = NULL; +} + +#else +static inline void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_freq_invariance_init(void) +{ +} + +static inline void cppc_freq_invariance_exit(void) +{ +} +#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */ + /* Callback function used to retrieve the max frequency from DMI */ static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) { @@ -366,6 +580,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) goto out; } + cppc_cpufreq_cpu_fie_init(policy); return 0; out: @@ -380,6 +595,8 @@ static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) unsigned int cpu = policy->cpu; int ret; + cppc_cpufreq_cpu_fie_exit(policy); + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); @@ -399,12 +616,12 @@ static inline u64 get_delta(u64 t1, u64 t0) return (u32)t1 - (u32)t0; } -static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs *fb_ctrs_t0, - struct cppc_perf_fb_ctrs *fb_ctrs_t1) +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; - u64 reference_perf, delivered_perf; + u64 reference_perf; reference_perf = fb_ctrs_t0->reference_perf; @@ -413,14 +630,11 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, delta_delivered = get_delta(fb_ctrs_t1->delivered, fb_ctrs_t0->delivered); - /* Check to avoid divide-by zero */ - if (delta_reference || delta_delivered) - delivered_perf = (reference_perf * delta_delivered) / - delta_reference; - else - delivered_perf = cpu_data->perf_ctrls.desired_perf; + /* Check to avoid divide-by zero and invalid delivered_perf */ + if (!delta_reference || !delta_delivered) + return cpu_data->perf_ctrls.desired_perf; - return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); + return (reference_perf * delta_delivered) / delta_reference; } static int cppc_get_perf_ctrs_sample(void *val) @@ -457,6 +671,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) struct fb_ctr_pair fb_ctrs = { .cpu = cpu, }; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct cppc_cpudata *cpu_data = policy->driver_data; + u64 delivered_perf; int ret; cpufreq_cpu_put(policy); @@ -470,9 +685,11 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) if (ret) return ret; - return cppc_get_rate_from_fbctrs(cpu_data, - &fb_ctrs.fb_ctrs_t0, - &fb_ctrs.fb_ctrs_t1); + delivered_perf = cppc_perf_from_fbctrs(cpu_data, + &fb_ctrs.fb_ctrs_t0, + &fb_ctrs.fb_ctrs_t1); + + return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) @@ -678,14 +895,21 @@ static void cppc_check_hisi_workaround(void) static int __init cppc_cpufreq_init(void) { + int ret; + if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; INIT_LIST_HEAD(&cpu_data_list); cppc_check_hisi_workaround(); + cppc_freq_invariance_init(); - return cpufreq_register_driver(&cppc_cpufreq_driver); + ret = cpufreq_register_driver(&cppc_cpufreq_driver); + if (ret) + cppc_freq_invariance_exit(); + + return ret; } static inline void free_cpu_data(void) @@ -703,6 +927,7 @@ static inline void free_cpu_data(void) static void __exit cppc_cpufreq_exit(void) { cpufreq_unregister_driver(&cppc_cpufreq_driver); + cppc_freq_invariance_exit(); free_cpu_data(); } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 9dc94841803c..dea46128b2da 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -41,6 +41,7 @@ bool topology_scale_freq_invariant(void); enum scale_freq_source { SCALE_FREQ_SOURCE_CPUFREQ = 0, SCALE_FREQ_SOURCE_ARCH, + SCALE_FREQ_SOURCE_CPPC, }; struct scale_freq_data { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5d2937935b47..74423f98ddf9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6639,6 +6639,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -- Gitee From 188087eab3b787d68dea469789de65e93eaadc6f Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Fri, 13 Jun 2025 15:27:49 +0800 Subject: [PATCH 23/78] cpufreq: CPPC: Don't warn on failing to read perf counters on offline cpus driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Reading perf counters on offline cpus should be expected to fail, e.g. it returns -EFAULT as counters are shown to be 0. Remove the unnecessary warning print on this failure path. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Jie Zhan Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e551a8ce3232..c79d6d9c9412 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -174,16 +174,14 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) init_irq_work(&cppc_fi->irq_work, cppc_irq_work); ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); - if (ret) { - pr_warn("%s: failed to read perf counters for cpu:%d: %d\n", - __func__, cpu, ret); - + if (ret && cpu_online(cpu)) { /* * Don't abort if the CPU was offline while the driver * was getting registered. */ - if (cpu_online(cpu)) - return; + pr_debug("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + return; } } -- Gitee From a19c1ab25f230328838a0c17aef011b862b97224 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Fri, 13 Jun 2025 15:27:50 +0800 Subject: [PATCH 24/78] cpufreq: CPPC: Fix error handling in cppc_scale_freq_workfn() driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Perf counters could be 0 if the cpu is in a low-power idle state. Just try it again next time and update the frequency scale when the cpu is active and perf counters successfully return. Also, remove the FIE source on an actual failure. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Jie Zhan Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index c79d6d9c9412..379c7793a38f 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -111,12 +111,23 @@ static void cppc_scale_freq_workfn(struct kthread_work *work) struct cppc_cpudata *cpu_data; unsigned long local_freq_scale; u64 perf; + int ret; cppc_fi = container_of(work, struct cppc_freq_invariance, work); cpu_data = cppc_fi->cpu_data; - if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) { + ret = cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs); + /* + * Perf counters could be 0 if the cpu is in a low-power idle state. + * Just try it again next time. + */ + if (ret == -EFAULT) + return; + + if (ret) { pr_warn("%s: failed to read perf counters\n", __func__); + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, + cpu_data->shared_cpu_map); return; } -- Gitee From c00a3b8d82186d21aac9f473209f8b611ce63131 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 8 Oct 2024 11:09:18 +0000 Subject: [PATCH 25/78] cppc_cpufreq: Fix possible null pointer dereference stable inclusion from stable-v6.6.33 commit f84b9b25d045e67a7eee5e73f21278c8ab06713c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f84b9b25d045e67a7eee5e73f21278c8ab06713c -------------------------------- [ Upstream commit cf7de25878a1f4508c69dc9f6819c21ba177dbfe ] cppc_cpufreq_get_rate() and hisi_cppc_cpufreq_get_rate() can be called from different places with various parameters. So cpufreq_cpu_get() can return null as 'policy' in some circumstances. Fix this bug by adding null return check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list") Signed-off-by: Aleksandr Mishin Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin Conflicts: drivers/cpufreq/cppc_cpufreq.c [Context conflict] Signed-off-by: ZhangPeng Signed-off-by: Liu Mingrui Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 379c7793a38f..e7a078878d2a 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -679,10 +679,15 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) { struct fb_ctr_pair fb_ctrs = { .cpu = cpu, }; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_cpudata *cpu_data; u64 delivered_perf; int ret; + if (!policy) + return -ENODEV; + + cpu_data = policy->driver_data; + cpufreq_cpu_put(policy); if (cpu_has_amu_feat(cpu)) @@ -866,10 +871,15 @@ static struct cpufreq_driver cppc_cpufreq_driver = { static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_cpudata *cpu_data; u64 desired_perf; int ret; + if (!policy) + return -ENODEV; + + cpu_data = policy->driver_data; + cpufreq_cpu_put(policy); ret = cppc_get_desired_perf(cpu, &desired_perf); -- Gitee From 8870795b0e7f19fa8712d7a3f6c64f8e6bc06aac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 21 Aug 2025 15:15:57 +0800 Subject: [PATCH 26/78] cpufreq: cppc: Fix invalid return value in .get() callback mainline inclusion from mainline-v6.16-rc1 commit 2b8e6b58889c672e1ae3601d9b2b070be4dc2fbc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2b8e6b58889c672e1ae3601d9b2b070be4dc2fbc ---------------------------------------------------------------------- Returning a negative error code in a function with an unsigned return type is a pretty bad idea. It is probably worse when the justification for the change is "our static analisys tool found it". Fixes: cf7de25878a1 ("cppc_cpufreq: Fix possible null pointer dereference") Signed-off-by: Marc Zyngier Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Reviewed-by: Lifeng Zheng Signed-off-by: Viresh Kumar Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e7a078878d2a..97d30f0be14e 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -684,7 +684,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) int ret; if (!policy) - return -ENODEV; + return 0; cpu_data = policy->driver_data; -- Gitee From 0105755fb4234a3cf16923f39bd6f019e8510d08 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:15:58 +0800 Subject: [PATCH 27/78] cpufreq: cppc: Fix invalid return value in hisi_cppc_cpufreq_get_rate() driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Returning a negative error code in a function with an unsigned return type is a pretty bad idea. Return 0 is enough when something wrong. Fixes: cf7de25878a1 ("cppc_cpufreq: Fix possible null pointer dereference") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 97d30f0be14e..c3155f546c82 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -876,7 +876,7 @@ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) int ret; if (!policy) - return -ENODEV; + return 0; cpu_data = policy->driver_data; @@ -884,7 +884,7 @@ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) ret = cppc_get_desired_perf(cpu, &desired_perf); if (ret < 0) - return -EIO; + return 0; return cppc_cpufreq_perf_to_khz(cpu_data, desired_perf); } -- Gitee From e3aa26daaf7b50f2ba8ce0e5f0a6e8c04705011a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Mar 2021 16:54:03 +0100 Subject: [PATCH 28/78] ACPI: CPPC: Add emtpy stubs of functions for CONFIG_ACPI_CPPC_LIB unset mainline inclusion from mainline-v5.13-rc1 commit 8a02d99876362f35bc918097440445de18e3c47c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=8a02d99876362f35bc918097440445de18e3c47c ---------------------------------------------------------------------- For convenience, add empty stubs of library functions defined in cppc_acpi.c for the CONFIG_ACPI_CPPC_LIB unset case. Because one of them needs to return CPUFREQ_ETERNAL, include linux/cpufreq.h into the CPPC library header file and drop the direct inclusion of it from cppc_acpi.c. Signed-off-by: Rafael J. Wysocki Tested-by: Chen Yu Signed-off-by: huwentao --- drivers/acpi/cppc_acpi.c | 1 - include/acpi/cppc_acpi.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 0ba9838ce64b..7c4b9f63ae1e 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -33,7 +33,6 @@ #define pr_fmt(fmt) "ACPI CPPC: " fmt -#include #include #include #include diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index e20a92ee5493..54c5d21e2827 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -11,6 +11,7 @@ #define _CPPC_ACPI_H #include +#include #include #include @@ -132,6 +133,7 @@ struct cppc_cpudata { cpumask_var_t shared_cpu_map; }; +#ifdef CONFIG_ACPI_CPPC_LIB extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); @@ -148,5 +150,43 @@ extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); +#else /* !CONFIG_ACPI_CPPC_LIB */ +static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) +{ + return -ENOTSUPP; +} +static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) +{ + return -ENOTSUPP; +} +static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +{ + return -ENOTSUPP; +} +static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) +{ + return -ENOTSUPP; +} +static inline bool acpi_cpc_valid(void) +{ + return false; +} +static inline unsigned int cppc_get_transition_latency(int cpu) +{ + return CPUFREQ_ETERNAL; +} +static inline bool cpc_ffh_supported(void) +{ + return false; +} +static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + return -ENOTSUPP; +} +static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + return -ENOTSUPP; +} +#endif /* !CONFIG_ACPI_CPPC_LIB */ #endif /* _CPPC_ACPI_H*/ -- Gitee From cdac1221d33cf146a453ce5d66d480d38f8b5e5b Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Mon, 12 Sep 2022 15:37:22 -0500 Subject: [PATCH 29/78] ACPI: CPPC: Disable FIE if registers in PCC regions mainline inclusion from mainline-v6.1-rc1 commit ae2df912d1a557a3548be83da20851ac55f42ab3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=ae2df912d1a557a3548be83da20851ac55f42ab3 ---------------------------------------------------------------------- PCC regions utilize a mailbox to set/retrieve register values used by the CPPC code. This is fine as long as the operations are infrequent. With the FIE code enabled though the overhead can range from 2-11% of system CPU overhead (ex: as measured by top) on Arm based machines. So, before enabling FIE assure none of the registers used by cppc_get_perf_ctrs() are in the PCC region. Finally, add a module parameter which can override the PCC region detection at boot or module reload. Signed-off-by: Jeremy Linton Acked-by: Viresh Kumar Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/acpi/cppc_acpi.c | 42 ++++++++++++++++++++++++++++++++++ drivers/cpufreq/cppc_cpufreq.c | 25 ++++++++++++++++---- include/acpi/cppc_acpi.h | 5 ++++ 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 7c4b9f63ae1e..c2477bd58efc 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1404,6 +1404,48 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) } EXPORT_SYMBOL_GPL(cppc_get_perf_caps); +/** + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. + * + * CPPC has flexibility about how CPU performance counters are accessed. + * One of the choices is PCC regions, which can have a high access latency. This + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. + * + * Return: true if any of the counters are in PCC regions, false otherwise + */ +bool cppc_perf_ctrs_in_pcc(void) +{ + int cpu; + + for_each_present_cpu(cpu) { + struct cpc_register_resource *ref_perf_reg; + struct cpc_desc *cpc_desc; + + cpc_desc = per_cpu(cpc_desc_ptr, cpu); + + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) + return true; + + + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; + + /* + * If reference perf register is not supported then we should + * use the nominal perf value + */ + if (!CPC_SUPPORTED(ref_perf_reg)) + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; + + if (CPC_IN_PCC(ref_perf_reg)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); + /** * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. * @cpunum: CPU from which to read counters. diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index c3155f546c82..8274f8dd1054 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -67,7 +67,15 @@ struct fb_ctr_pair { struct cppc_perf_fb_ctrs fb_ctrs_t1; }; +static enum { + FIE_UNSET = -1, + FIE_ENABLED, + FIE_DISABLED +} fie_disabled = FIE_UNSET; + #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE +module_param(fie_disabled, int, 0444); +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); /* Frequency invariance support */ struct cppc_freq_invariance { @@ -174,7 +182,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) struct cppc_freq_invariance *cppc_fi; int cpu, ret; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; for_each_cpu(cpu, policy->cpus) { @@ -213,7 +221,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) struct cppc_freq_invariance *cppc_fi; int cpu; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; /* policy->cpus will be empty here, use related_cpus instead */ @@ -243,7 +251,15 @@ static void __init cppc_freq_invariance_init(void) }; int ret; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled != FIE_ENABLED && fie_disabled != FIE_DISABLED) { + fie_disabled = FIE_ENABLED; + if (cppc_perf_ctrs_in_pcc()) { + pr_info("FIE not enabled on systems with registers in PCC\n"); + fie_disabled = FIE_DISABLED; + } + } + + if (fie_disabled) return; kworker_fie = kthread_create_worker(0, "cppc_fie"); @@ -261,7 +277,7 @@ static void __init cppc_freq_invariance_init(void) static void cppc_freq_invariance_exit(void) { - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; kthread_destroy_worker(kworker_fie); @@ -905,6 +921,7 @@ static void cppc_check_hisi_workaround(void) wa_info[i].oem_revision == tbl->oem_revision) { /* Overwrite the get() callback */ cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; + fie_disabled = FIE_DISABLED; break; } } diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 54c5d21e2827..f1532779ba16 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -144,6 +144,7 @@ extern int cppc_set_auto_sel(int cpu, bool enable); extern int cppc_set_epp(int cpu, u64 epp_val); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); +extern bool cppc_perf_ctrs_in_pcc(void); extern bool acpi_cpc_valid(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); @@ -167,6 +168,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) { return -ENOTSUPP; } +static inline bool cppc_perf_ctrs_in_pcc(void) +{ + return false; +} static inline bool acpi_cpc_valid(void) { return false; -- Gitee From 58d98182c760d754a2d9410bf8fe4f8514462d1b Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 15 Aug 2021 21:07:29 +0800 Subject: [PATCH 30/78] cpufreq: remove useless INIT_LIST_HEAD() mainline inclusion from mainline-v5.16-rc1 commit 6065a672679f0502ede024a03db82c03534a5e00 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=6065a672679f0502ede024a03db82c03534a5e00 ---------------------------------------------------------------------- list cpu_data_list has been inited staticly through LIST_HEAD, so there's no need to call another INIT_LIST_HEAD. Simply remove it from cppc_cpufreq_init. Signed-off-by: Han Wang Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 8274f8dd1054..f3cf3cdfbe3f 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -936,8 +936,6 @@ static int __init cppc_cpufreq_init(void) if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; - INIT_LIST_HEAD(&cpu_data_list); - cppc_check_hisi_workaround(); cppc_freq_invariance_init(); -- Gitee From 086d7de26ef7cf5baec565927b3d2f656435851d Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 25 Apr 2022 14:38:07 +0200 Subject: [PATCH 31/78] cpufreq: CPPC: Add per_cpu efficiency_class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.19-rc1 commit d3c3db41df7e1bdefc9c68073070b62ce3b260bd category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=d3c3db41df7e1bdefc9c68073070b62ce3b260bd ---------------------------------------------------------------------- In ACPI, describing power efficiency of CPUs can be done through the following arm specific field: ACPI 6.4, s5.2.12.14 'GIC CPU Interface (GICC) Structure', 'Processor Power Efficiency Class field': Describes the relative power efficiency of the associated pro- cessor. Lower efficiency class numbers are more efficient than higher ones (e.g. efficiency class 0 should be treated as more efficient than efficiency class 1). However, absolute values of this number have no meaning: 2 isn’t necessarily half as efficient as 1. The efficiency_class field is stored in the GicC structure of the ACPI MADT table and it's currently supported in Linux for arm64 only. Thus, this new functionality is introduced for arm64 only. To allow the cppc_cpufreq driver to know and preprocess the efficiency_class values of all the CPUs, add a per_cpu efficiency_class variable to store them. At least 2 different efficiency classes must be present, otherwise there is no use in creating an Energy Model. The efficiency_class values are squeezed in [0:#efficiency_class-1] while conserving the order. For instance, efficiency classes of: [111, 212, 250] will be mapped to: [0 (was 111), 1 (was 212), 2 (was 250)]. Each policy being independently registered in the driver, populating the per_cpu efficiency_class is done only once at the driver initialization. This prevents from having each policy re-searching the efficiency_class values of other CPUs. The EM will be registered in a following patch. The patch also exports acpi_cpu_get_madt_gicc() to fetch the GicC structure of the ACPI MADT table for each CPU. Acked-by: Catalin Marinas Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- arch/arm64/kernel/smp.c | 1 + drivers/cpufreq/cppc_cpufreq.c | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 23707812f87a..84704529569b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -574,6 +574,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index f3cf3cdfbe3f..d525c3c92def 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -472,12 +472,53 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return delay_us; } +static DEFINE_PER_CPU(unsigned int, efficiency_class); + +static int populate_efficiency_class(void) +{ + struct acpi_madt_generic_interrupt *gicc; + DECLARE_BITMAP(used_classes, 256) = {}; + int class, cpu, index; + + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + class = gicc->efficiency_class; + bitmap_set(used_classes, class, 1); + } + + if (bitmap_weight(used_classes, 256) <= 1) { + pr_debug("Efficiency classes are all equal (=%d). " + "No EM registered", class); + return -EINVAL; + } + + /* + * Squeeze efficiency class values on [0:#efficiency_class-1]. + * Values are per spec in [0:255]. + */ + index = 0; + for_each_set_bit(class, used_classes, 256) { + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + if (gicc->efficiency_class == class) + per_cpu(efficiency_class, cpu) = index; + } + index++; + } + + return 0; +} + #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; } +static int populate_efficiency_class(void) +{ + return 0; +} #endif @@ -938,6 +979,7 @@ static int __init cppc_cpufreq_init(void) cppc_check_hisi_workaround(); cppc_freq_invariance_init(); + populate_efficiency_class(); ret = cpufreq_register_driver(&cppc_cpufreq_driver); if (ret) -- Gitee From baa73831b23d86cf55e42d8b2dc4091ebb6439ed Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 21 Mar 2022 09:57:22 +0000 Subject: [PATCH 32/78] PM: EM: Add .get_cost() callback mainline inclusion from mainline-v5.19-rc1 commit bdc21a4d286c6917fe966fd765de99411095b1b4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=bdc21a4d286c6917fe966fd765de99411095b1b4 ---------------------------------------------------------------------- The Energy Model (EM) supports devices which report abstract power scale, not only real Watts. The primary goal for EM is to enable the Energy Aware Scheduler (EAS) for a given platform. Some of the platforms might not be able to deliver proper power values. The only information that they might have is the relative efficiency between CPU types. Thus, it makes sense to remove some restrictions in the EM framework and introduce a mechanism which would support those platforms. What is crucial for EAS to operate is the 'cost' field in the EM. The 'cost' is calculated internally in EM framework based on knowledge from 'power' values. The 'cost' values must be strictly increasing. The existing API with its 'power' value size restrictions cannot guarantee that the 'cost' will meet this requirement. Since the platform is missing this detailed information, but has only efficiency details, introduce a new custom callback in the EM framework. The new callback would allow to provide the 'cost' values which reflect efficiency of the CPUs. This would allow to provide EAS information which has different relation than what would be forced by the EM internal formulas calculating 'cost' values. Thanks to this new callback it is possible to create a system view for EAS which has no overlapping performance states across many Performance Domains. Signed-off-by: Lukasz Luba Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- include/linux/energy_model.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 5f04a2b35e80..97b0c9c1c31c 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -89,8 +89,29 @@ struct em_data_callback { */ int (*active_power)(unsigned long *power, unsigned long *freq, struct device *dev); + + /** + * get_cost() - Provide the cost at the given performance state of + * a device + * @dev : Device for which we do this operation (can be a CPU) + * @freq : Frequency at the performance state in kHz + * @cost : The cost value for the performance state + * (modified) + * + * In case of CPUs, the cost is the one of a single CPU in the domain. + * It is expected to fit in the [0, EM_MAX_POWER] range due to internal + * usage in EAS calculation. + * + * Return 0 on success, or appropriate error value in case of failure. + */ + int (*get_cost)(struct device *dev, unsigned long freq, + unsigned long *cost); }; -#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) \ + { .active_power = _active_power_cb, \ + .get_cost = _cost_cb } +#define EM_DATA_CB(_active_power_cb) \ + EM_ADV_DATA_CB(_active_power_cb, NULL) struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); @@ -198,6 +219,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) #else struct em_data_callback {}; +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } #define EM_DATA_CB(_active_power_cb) { } static inline -- Gitee From 7e43efae588111f0f913f6470fd9f7950afc4865 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Aug 2021 12:04:33 +0530 Subject: [PATCH 33/78] cpufreq: Add callback to register with energy model mainline inclusion from mainline-v5.15-rc1 commit c17495b01b72b53bd290f442d39b060e015c7aea category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=c17495b01b72b53bd290f442d39b060e015c7aea ---------------------------------------------------------------------- Many cpufreq drivers register with the energy model for each policy and do exactly the same thing. Follow the footsteps of thermal-cooling, to get it done from the cpufreq core itself. Provide a new callback, which will be called, if present, by the cpufreq core at the right moment (more on that in the code's comment). Also provide a generic implementation that uses dev_pm_opp_of_register_em(). This also allows us to register with the EM at a later point of time, compared to ->init(), from where the EM core can access cpufreq policy directly using cpufreq_cpu_get() type of helpers and perform other work, like marking few frequencies inefficient, this will be done separately. Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 13 +++++++++++++ include/linux/cpufreq.h | 14 ++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 20cd91468d92..dfcb6fc75758 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1536,6 +1536,19 @@ static int cpufreq_online(unsigned int cpu) write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); write_unlock_irqrestore(&cpufreq_driver_lock, flags); + + /* + * Register with the energy model before + * sched_cpufreq_governor_change() is called, which will result + * in rebuilding of the sched domains, which should only be done + * once the energy model is properly initialized for the policy + * first. + * + * Also, this should be called before the policy is registered + * with cooling framework. + */ + if (cpufreq_driver->register_em) + cpufreq_driver->register_em(policy); } ret = cpufreq_init_policy(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d4a0374617aa..547cddaab2fb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -9,10 +9,12 @@ #define _LINUX_CPUFREQ_H #include +#include #include #include #include #include +#include #include #include #include @@ -385,6 +387,12 @@ struct cpufreq_driver { /* platform specific boost support code */ bool boost_enabled; int (*set_boost)(struct cpufreq_policy *policy, int state); + + /* + * Set by drivers that want to register with the energy model after the + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); }; /* flags */ @@ -1072,4 +1080,10 @@ unsigned int cpufreq_generic_get(unsigned int cpu); void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy) +{ + dev_pm_opp_of_register_em(get_cpu_device(policy->cpu), + policy->related_cpus); +} #endif /* _LINUX_CPUFREQ_H */ -- Gitee From 016a61b58eb346176d47d66ce263dd00334b26ea Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 21 Mar 2022 09:57:25 +0000 Subject: [PATCH 34/78] PM: EM: Change the order of arguments in the .active_power() callback mainline inclusion from mainline-v5.19-rc1 commit 75a3a99a5a9886af13be44e640cb415ebda80db2 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=75a3a99a5a9886af13be44e640cb415ebda80db2 ---------------------------------------------------------------------- The .active_power() callback passes the device pointer when it's called. Aligned with a convetion present in other subsystems and pass the 'dev' as a first argument. It looks more cleaner. Adjust all affected drivers which implement that API callback. Suggested-by: Ionela Voinescu Signed-off-by: Lukasz Luba Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- Documentation/power/energy-model.rst | 4 ++-- drivers/cpufreq/scmi-cpufreq.c | 4 ++-- drivers/opp/of.c | 4 ++-- include/linux/energy_model.h | 6 +++--- kernel/power/energy_model.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index a6fb986abe3c..5f03b6f47d9e 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -117,8 +117,8 @@ EM framework:: -> drivers/cpufreq/foo_cpufreq.c - 01 static int est_power(unsigned long *mW, unsigned long *KHz, - 02 struct device *dev) + 01 static int est_power(struct device *dev, unsigned long *mW, + 02 unsigned long *KHz) 03 { 04 long freq, power; 05 diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index bb1389f276d7..4b86b074a874 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -96,8 +96,8 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) } static int __maybe_unused -scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, - struct device *cpu_dev) +scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power, + unsigned long *KHz) { unsigned long Hz; int ret, domain; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 3d7adc0de128..6873085e11e0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1240,8 +1240,8 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); * Returns -EINVAL if the power calculation failed because of missing * parameters, 0 otherwise. */ -static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz, - struct device *dev) +static int __maybe_unused _get_power(struct device *dev, unsigned long *mW, + unsigned long *kHz) { struct dev_pm_opp *opp; struct device_node *np; diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 97b0c9c1c31c..47506c394841 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -71,11 +71,11 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device + * @dev : Device for which we do this operation (can be a CPU) * @power : Active power at the performance state in mW * (modified) * @freq : Frequency at the performance state in kHz * (modified) - * @dev : Device for which we do this operation (can be a CPU) * * active_power() must find the lowest performance state of 'dev' above * 'freq' and update 'power' and 'freq' to the matching active power @@ -87,8 +87,8 @@ struct em_data_callback { * * Return 0 on success. */ - int (*active_power)(unsigned long *power, unsigned long *freq, - struct device *dev); + int (*active_power)(struct device *dev, unsigned long *power, + unsigned long *freq); /** * get_cost() - Provide the cost at the given performance state of diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 334173fe6940..0dab668f8c9e 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -107,7 +107,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); -- Gitee From 74b5fe4a6b0cf770ccb4bb11ff5224aaf1317432 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 25 Apr 2022 14:38:08 +0200 Subject: [PATCH 35/78] cpufreq: CPPC: Register EM based on efficiency class information mainline inclusion from mainline-v5.19-rc1 commit 740fcdc2c20ecf855b36b919d7fa1b872b5a7eae category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=740fcdc2c20ecf855b36b919d7fa1b872b5a7eae ---------------------------------------------------------------------- Performance states and energy consumption values are not advertised in ACPI. In the GicC structure of the MADT table, the "Processor Power Efficiency Class field" (called efficiency class from now) allows to describe the relative energy efficiency of CPUs. To leverage the EM and EAS, the CPPC driver creates a set of artificial performance states and registers them in the Energy Model (EM), such as: - Every 20 capacity unit, a performance state is created. - The energy cost of each performance state gradually increases. No power value is generated as only the cost is used in the EM. During task placement, a task can raise the frequency of its whole pd. This can make EAS place a task on a pd with CPUs that are individually less energy efficient. As cost values are artificial, and to place tasks on CPUs with the lower efficiency class, a gap in cost values is generated for adjacent efficiency classes. E.g.: - efficiency class = 0, capacity is in [0-1024], so cost values are in [0: 51] (one performance state every 20 capacity unit) - efficiency class = 1, capacity is in [0-1024], cost values are in [1*gap+0: 1*gap+51]. The value of the cost gap is chosen to absorb a the energy of 4 CPUs at their maximum capacity. This means that between: 1- a pd of 4 CPUs, each of them being used at almost their full capacity. Their efficiency class is N. 2- a CPU using almost none of its capacity. Its efficiency class is N+1 EAS will choose the first option. This patch also populates the (struct cpufreq_driver).register_em callback if the valid efficiency_class ACPI values are provided. Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/base/arch_topology.c | 1 + drivers/cpufreq/cppc_cpufreq.c | 144 +++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3e7d15e25c22..7f947e5734c2 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -134,6 +134,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale); void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) { diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index d525c3c92def..e358178d26ef 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -473,6 +473,134 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) } static DEFINE_PER_CPU(unsigned int, efficiency_class); +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy); + +/* Create an artificial performance state every CPPC_EM_CAP_STEP capacity unit. */ +#define CPPC_EM_CAP_STEP (20) +/* Increase the cost value by CPPC_EM_COST_STEP every performance state. */ +#define CPPC_EM_COST_STEP (1) +/* Add a cost gap correspnding to the energy of 4 CPUs. */ +#define CPPC_EM_COST_GAP (4 * SCHED_CAPACITY_SCALE * CPPC_EM_COST_STEP \ + / CPPC_EM_CAP_STEP) + +static unsigned int get_perf_level_count(struct cpufreq_policy *policy) +{ + struct cppc_perf_caps *perf_caps; + unsigned int min_cap, max_cap; + struct cppc_cpudata *cpu_data; + int cpu = policy->cpu; + + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, perf_caps->highest_perf); + if ((min_cap == 0) || (max_cap < min_cap)) + return 0; + return 1 + max_cap / CPPC_EM_CAP_STEP - min_cap / CPPC_EM_CAP_STEP; +} + +/* + * The cost is defined as: + * cost = power * max_frequency / frequency + */ +static inline unsigned long compute_cost(int cpu, int step) +{ + return CPPC_EM_COST_GAP * per_cpu(efficiency_class, cpu) + + step * CPPC_EM_COST_STEP; +} + +static int cppc_get_cpu_power(struct device *cpu_dev, + unsigned long *power, unsigned long *KHz) +{ + unsigned long perf_step, perf_prev, perf, perf_check; + unsigned int min_step, max_step, step, step_check; + unsigned long prev_freq = *KHz; + unsigned int min_cap, max_cap; + struct cpufreq_policy *policy; + + struct cppc_perf_caps *perf_caps; + struct cppc_cpudata *cpu_data; + + policy = cpufreq_cpu_get_raw(cpu_dev->id); + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, + perf_caps->highest_perf); + + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + min_step = min_cap / CPPC_EM_CAP_STEP; + max_step = max_cap / CPPC_EM_CAP_STEP; + + perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step = perf_prev / perf_step; + + if (step > max_step) + return -EINVAL; + + if (min_step == max_step) { + step = max_step; + perf = perf_caps->highest_perf; + } else if (step < min_step) { + step = min_step; + perf = perf_caps->lowest_perf; + } else { + step++; + if (step == max_step) + perf = perf_caps->highest_perf; + else + perf = step * perf_step; + } + + *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); + perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step_check = perf_check / perf_step; + + /* + * To avoid bad integer approximation, check that new frequency value + * increased and that the new frequency will be converted to the + * desired step value. + */ + while ((*KHz == prev_freq) || (step_check != step)) { + perf++; + *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); + perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step_check = perf_check / perf_step; + } + + /* + * With an artificial EM, only the cost value is used. Still the power + * is populated such as 0 < power < EM_MAX_POWER. This allows to add + * more sense to the artificial performance states. + */ + *power = compute_cost(cpu_dev->id, step); + + return 0; +} + +static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, + unsigned long *cost) +{ + unsigned long perf_step, perf_prev; + struct cppc_perf_caps *perf_caps; + struct cpufreq_policy *policy; + struct cppc_cpudata *cpu_data; + unsigned int max_cap; + int step; + + policy = cpufreq_cpu_get_raw(cpu_dev->id); + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + + perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, KHz); + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + step = perf_prev / perf_step; + + *cost = compute_cost(cpu_dev->id, step); + + return 0; +} static int populate_efficiency_class(void) { @@ -505,10 +633,23 @@ static int populate_efficiency_class(void) } index++; } + cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; return 0; } +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data; + struct em_data_callback em_cb = + EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); + + cpu_data = policy->driver_data; + em_dev_register_perf_domain(get_cpu_device(policy->cpu), + get_perf_level_count(policy), &em_cb, + cpu_data->shared_cpu_map); +} + #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) @@ -519,6 +660,9 @@ static int populate_efficiency_class(void) { return 0; } +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ +} #endif -- Gitee From a0a7222d8d215b33ef10138554c86abc84529cb9 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:15:59 +0800 Subject: [PATCH 36/78] cpufreq: CPPC: Remove forward declaration of hisi_cppc_cpufreq_get_rate() driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- After commit ae2df912d1a5 ("ACPI: CPPC: Disable FIE if registers in PCC regions"), the only place uses hisi_cppc_cpufreq_get_rate() is cppc_check_hisi_workaround(), which is after the implementation of hisi_cppc_cpufreq_get_rate(). A forward declaration of hisi_cppc_cpufreq_get_rate() is unnecessarily. Fixes: ae2df912d1a5 ("ACPI: CPPC: Disable FIE if registers in PCC regions") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e358178d26ef..238c1c75f2f2 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -90,7 +90,6 @@ static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); static struct kthread_worker *kworker_fie; static struct cpufreq_driver cppc_cpufreq_driver; -static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu); static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, struct cppc_perf_fb_ctrs *fb_ctrs_t0, struct cppc_perf_fb_ctrs *fb_ctrs_t1); -- Gitee From 09958e2ec0014b0b9852ee331d11b1ab7f8220b6 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:00 +0800 Subject: [PATCH 37/78] cpufreq: CPPC: Remove cpu_data_list mainline inclusion from mainline-v6.17-rc2 commit d80a75624051b817043431f847470fb4680f2582 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d80a75624051b817043431f847470fb4680f2582 ---------------------------------------------------------------------- After commit a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list"), cpu_data can be got from policy->driver_data, so cpu_data_list is not actually needed and can be removed. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-2-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 25 ------------------------- include/acpi/cppc_acpi.h | 1 - 2 files changed, 26 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 238c1c75f2f2..e2de648951d2 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -33,14 +33,6 @@ /* Offest in the DMI processor structure for the max frequency */ #define DMI_PROCESSOR_MAX_SPEED 0x14 -/* - * This list contains information parsed from per CPU ACPI _CPC and _PSD - * structures: e.g. the highest and lowest supported performance, capabilities, - * desired performance, level requested etc. Depending on the share_type, not - * all CPUs will have an entry in the list. - */ -static LIST_HEAD(cpu_data_list); - static bool boost_supported; struct cppc_workaround_oem_info { @@ -693,8 +685,6 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) cpu_data->perf_caps.lowest_freq *= 1000; cpu_data->perf_caps.nominal_freq *= 1000; - list_add(&cpu_data->node, &cpu_data_list); - return cpu_data; free_mask: @@ -709,7 +699,6 @@ static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) { struct cppc_cpudata *cpu_data = policy->driver_data; - list_del(&cpu_data->node); free_cpumask_var(cpu_data->shared_cpu_map); kfree(cpu_data); policy->driver_data = NULL; @@ -1131,24 +1120,10 @@ static int __init cppc_cpufreq_init(void) return ret; } -static inline void free_cpu_data(void) -{ - struct cppc_cpudata *iter, *tmp; - - list_for_each_entry_safe(iter, tmp, &cpu_data_list, node) { - free_cpumask_var(iter->shared_cpu_map); - list_del(&iter->node); - kfree(iter); - } - -} - static void __exit cppc_cpufreq_exit(void) { cpufreq_unregister_driver(&cppc_cpufreq_driver); cppc_freq_invariance_exit(); - - free_cpu_data(); } module_exit(cppc_cpufreq_exit); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index f1532779ba16..5d24633e3177 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -125,7 +125,6 @@ struct cppc_perf_fb_ctrs { /* Per CPU container for runtime CPPC management. */ struct cppc_cpudata { - struct list_head node; struct cppc_perf_caps perf_caps; struct cppc_perf_ctrls perf_ctrls; struct cppc_perf_fb_ctrs perf_fb_ctrs; -- Gitee From 16ff035285a16e431a4079770f022acc646f9d98 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:01 +0800 Subject: [PATCH 38/78] cpufreq: CPPC: Do not return a value from populate_efficiency_class() mainline inclusion from mainline-v6.17-rc2 commit 3d5978ea6cbc4df192d0ea1800ef5d55b28b965e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3d5978ea6cbc4df192d0ea1800ef5d55b28b965e ---------------------------------------------------------------------- The return value of populate_efficiency_class() is never needed and the result of it doesn't affect the initialization of cppc_cpufreq. It makes more sense to change it into a void function. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-3-zhenglifeng1@huawei.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e2de648951d2..6b19a66b29db 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -593,7 +593,7 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, return 0; } -static int populate_efficiency_class(void) +static void populate_efficiency_class(void) { struct acpi_madt_generic_interrupt *gicc; DECLARE_BITMAP(used_classes, 256) = {}; @@ -608,7 +608,7 @@ static int populate_efficiency_class(void) if (bitmap_weight(used_classes, 256) <= 1) { pr_debug("Efficiency classes are all equal (=%d). " "No EM registered", class); - return -EINVAL; + return; } /* @@ -625,8 +625,6 @@ static int populate_efficiency_class(void) index++; } cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; - - return 0; } static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) @@ -647,9 +645,8 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; } -static int populate_efficiency_class(void) +static void populate_efficiency_class(void) { - return 0; } static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) { -- Gitee From 0c1b694fdf30dff6f314d483a2f3f7db295d65e5 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:02 +0800 Subject: [PATCH 39/78] cpufreq: CPPC: Remove forward declaration of cppc_cpufreq_register_em() mainline inclusion from mainline-v6.17-rc2 commit c83a92df2fc60bf0b3130cdf0bc2104d61750317 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c83a92df2fc60bf0b3130cdf0bc2104d61750317 ---------------------------------------------------------------------- cppc_cpufreq_register_em() is only used in populate_efficiency_class(). A forward declaration of it is not necessary. Move cppc_cpufreq_register_em() in front of populate_efficiency_class() and remove the forward declaration of cppc_cpufreq_register_em(). No functional change. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-4-zhenglifeng1@huawei.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 6b19a66b29db..9ad7d2ac2049 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -464,7 +464,6 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) } static DEFINE_PER_CPU(unsigned int, efficiency_class); -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy); /* Create an artificial performance state every CPPC_EM_CAP_STEP capacity unit. */ #define CPPC_EM_CAP_STEP (20) @@ -593,6 +592,18 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, return 0; } +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data; + struct em_data_callback em_cb = + EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); + + cpu_data = policy->driver_data; + em_dev_register_perf_domain(get_cpu_device(policy->cpu), + get_perf_level_count(policy), &em_cb, + cpu_data->shared_cpu_map); +} + static void populate_efficiency_class(void) { struct acpi_madt_generic_interrupt *gicc; @@ -627,18 +638,6 @@ static void populate_efficiency_class(void) cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; } -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) -{ - struct cppc_cpudata *cpu_data; - struct em_data_callback em_cb = - EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); - - cpu_data = policy->driver_data; - em_dev_register_perf_domain(get_cpu_device(policy->cpu), - get_perf_level_count(policy), &em_cb, - cpu_data->shared_cpu_map); -} - #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) -- Gitee From 64a82d0ce17d2a8cafab08289cfb585c031d6427 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:03 +0800 Subject: [PATCH 40/78] cpufreq: Contain scaling_cur_freq.attr in cpufreq_attrs mainline inclusion from mainline-v6.17-rc2 commit 2e554cfa259fe07085a4fcff7d2ec4b7041bbd9c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e554cfa259fe07085a4fcff7d2ec4b7041bbd9c ---------------------------------------------------------------------- After commit c034b02e213d ("cpufreq: expose scaling_cur_freq sysfs file for set_policy() drivers"), the file scaling_cur_freq is exposed to all drivers. No need to create this file separately. It's better to be contained in cpufreq_attrs. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250623133402.3120230-4-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index dfcb6fc75758..d4c6196d0b45 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -965,6 +965,7 @@ static struct attribute *default_attrs[] = { &cpuinfo_min_freq.attr, &cpuinfo_max_freq.attr, &cpuinfo_transition_latency.attr, + &scaling_cur_freq.attr, &scaling_min_freq.attr, &scaling_max_freq.attr, &affected_cpus.attr, @@ -1083,10 +1084,6 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } - ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); - if (ret) - return ret; - if (cpufreq_driver->bios_limit) { ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); if (ret) -- Gitee From 0d0db8ce66a3d2750e1fb61c3858b8641bff31bb Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Tue, 6 Apr 2021 15:02:33 +0800 Subject: [PATCH 41/78] docs/zh_CN: add cpu-freq cpu-drivers.rst translation mainline inclusion from mainline-v5.13-rc1 commit 8b6d5ae8a9968c0a3807587d33b5952cdf71de1b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=8b6d5ae8a9968c0a3807587d33b5952cdf71de1b ---------------------------------------------------------------------- This patch translates Documention/cpu-freq/cpu-drivers.rst into Chinese. Signed-off-by: Yanteng Si Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20210406070239.19910-3-siyanteng@loongson.cn Signed-off-by: Jonathan Corbet Signed-off-by: huwentao --- .../zh_CN/cpu-freq/cpu-drivers.rst | 259 ++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst new file mode 100644 index 000000000000..0ca2cb646666 --- /dev/null +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -0,0 +1,259 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: :doc:`../../../cpu-freq/cpu-drivers` +:Translator: Yanteng Si + +.. _cn_cpu-drivers.rst: + + +======================================= +如何实现一个新的CPUFreq处理器驱动程序? +======================================= + +作者: + + + - Dominik Brodowski + - Rafael J. Wysocki + - Viresh Kumar + +.. Contents + + 1. 怎么做? + 1.1 初始化 + 1.2 Per-CPU 初始化 + 1.3 验证 + 1.4 target/target_index 或 setpolicy? + 1.5 target/target_index + 1.6 setpolicy + 1.7 get_intermediate 与 target_intermediate + 2. 频率表助手 + + + +1. 怎么做? +=========== + +如此,你刚刚得到了一个全新的CPU/芯片组及其数据手册,并希望为这个CPU/芯片组添加cpufreq +支持?很好,这里有一些至关重要的提示: + + +1.1 初始化 +---------- + +首先,在__initcall_level_7 (module_init())或更靠后的函数中检查这个内核是否 +运行在正确的CPU和正确的芯片组上。如果是,则使用cpufreq_register_driver()向 +CPUfreq核心层注册一个cpufreq_driver结构体。 + +结构体cpufreq_driver应该包含什么成员? + + .name - 驱动的名字。 + + .init - 一个指向per-policy初始化函数的指针。 + + .verify - 一个指向"verification"函数的指针。 + + .setpolicy 或 .fast_switch 或 .target 或 .target_index - 差异见 + 下文。 + +并且可选择 + + .flags - cpufreq核的提示。 + + .driver_data - cpufreq驱动程序的特定数据。 + + .resolve_freq - 返回最适合目标频率的频率。不过并不能改变频率。 + + .get_intermediate 和 target_intermediate - 用于在改变CPU频率时切换到稳定 + 的频率。 + + .get - 返回CPU的当前频率。 + + .bios_limit - 返回HW/BIOS对CPU的最大频率限制值。 + + .exit - 一个指向per-policy清理函数的指针,该函数在cpu热插拔过程的CPU_POST_DEAD + 阶段被调用。 + + .stop_cpu - 一个指向per-policy停止函数的指针,该函数在cpu热插拔过程的CPU_DOWN_PREPARE + 阶段被调用。 + + .suspend - 一个指向per-policy暂停函数的指针,该函数在关中断且在该策略的调节器停止 + 后被调用。 + + .resume - 一个指向per-policy恢复函数的指针,该函数在关中断且在调节器再一次开始前被 + 调用。 + + .ready - 一个指向per-policy准备函数的指针,该函数在策略完全初始化之后被调用。 + + .attr - 一个指向NULL结尾的"struct freq_attr"列表的指针,该函数允许导出值到 + sysfs。 + + .boost_enabled - 如果设置,则启用提升(boost)频率。 + + .set_boost - 一个指向per-policy函数的指针,该函数用来开启/关闭提升(boost)频率功能。 + + +1.2 Per-CPU 初始化 +------------------ + +每当一个新的CPU被注册到设备模型中,或者在cpufreq驱动注册自己之后,如果此CPU的cpufreq策 +略不存在,则会调用per-policy的初始化函数cpufreq_driver.init。请注意,.init()和.exit()程序 +只对策略调用一次,而不是对策略管理的每个CPU调用一次。它需要一个 ``struct cpufreq_policy +*policy`` 作为参数。现在该怎么做呢? + +如果有必要,请在你的CPU上激活CPUfreq功能支持。 + +然后,驱动程序必须填写以下数值: + ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.min_freq 和 | | +|policy->cpuinfo.max_freq | 该CPU支持的最低和最高频率(kHz) | +| | | +| | | ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.transition_latency | | +| | CPU在两个频率之间切换所需的时间,以 | +| | 纳秒为单位(如适用,否则指定 | +| | CPUFREQ_ETERNAL) | ++-----------------------------------+--------------------------------------+ +|policy->cur | 该CPU当前的工作频率(如适用) | +| | | ++-----------------------------------+--------------------------------------+ +|policy->min, | | +|policy->max, | | +|policy->policy and, if necessary, | | +|policy->governor | 必须包含该cpu的 “默认策略”。稍后 | +| | 会用这些值调用 | +| | cpufreq_driver.verify and either | +| | cpufreq_driver.setpolicy or | +| | cpufreq_driver.target/target_index | +| | | ++-----------------------------------+--------------------------------------+ +|policy->cpus | 用与这个CPU一起做DVFS的(在线+离线) | +| | CPU(即与它共享时钟/电压轨)的掩码更新 | +| | 这个 | +| | | ++-----------------------------------+--------------------------------------+ + +对于设置其中的一些值(cpuinfo.min[max]_freq, policy->min[max]),频率表助手可能会有帮 +助。关于它们的更多信息,请参见第2节。 + + +1.3 验证 +-------- + +当用户决定设置一个新的策略(由 “policy,governor,min,max组成”)时,必须对这个策略进行验证, +以便纠正不兼容的值。为了验证这些值,cpufreq_verify_within_limits(``struct cpufreq_policy +*policy``, ``unsigned int min_freq``, ``unsigned int max_freq``)函数可能会有帮助。 +关于频率表助手的详细内容请参见第2节。 + +您需要确保至少有一个有效频率(或工作范围)在 policy->min 和 policy->max 范围内。如果有必 +要,先增加policy->max,只有在没有办法的情况下,才减少policy->min。 + + +1.4 target 或 target_index 或 setpolicy 或 fast_switch? +------------------------------------------------------- + +大多数cpufreq驱动甚至大多数cpu频率升降算法只允许将CPU频率设置为预定义的固定值。对于这些,你 +可以使用->target(),->target_index()或->fast_switch()回调。 + +有些cpufreq功能的处理器可以自己在某些限制之间切换频率。这些应使用->setpolicy()回调。 + + +1.5. target/target_index +------------------------ + +target_index调用有两个参数:``struct cpufreq_policy * policy``和``unsigned int`` +索引(于列出的频率表)。 + +当调用这里时,CPUfreq驱动必须设置新的频率。实际频率必须由freq_table[index].frequency决定。 + +它应该总是在错误的情况下恢复到之前的频率(即policy->restore_freq),即使我们之前切换到中间频率。 + +已弃用 +---------- +目标调用有三个参数。``struct cpufreq_policy * policy``, unsigned int target_frequency, +unsigned int relation. + +CPUfreq驱动在调用这里时必须设置新的频率。实际的频率必须使用以下规则来确定。 + +- 紧跟 "目标频率"。 +- policy->min <= new_freq <= policy->max (这必须是有效的!!!) +- 如果 relation==CPUFREQ_REL_L,尝试选择一个高于或等于 target_freq 的 new_freq。("L代表 + 最低,但不能低于") +- 如果 relation==CPUFREQ_REL_H,尝试选择一个低于或等于 target_freq 的 new_freq。("H代表 + 最高,但不能高于") + +这里,频率表助手可能会帮助你--详见第2节。 + +1.6. fast_switch +---------------- + +这个函数用于从调度器的上下文进行频率切换。并非所有的驱动都要实现它,因为不允许在这个回调中睡眠。这 +个回调必须经过高度优化,以尽可能快地进行切换。 + +这个函数有两个参数: ``struct cpufreq_policy *policy`` 和 ``unsigned int target_frequency``。 + + +1.7 setpolicy +------------- + +setpolicy调用只需要一个``struct cpufreq_policy * policy``作为参数。需要将处理器内或芯片组内动态频 +率切换的下限设置为policy->min,上限设置为policy->max,如果支持的话,当policy->policy为 +CPUFREQ_POLICY_PERFORMANCE时选择面向性能的设置,当CPUFREQ_POLICY_POWERSAVE时选择面向省电的设置。 +也可以查看drivers/cpufreq/longrun.c中的参考实现。 + +1.8 get_intermediate 和 target_intermediate +-------------------------------------------- + +仅适用于 target_index() 和 CPUFREQ_ASYNC_NOTIFICATION 未设置的驱动。 + +get_intermediate应该返回一个平台想要切换到的稳定的中间频率,target_intermediate()应该将CPU设置为 +该频率,然后再跳转到'index'对应的频率。核心会负责发送通知,驱动不必在target_intermediate()或 +target_index()中处理。 + +在驱动程序不想因为某个目标频率切换到中间频率的情况下,它们可以从get_intermediate()中返回'0'。在这种情况 +下,核心将直接调用->target_index()。 + +注意:->target_index()应该在失败的情况下恢复到policy->restore_freq,因为core会为此发送通知。 + + +2. 频率表助手 +============= + +由于大多数cpufreq处理器只允许被设置为几个特定的频率,因此,一个带有一些函数的 “频率表”可能会辅助处理器驱动 +程序的一些工作。这样的 "频率表" 由一个cpufreq_frequency_table条目构成的数组组成,"driver_data" 中包 +含了驱动程序的具体数值,"frequency" 中包含了相应的频率,并设置了标志。在表的最后,需要添加一个 +cpufreq_frequency_table条目,频率设置为CPUFREQ_TABLE_END。而如果想跳过表中的一个条目,则将频率设置为 +CPUFREQ_ENTRY_INVALID。这些条目不需要按照任何特定的顺序排序,但如果它们是cpufreq 核心会对它们进行快速的DVFS, +因为搜索最佳匹配会更快。 + +如果策略在其policy->freq_table字段中包含一个有效的指针,cpufreq表就会被核心自动验证。 + +cpufreq_frequency_table_verify()保证至少有一个有效的频率在policy->min和policy->max范围内,并且所有其他 +标准都被满足。这对->verify调用很有帮助。 + +cpufreq_frequency_table_target()是对应于->target阶段的频率表助手。只要把数值传递给这个函数,这个函数就会返 +回包含CPU要设置的频率的频率表条目。 + +以下宏可以作为cpufreq_frequency_table的迭代器。 + +cpufreq_for_each_entry(pos, table) - 遍历频率表的所有条目。 + +cpufreq_for_each_valid_entry(pos, table) - 该函数遍历所有条目,不包括CPUFREQ_ENTRY_INVALID频率。 +使用参数 "pos"-一个``cpufreq_frequency_table * `` 作为循环变量,使用参数 "table"-作为你想迭代 +的``cpufreq_frequency_table * `` 。 + +例如:: + + struct cpufreq_frequency_table *pos, *driver_freq_table; + + cpufreq_for_each_entry(pos, driver_freq_table) { + /* Do something with pos */ + pos->frequency = ... + } + +如果你需要在driver_freq_table中处理pos的位置,不要减去指针,因为它的代价相当高。相反,使用宏 +cpufreq_for_each_entry_idx() 和 cpufreq_for_each_valid_entry_idx() 。 -- Gitee From da38a61f8a159764d9e389f7c05816bba16bb9ee Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Jun 2021 11:31:14 +0530 Subject: [PATCH 42/78] cpufreq: powernv: Migrate to ->exit() callback instead of ->stop_cpu() mainline inclusion from mainline-v5.14-rc1 commit 952da0c9ab5b047665442dc239cee36d5c9edb98 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=952da0c9ab5b047665442dc239cee36d5c9edb98 ---------------------------------------------------------------------- Commit 367dc4aa932b ("cpufreq: Add stop CPU callback to cpufreq_driver interface") added the ->stop_cpu() callback to allow the drivers to do clean up before the CPU is completely down and its state can't be modified. At that time the CPU hotplug framework used to call the cpufreq core's registered notifier for different events like CPU_DOWN_PREPARE and CPU_POST_DEAD. The ->stop_cpu() callback was called during the CPU_DOWN_PREPARE event. This is no longer the case, cpuhp_cpufreq_offline() is called only once by the CPU hotplug core now and we don't really need two separate callbacks for cpufreq drivers, i.e. ->stop_cpu() and ->exit(), as everything can be done from the ->exit() callback itself. Migrate to using the ->exit() callback instead of ->stop_cpu(). Signed-off-by: Viresh Kumar [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/powernv-cpufreq.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 8977e4de5915..6fbb46b2f6da 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -876,7 +876,15 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - /* timer is deleted in cpufreq_cpu_stop() */ + struct powernv_smp_call_data freq_data; + struct global_pstate_info *gpstates = policy->driver_data; + + freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min); + freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min); + smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); + if (gpstates) + del_timer_sync(&gpstates->timer); + kfree(policy->driver_data); return 0; @@ -1008,18 +1016,6 @@ static struct notifier_block powernv_cpufreq_opal_nb = { .priority = 0, }; -static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) -{ - struct powernv_smp_call_data freq_data; - struct global_pstate_info *gpstates = policy->driver_data; - - freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min); - freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min); - smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); - if (gpstates) - del_timer_sync(&gpstates->timer); -} - static unsigned int powernv_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { @@ -1043,7 +1039,6 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .target_index = powernv_cpufreq_target_index, .fast_switch = powernv_fast_switch, .get = powernv_cpufreq_get, - .stop_cpu = powernv_cpufreq_stop_cpu, .attr = powernv_cpu_freq_attr, }; -- Gitee From b4ec4fcd7ee01caf1737e4a4351194574345e365 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Jun 2021 18:44:46 +0200 Subject: [PATCH 43/78] cpufreq: intel_pstate: Combine ->stop_cpu() and ->offline() mainline inclusion from mainline-v5.14-rc1 commit 49d6feef94c9f47ac4030563058f8a36267597b0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=49d6feef94c9f47ac4030563058f8a36267597b0 ---------------------------------------------------------------------- Combine the ->stop_cpu() and ->offline() callback routines for intel_pstate in the active mode so as to avoid setting the ->stop_cpu callback pointer which is going to be dropped from the framework. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/intel_pstate.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0cec5148b69f..36ce05e4c983 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2490,7 +2490,7 @@ static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy) return 0; } -static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) +static int intel_cpufreq_cpu_offline(struct cpufreq_policy *policy) { struct cpudata *cpu = all_cpu_data[policy->cpu]; @@ -2535,11 +2535,11 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy) return 0; } -static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) +static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) { - pr_debug("CPU %d stopping\n", policy->cpu); - intel_pstate_clear_update_util_hook(policy->cpu); + + return intel_cpufreq_cpu_offline(policy); } static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) @@ -2612,7 +2612,6 @@ static struct cpufreq_driver intel_pstate = { .resume = intel_pstate_resume, .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, - .stop_cpu = intel_pstate_stop_cpu, .offline = intel_pstate_cpu_offline, .online = intel_pstate_cpu_online, .update_limits = intel_pstate_update_limits, @@ -2870,7 +2869,7 @@ static struct cpufreq_driver intel_cpufreq = { .fast_switch = intel_cpufreq_fast_switch, .init = intel_cpufreq_cpu_init, .exit = intel_cpufreq_cpu_exit, - .offline = intel_pstate_cpu_offline, + .offline = intel_cpufreq_cpu_offline, .online = intel_pstate_cpu_online, .suspend = intel_pstate_suspend, .resume = intel_pstate_resume, -- Gitee From 73af5520fd5aa00e474fc10ef919dcfa4712b64b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Jun 2021 09:54:42 +0530 Subject: [PATCH 44/78] cpufreq: Remove the ->stop_cpu() driver callback mainline inclusion from mainline-v5.14-rc1 commit 3e0f897fd92662f0ff21ca1759d724a9ad574858 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=3e0f897fd92662f0ff21ca1759d724a9ad574858 ---------------------------------------------------------------------- Now that all users of ->stop_cpu() have been migrated to using other callbacks, drop it from the core. Signed-off-by: Viresh Kumar [ rjw: Minor edits in the subject and changelog ] Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- Documentation/cpu-freq/cpu-drivers.rst | 3 --- Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 3 --- drivers/cpufreq/cpufreq.c | 3 --- include/linux/cpufreq.h | 1 - 4 files changed, 10 deletions(-) diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index a697278ce190..74fac797c396 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -71,9 +71,6 @@ And optionally .exit - A pointer to a per-policy cleanup function called during CPU_POST_DEAD phase of cpu hotplug process. - .stop_cpu - A pointer to a per-policy stop function called during - CPU_DOWN_PREPARE phase of cpu hotplug process. - .suspend - A pointer to a per-policy suspend function which is called with interrupts disabled and _after_ the governor is stopped for the policy. diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 0ca2cb646666..9570e9c9e939 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -76,9 +76,6 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 .exit - 一个指向per-policy清理函数的指针,该函数在cpu热插拔过程的CPU_POST_DEAD 阶段被调用。 - .stop_cpu - 一个指向per-policy停止函数的指针,该函数在cpu热插拔过程的CPU_DOWN_PREPARE - 阶段被调用。 - .suspend - 一个指向per-policy暂停函数的指针,该函数在关中断且在该策略的调节器停止 后被调用。 diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d4c6196d0b45..9ad4b21dcf35 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1666,9 +1666,6 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) policy->cdev = NULL; } - if (cpufreq_driver->stop_cpu) - cpufreq_driver->stop_cpu(policy); - if (has_target()) cpufreq_exit_governor(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 547cddaab2fb..be1a36076941 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -375,7 +375,6 @@ struct cpufreq_driver { int (*online)(struct cpufreq_policy *policy); int (*offline)(struct cpufreq_policy *policy); int (*exit)(struct cpufreq_policy *policy); - void (*stop_cpu)(struct cpufreq_policy *policy); int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); -- Gitee From 2bb460f6c2a55fbc4b1f986981df3769e3caacf8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 7 May 2024 14:25:11 +0800 Subject: [PATCH 45/78] cpufreq: Don't unregister cpufreq cooling on CPU hotplug stable inclusion from stable-v6.6.27 commit 3ba4aceb68f013f36fbfd0a6aab5df439b4198d4 bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3ba4aceb68f013f36fbfd0a6aab5df439b4198d4 -------------------------------- [ Upstream commit c4d61a529db788d2e52654f5b02c8d1de4952c5b ] Offlining a CPU and bringing it back online is a common operation and it happens frequently during system suspend/resume, where the non-boot CPUs are hotplugged out during suspend and brought back at resume. The cpufreq core already tries to make this path as fast as possible as the changes are only temporary in nature and full cleanup of resources isn't required in this case. For example the drivers can implement online()/offline() callbacks to avoid a lot of tear down of resources. On similar lines, there is no need to unregister the cpufreq cooling device during suspend / resume, but only while the policy is getting removed. Moreover, unregistering the cpufreq cooling device is resulting in an unwanted outcome, where the system suspend is eventually aborted in the process. Currently, during system suspend the cpufreq core unregisters the cooling device, which in turn removes a kobject using device_del() and that generates a notification to the userspace via uevent broadcast. This causes system suspend to abort in some setups. This was also earlier reported (indirectly) by Roman [1]. Maybe there is another way around to fixing that problem properly, but this change makes sense anyways. Move the registering and unregistering of the cooling device to policy creation and removal times onlyy. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218521 Reported-by: Manaf Meethalavalappu Pallikunhi Reported-by: Roman Stratiienko Link: https://patchwork.kernel.org/project/linux-pm/patch/20220710164026.541466-1-r.stratiienko@gmail.com/ [1] Tested-by: Manaf Meethalavalappu Pallikunhi Signed-off-by: Viresh Kumar Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: ZhangPeng Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 9ad4b21dcf35..e47817b6d785 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1563,7 +1563,8 @@ static int cpufreq_online(unsigned int cpu) if (cpufreq_driver->ready) cpufreq_driver->ready(policy); - if (cpufreq_thermal_control_enabled(cpufreq_driver)) + /* Register cpufreq cooling only for a new policy */ + if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ @@ -1661,11 +1662,6 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) else policy->last_policy = policy->policy; - if (cpufreq_thermal_control_enabled(cpufreq_driver)) { - cpufreq_cooling_unregister(policy->cdev); - policy->cdev = NULL; - } - if (has_target()) cpufreq_exit_governor(policy); @@ -1730,6 +1726,15 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) return; } + /* + * Unregister cpufreq cooling once all the CPUs of the policy are + * removed. + */ + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { + cpufreq_cooling_unregister(policy->cdev); + policy->cdev = NULL; + } + /* We did light-weight exit earlier, do full tear down now */ if (cpufreq_driver->offline && cpufreq_driver->exit) cpufreq_driver->exit(policy); -- Gitee From 658ad5c46d26cc3478025aea32e3f4d2a40749e6 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:04 +0800 Subject: [PATCH 46/78] cpufreq: Remove duplicate check in __cpufreq_offline() mainline inclusion from mainline-v6.17-rc2 commit 5d6ecaaa922611ec3ca067723ccefafb543010ee category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5d6ecaaa922611ec3ca067723ccefafb543010ee ---------------------------------------------------------------------- The has_target() checks in __cpufreq_offline() are duplicate. Remove one of them and put the operations of exiting governor together with storing last governor's name. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250623133402.3120230-5-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e47817b6d785..975c5c0e5814 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1656,14 +1656,13 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) return; } - if (has_target()) + if (has_target()) { strncpy(policy->last_governor, policy->governor->name, CPUFREQ_NAME_LEN); - else - policy->last_policy = policy->policy; - - if (has_target()) cpufreq_exit_governor(policy); + } else { + policy->last_policy = policy->policy; + } /* * Perform the ->offline() during light-weight tear-down, as -- Gitee From 71e0c225615567f825eceba2ea8a21c4efd33500 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:05 +0800 Subject: [PATCH 47/78] cpufreq: Hold cpufreq_driver_lock when assigning cpufreq_driver->set_boost driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Hold the lock to avoid concurrency problems in cpufreq_enable_boost_support() when assigning cpufreq_driver->set_boost. Fixes: 7a6c79f2fe53 ("cpufreq: Simplify core code related to boost support") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 975c5c0e5814..f0a53880561d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2759,13 +2759,17 @@ static void remove_boost_sysfs_file(void) int cpufreq_enable_boost_support(void) { + unsigned long flags; + if (!cpufreq_driver) return -EINVAL; if (cpufreq_boost_supported()) return 0; + write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver->set_boost = cpufreq_boost_set_sw; + write_unlock_irqrestore(&cpufreq_driver_lock, flags); /* This will get removed on driver unregister */ return create_boost_sysfs_file(); -- Gitee From 5db0748bc065c0884fcd6022c70434a7037deff5 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:06 +0800 Subject: [PATCH 48/78] cpufreq: Initialize cpufreq-based frequency-invariance later mainline inclusion from mainline-v6.17-rc2 commit 2a6c727387062a2ea79eb6cf5004820cb1b0afe2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2a6c727387062a2ea79eb6cf5004820cb1b0afe2 ---------------------------------------------------------------------- The cpufreq-based invariance is enabled in cpufreq_register_driver(), but never disabled after registration fails. Move the invariance initialization to where all other initializations have been successfully done to solve this problem. Fixes: 874f63531064 ("cpufreq: report whether cpufreq supports Frequency Invariance (FI)") Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-2-zhenglifeng1@huawei.com [ rjw: New subject ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f0a53880561d..059c1b5c57a0 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -3082,15 +3082,6 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) cpufreq_driver = driver_data; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). - */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; @@ -3122,6 +3113,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) hp_online = ret; ret = 0; + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). + */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("supports frequency invariance"); + } + pr_debug("driver %s up and running\n", driver_data->name); goto out; -- Gitee From b3f98de02312c41d3f643dbc44e9f2dd82e54280 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:08 +0800 Subject: [PATCH 49/78] cpufreq: Move the check of cpufreq_driver->get into cpufreq_verify_current_freq() mainline inclusion from mainline-v6.17-rc2 commit 908981d85f86c5e2b39dfe0b2267c6d44d9c48f7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=908981d85f86c5e2b39dfe0b2267c6d44d9c48f7 ---------------------------------------------------------------------- Move the check of cpufreq_driver->get into cpufreq_verify_current_freq() in case of calling it without check. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-4-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 059c1b5c57a0..49cec8b57a7e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1771,6 +1771,9 @@ static unsigned int cpufreq_verify_current_freq(struct cpufreq_policy *policy, b { unsigned int new_freq; + if (!cpufreq_driver->get) + return 0; + new_freq = cpufreq_driver->get(policy->cpu); if (!new_freq) return 0; @@ -1885,8 +1888,7 @@ unsigned int cpufreq_get(unsigned int cpu) if (policy) { down_read(&policy->rwsem); - if (cpufreq_driver->get) - ret_freq = __cpufreq_get(policy); + ret_freq = __cpufreq_get(policy); up_read(&policy->rwsem); cpufreq_cpu_put(policy); @@ -2408,8 +2410,7 @@ int cpufreq_start_governor(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - if (cpufreq_driver->get) - cpufreq_verify_current_freq(policy, false); + cpufreq_verify_current_freq(policy, false); if (policy->governor->start) { ret = policy->governor->start(policy); -- Gitee From bf717edbb8af64633c905afca61632922a7a3fa0 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:09 +0800 Subject: [PATCH 50/78] cpufreq: Exit governor when failed to start old governor mainline inclusion from mainline-v6.17-rc2 commit 0ae204405095abfbc2d694ee0fbb49bcbbe55c57 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ae204405095abfbc2d694ee0fbb49bcbbe55c57 ---------------------------------------------------------------------- Detect the result of starting old governor in cpufreq_set_policy(). If it fails, exit the governor and clear policy->governor. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-5-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 49cec8b57a7e..d9494e1d6a80 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2618,10 +2618,12 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, pr_debug("starting governor %s failed\n", policy->governor->name); if (old_gov) { policy->governor = old_gov; - if (cpufreq_init_governor(policy)) + if (cpufreq_init_governor(policy)) { policy->governor = NULL; - else - cpufreq_start_governor(policy); + } else if (cpufreq_start_governor(policy)) { + cpufreq_exit_governor(policy); + policy->governor = NULL; + } } return ret; -- Gitee From b5b9d202dd9598549b4c676a691b2fe1c07e648c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Jun 2021 14:27:50 +0530 Subject: [PATCH 51/78] arch_topology: Avoid use-after-free for scale_freq_data mainline inclusion from mainline-v5.14-rc1 commit 83150f5d05f065fb5c12c612f119015cabdcc124 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=83150f5d05f065fb5c12c612f119015cabdcc124 ---------------------------------------------------------------------- Currently topology_scale_freq_tick() (which gets called from scheduler_tick()) may end up using a pointer to "struct scale_freq_data", which was previously cleared by topology_clear_scale_freq_source(), as there is no protection in place here. The users of topology_clear_scale_freq_source() though needs a guarantee that the previously cleared scale_freq_data isn't used anymore, so they can free the related resources. Since topology_scale_freq_tick() is called from scheduler tick, we don't want to add locking in there. Use the RCU update mechanism instead (which is already used by the scheduler's utilization update path) to guarantee race free updates here. synchronize_rcu() makes sure that all RCU critical sections that started before it is called, will finish before it returns. And so the callers of topology_clear_scale_freq_source() don't need to worry about their callback getting called anymore. Cc: Paul E. McKenney Fixes: 01e055c120a4 ("arch_topology: Allow multiple entities to provide sched_freq_tick() callback") Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/base/arch_topology.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 7f947e5734c2..ca6e77ae43d0 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -18,10 +18,11 @@ #include #include #include +#include #include #include -static DEFINE_PER_CPU(struct scale_freq_data *, sft_data); +static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; @@ -66,16 +67,20 @@ void topology_set_scale_freq_source(struct scale_freq_data *data, if (cpumask_empty(&scale_freq_counters_mask)) scale_freq_invariant = topology_scale_freq_invariant(); + rcu_read_lock(); + for_each_cpu(cpu, cpus) { - sfd = per_cpu(sft_data, cpu); + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); /* Use ARCH provided counters whenever possible */ if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) { - per_cpu(sft_data, cpu) = data; + rcu_assign_pointer(per_cpu(sft_data, cpu), data); cpumask_set_cpu(cpu, &scale_freq_counters_mask); } } + rcu_read_unlock(); + update_scale_freq_invariant(true); } EXPORT_SYMBOL_GPL(topology_set_scale_freq_source); @@ -86,22 +91,32 @@ void topology_clear_scale_freq_source(enum scale_freq_source source, struct scale_freq_data *sfd; int cpu; + rcu_read_lock(); + for_each_cpu(cpu, cpus) { - sfd = per_cpu(sft_data, cpu); + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); if (sfd && sfd->source == source) { - per_cpu(sft_data, cpu) = NULL; + rcu_assign_pointer(per_cpu(sft_data, cpu), NULL); cpumask_clear_cpu(cpu, &scale_freq_counters_mask); } } + rcu_read_unlock(); + + /* + * Make sure all references to previous sft_data are dropped to avoid + * use-after-free races. + */ + synchronize_rcu(); + update_scale_freq_invariant(false); } EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source); void topology_scale_freq_tick(void) { - struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data); + struct scale_freq_data *sfd = rcu_dereference_sched(*this_cpu_ptr(&sft_data)); if (sfd) sfd->set_freq_scale(); -- Gitee From 27540d87f65758681b490e19b79bd2047bf5f586 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 2 Nov 2021 18:01:43 +0000 Subject: [PATCH 52/78] Documentation: power: Add description about new callback for EM registration mainline inclusion from mainline-v5.16-rc2 commit d704aa0d44ade12660d4d7220b2a8d785b7b4247 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=d704aa0d44ade12660d4d7220b2a8d785b7b4247 ---------------------------------------------------------------------- The Energy Model (EM) registration for CPUs should now be done using a dedicated callback added recently into CPUFreq framework and drivers. Commit c17495b01b72 ("cpufreq: Add callback to register with energy model") The callback guaranties that the EM registration is called at the right time during driver setup. To avoid mistakes update the documentation to align with the existing code implementation. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- Documentation/power/energy-model.rst | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 5f03b6f47d9e..8c9e784d684f 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -110,6 +110,10 @@ More details about the above APIs can be found in include/linux/energy_model.h. 3. Example driver ----------------- +The CPUFreq framework supports dedicated callback for registering +the EM for a given CPU(s) 'policy' object: cpufreq_driver::register_em(). +That callback has to be implemented properly for a given driver, +because the framework would call it at the right time during setup. This section provides a simple example of a CPUFreq driver registering a performance domain in the Energy Model framework using the (fake) 'foo' protocol. The driver implements an est_power() function to be provided to the @@ -139,24 +143,21 @@ EM framework:: 20 return 0; 21 } 22 - 23 static int foo_cpufreq_init(struct cpufreq_policy *policy) + 23 static void foo_cpufreq_register_em(struct cpufreq_policy *policy) 24 { 25 struct em_data_callback em_cb = EM_DATA_CB(est_power); 26 struct device *cpu_dev; - 27 int nr_opp, ret; + 27 int nr_opp; 28 29 cpu_dev = get_cpu_device(cpumask_first(policy->cpus)); 30 - 31 /* Do the actual CPUFreq init work ... */ - 32 ret = do_foo_cpufreq_init(policy); - 33 if (ret) - 34 return ret; - 35 - 36 /* Find the number of OPPs for this policy */ - 37 nr_opp = foo_get_nr_opp(policy); + 31 /* Find the number of OPPs for this policy */ + 32 nr_opp = foo_get_nr_opp(policy); + 33 + 34 /* And register the new performance domain */ + 35 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); + 37 } 38 - 39 /* And register the new performance domain */ - 40 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); - 41 - 42 return 0; - 43 } + 39 static struct cpufreq_driver foo_cpufreq_driver = { + 40 .register_em = foo_cpufreq_register_em, + 41 }; -- Gitee From 0042a7107bd5fa47adcda1e0a4682fe1047d29be Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 7 May 2024 14:13:19 +0800 Subject: [PATCH 53/78] cpufreq: Fix per-policy boost behavior on SoCs using cpufreq_boost_set_sw() stable inclusion from stable-v6.6.23 commit 9d47d2e7f82dec7cab09996bdb9062786eb827f6 bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9d47d2e7f82dec7cab09996bdb9062786eb827f6 -------------------------------- [ Upstream commit f37a4d6b4a2c77414e8b9d25dd5ee31537ce9b00 ] In the existing code, per-policy flags don't have any impact i.e. if cpufreq_driver boost is enabled and boost is disabled for one or more of the policies, the cpufreq driver will behave as if boost is enabled. Fix this by incorporating per-policy boost flag in the policy->max computation used in cpufreq_frequency_table_cpuinfo and setting the default per-policy boost to mirror the cpufreq_driver boost flag. Fixes: 218a06a79d9a ("cpufreq: Support per-policy performance boost") Reported-by: Dietmar Eggemann Reviewed-by: Viresh Kumar Reviewed-by: Dhruva Gole Signed-off-by: Sibi Sankar Tested-by:Yipeng Zou Reviewed-by: Yipeng Zou Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: ZhangPeng Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 18 ++++++++++++------ drivers/cpufreq/freq_table.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d9494e1d6a80..29b58b143a42 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -638,14 +638,16 @@ static ssize_t store_local_boost(struct cpufreq_policy *policy, if (policy->boost_enabled == enable) return count; + policy->boost_enabled = enable; + cpus_read_lock(); ret = cpufreq_driver->set_boost(policy, enable); cpus_read_unlock(); - if (ret) + if (ret) { + policy->boost_enabled = !policy->boost_enabled; return ret; - - policy->boost_enabled = enable; + } return count; } @@ -1410,6 +1412,9 @@ static int cpufreq_online(unsigned int cpu) goto out_free_policy; } + /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ + policy->boost_enabled = cpufreq_boost_enabled() && policy_has_boost_freq(policy); + /* * The initialization has succeeded and the policy is online. * If there is a problem with its frequency table, take it @@ -2714,11 +2719,12 @@ int cpufreq_boost_trigger_state(int state) get_online_cpus(); for_each_active_policy(policy) { + policy->boost_enabled = state; ret = cpufreq_driver->set_boost(policy, state); - if (ret) + if (ret) { + policy->boost_enabled = !policy->boost_enabled; goto err_reset_state; - - policy->boost_enabled = state; + } } put_online_cpus(); diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index d3f756f7b5a0..fe9c087afe01 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -40,7 +40,7 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, cpufreq_for_each_valid_entry(pos, table) { freq = pos->frequency; - if (!cpufreq_boost_enabled() + if ((!cpufreq_boost_enabled() || !policy->boost_enabled) && (pos->flags & CPUFREQ_BOOST_FREQ)) continue; -- Gitee From ad0ed5db8623bf542fc28044b7ae67f854c6d0e6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 1 Sep 2021 14:41:55 +0530 Subject: [PATCH 54/78] cpufreq: acpi: Remove acpi_cpufreq_cpu_ready() mainline inclusion from mainline-v5.15-rc1 commit 692a3b9a89947b27fc76d40b2613b33286a1690b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=692a3b9a89947b27fc76d40b2613b33286a1690b ---------------------------------------------------------------------- The ready() callback was implemented earlier for acpi-cpufreq driver as we wanted to use policy->cpuinfo.max_freq for which the policy was required to be verified. That is no longer the case and we can do the pr_warn() right from ->init() callback now. Remove acpi_cpufreq_cpu_ready(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 3f9150909126..d71c5af338ad 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -889,6 +889,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->fast_switch_possible = !acpi_pstate_strict && !(policy_is_shared(policy) && policy->shared_type != CPUFREQ_SHARED_TYPE_ANY); + if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) + pr_warn(FW_WARN "P-state 0 is not max freq\n"); + return result; err_unreg: @@ -918,16 +921,6 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) return 0; } -static void acpi_cpufreq_cpu_ready(struct cpufreq_policy *policy) -{ - struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data, - policy->cpu); - unsigned int freq = policy->freq_table[0].frequency; - - if (perf->states[0].core_frequency * 1000 != freq) - pr_warn(FW_WARN "P-state 0 is not max freq\n"); -} - static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = policy->driver_data; @@ -955,7 +948,6 @@ static struct cpufreq_driver acpi_cpufreq_driver = { .bios_limit = acpi_processor_get_bios_limit, .init = acpi_cpufreq_cpu_init, .exit = acpi_cpufreq_cpu_exit, - .ready = acpi_cpufreq_cpu_ready, .resume = acpi_cpufreq_resume, .name = "acpi-cpufreq", .attr = acpi_cpufreq_attr, -- Gitee From d0e0b5fcbc25c59b0ace32c299f0522925c6cf24 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Wed, 2 Nov 2022 14:59:57 -0500 Subject: [PATCH 55/78] cpufreq: ACPI: Defer setting boost MSRs mainline inclusion from mainline-v6.2-rc1 commit 13fdbc8b8da6a2325cad3359c9a70504b0ff2f93 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=13fdbc8b8da6a2325cad3359c9a70504b0ff2f93 ---------------------------------------------------------------------- When acpi-cpufreq is loaded, boost is enabled on every CPU (by setting an MSR) before the driver is registered with cpufreq. This can be very time consuming, because it is done with a CPU hotplug startup callback, and cpuhp_setup_state() schedules the callback (cpufreq_boost_online()) to run on each CPU one at a time, waiting for each to run before calling the next. If cpufreq_register_driver() fails--if, for example, there are no ACPI P-states present--this is wasted time. Since cpufreq already sets up a CPU hotplug startup callback if and when acpi-cpufreq is registered, set the boost MSRs in acpi_cpufreq_cpu_init(), which is called by the cpufreq cpuhp callback. This allows acpi-cpufreq to exit quickly if it is loaded but not needed. On one system with 192 CPUs, this patch speeds up boot by about 30 seconds. Signed-off-by: Stuart Hayes Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index d71c5af338ad..b99de0999560 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -534,15 +534,6 @@ static void free_acpi_perf_data(void) free_percpu(acpi_perf_data); } -static int cpufreq_boost_online(unsigned int cpu) -{ - /* - * On the CPU_UP path we simply keep the boost-disable flag - * in sync with the current global state. - */ - return boost_set_msr(acpi_cpufreq_driver.boost_enabled); -} - static int cpufreq_boost_down_prep(unsigned int cpu) { /* @@ -892,6 +883,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); + set_boost(policy, acpi_cpufreq_driver.boost_enabled); + return result; err_unreg: @@ -911,6 +904,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) pr_debug("%s\n", __func__); + cpufreq_boost_down_prep(policy->cpu); policy->fast_switch_possible = false; policy->driver_data = NULL; acpi_processor_unregister_performance(data->acpi_perf_cpu); @@ -967,25 +961,9 @@ static void __init acpi_cpufreq_boost_init(void) acpi_cpufreq_driver.set_boost = set_boost; acpi_cpufreq_driver.boost_enabled = boost_state(0); - /* - * This calls the online callback on all online cpu and forces all - * MSRs to the same value. - */ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online", - cpufreq_boost_online, cpufreq_boost_down_prep); - if (ret < 0) { - pr_err("acpi_cpufreq: failed to register hotplug callbacks\n"); - return; - } acpi_cpufreq_online = ret; } -static void acpi_cpufreq_boost_exit(void) -{ - if (acpi_cpufreq_online > 0) - cpuhp_remove_state_nocalls(acpi_cpufreq_online); -} - static int __init acpi_cpufreq_init(void) { int ret; @@ -1027,7 +1005,6 @@ static int __init acpi_cpufreq_init(void) ret = cpufreq_register_driver(&acpi_cpufreq_driver); if (ret) { free_acpi_perf_data(); - acpi_cpufreq_boost_exit(); } return ret; } @@ -1036,8 +1013,6 @@ static void __exit acpi_cpufreq_exit(void) { pr_debug("%s\n", __func__); - acpi_cpufreq_boost_exit(); - cpufreq_unregister_driver(&acpi_cpufreq_driver); free_acpi_perf_data(); -- Gitee From c9920c880c85c850741d396946e6be090ba98c62 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Mon, 5 Dec 2022 11:57:44 -0600 Subject: [PATCH 56/78] cpufreq: ACPI: Only set boost MSRs on supported CPUs mainline inclusion from mainline-v6.2-rc1 commit 442046328f27e30ce2c830759af89f42e7169bc1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=442046328f27e30ce2c830759af89f42e7169bc1 ---------------------------------------------------------------------- Stop trying to set boost MSRs on CPUs that don't support boost. This corrects a bug in the recent patch "Defer setting boost MSRs". Fixes: 13fdbc8b8da6 ("cpufreq: ACPI: Defer setting boost MSRs") Signed-off-by: Stuart Hayes Reported-by: Borislav Petkov (AMD) Tested-by: Borislav Petkov (AMD) Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index b99de0999560..7d74a21f7c8f 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -883,7 +883,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); - set_boost(policy, acpi_cpufreq_driver.boost_enabled); + if (acpi_cpufreq_driver.set_boost) + set_boost(policy, acpi_cpufreq_driver.boost_enabled); return result; -- Gitee From c9cc6fbcf85453b6cb0737cb06fe22c629457ccb Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 26 Jun 2024 15:47:23 -0500 Subject: [PATCH 57/78] cpufreq: ACPI: Mark boost policy as enabled when setting boost stable inclusion from stable-v6.6.41 commit 2ca2fd474d862c9b144ea4ed100ee4591f52d574 bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2ca2fd474d862c9b144ea4ed100ee4591f52d574 -------------------------------- commit d92467ad9d9ee63a700934b9228a989ef671d511 upstream. When boost is set for CPUs using acpi-cpufreq, the policy is not updated which can cause boost to be incorrectly not reported. Fixes: 218a06a79d9a ("cpufreq: Support per-policy performance boost") Link: https://patch.msgid.link/20240626204723.6237-2-mario.limonciello@amd.com Suggested-by: Viresh Kumar Suggested-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar Cc: All applicable Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman Signed-off-by: ZhangPeng Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 7d74a21f7c8f..41a3c81614d4 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -883,8 +883,10 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); - if (acpi_cpufreq_driver.set_boost) + if (acpi_cpufreq_driver.set_boost) { set_boost(policy, acpi_cpufreq_driver.boost_enabled); + policy->boost_enabled = acpi_cpufreq_driver.boost_enabled; + } return result; -- Gitee From a9c5ca6c5abdc12292953fafdd5d357787715dcf Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Mon, 13 Oct 2025 03:14:35 +0000 Subject: [PATCH 58/78] cpufreq: Initialize cpufreq-based invariance before subsys mainline inclusion from mainline-v6.17-rc4 commit 8ffe28b4e8d8b18cb2f2933410322c24f039d5d6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8ffe28b4e8d8b18cb2f2933410322c24f039d5d6 ---------------------------------------------------------------------- commit 2a6c72738706 ("cpufreq: Initialize cpufreq-based frequency-invariance later") postponed the frequency invariance initialization to avoid disabling it in the error case. This isn't locking safe, instead move the initialization up before the subsys interface is registered (which will rebuild the sched_domains) and add the corresponding disable on the error path. Observed lockdep without this patch: [ 0.989686] ====================================================== [ 0.989688] WARNING: possible circular locking dependency detected [ 0.989690] 6.17.0-rc4-cix-build+ #31 Tainted: G S [ 0.989691] ------------------------------------------------------ [ 0.989692] swapper/0/1 is trying to acquire lock: [ 0.989693] ffff800082ada7f8 (sched_energy_mutex){+.+.}-{4:4}, at: rebuild_sched_domains_energy+0x30/0x58 [ 0.989705] but task is already holding lock: [ 0.989706] ffff000088c89bc8 (&policy->rwsem){+.+.}-{4:4}, at: cpufreq_online+0x7f8/0xbe0 [ 0.989713] which lock already depends on the new lock. Fixes: 2a6c72738706 ("cpufreq: Initialize cpufreq-based frequency-invariance later") Signed-off-by: Christian Loehle Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinyu Zheng Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 29b58b143a42..de1cc7a3f26c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -3100,6 +3100,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_null_driver; } + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). + */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("cpufreq: supports frequency invariance\n"); + } + ret = subsys_interface_register(&cpufreq_interface); if (ret) goto err_boost_unreg; @@ -3122,21 +3131,14 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) hp_online = ret; ret = 0; - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). - */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - pr_debug("driver %s up and running\n", driver_data->name); goto out; err_if_unreg: subsys_interface_unregister(&cpufreq_interface); err_boost_unreg: + if (!cpufreq_driver->setpolicy) + static_branch_disable_cpuslocked(&cpufreq_freq_invariance); remove_boost_sysfs_file(); err_null_driver: write_lock_irqsave(&cpufreq_driver_lock, flags); -- Gitee From dc2908e43018869e8d7341aab55f12f1f4566841 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Sat, 28 Dec 2024 06:49:45 +0000 Subject: [PATCH 59/78] cpufreq: CPPC: Fix possible null-ptr-deref for cppc_get_cpu_cost() stable inclusion from stable-v6.6.64 commit f05ef81db63889f6f14eb77fd140dac6cedb6f7f category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/ID1B9U CVE: CVE-2024-53230 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f05ef81db63889f6f14eb77fd140dac6cedb6f7f -------------------------------- [ Upstream commit 1a1374bb8c5926674973d849feed500bc61ad535 ] cpufreq_cpu_get_raw() may return NULL if the cpu is not in policy->cpus cpu mask and it will cause null pointer dereference, so check NULL for cppc_get_cpu_cost(). Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Signed-off-by: Jinjie Ruan Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin Signed-off-by: Jinjie Ruan Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 9ad7d2ac2049..37f78688a099 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -579,6 +579,9 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, int step; policy = cpufreq_cpu_get_raw(cpu_dev->id); + if (!policy) + return 0; + cpu_data = policy->driver_data; perf_caps = &cpu_data->perf_caps; max_cap = arch_scale_cpu_capacity(cpu_dev->id); -- Gitee From e7d92b090920fd1b4835da6d23235cf5397e73d2 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 8 Jan 2025 08:43:35 +0000 Subject: [PATCH 60/78] cpufreq: CPPC: Fix possible null-ptr-deref for cpufreq_cpu_get_raw() mainline inclusion from mainline-v6.13-rc1 commit a78e7207564258db6e373e86294a85f9d646d35a category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a78e7207564258db6e373e86294a85f9d646d35a -------------------------------- cpufreq_cpu_get_raw() may return NULL if the cpu is not in policy->cpus cpu mask and it will cause null pointer dereference. Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Signed-off-by: Jinjie Ruan Signed-off-by: Viresh Kumar Signed-off-by: Yuntao Liu Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 37f78688a099..e5a1c7939fc9 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -512,6 +512,9 @@ static int cppc_get_cpu_power(struct device *cpu_dev, struct cppc_cpudata *cpu_data; policy = cpufreq_cpu_get_raw(cpu_dev->id); + if (!policy) + return 0; + cpu_data = policy->driver_data; perf_caps = &cpu_data->perf_caps; max_cap = arch_scale_cpu_capacity(cpu_dev->id); -- Gitee From f0d1f308e835ebd05b463fa2424e6f8f5915aada Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 30 May 2022 12:04:24 +0200 Subject: [PATCH 61/78] cpufreq: CPPC: Fix unused-function warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.19-rc1 commit da4363457f777906d49d765398f5227657c82ef9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=da4363457f777906d49d765398f5227657c82ef9 ---------------------------------------------------------------------- Building the cppc_cpufreq driver with for arm64 with CONFIG_ENERGY_MODEL=n triggers the following warnings: drivers/cpufreq/cppc_cpufreq.c:550:12: error: ‘cppc_get_cpu_cost’ defined but not used [-Werror=unused-function] 550 | static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, | ^~~~~~~~~~~~~~~~~ drivers/cpufreq/cppc_cpufreq.c:481:12: error: ‘cppc_get_cpu_power’ defined but not used [-Werror=unused-function] 481 | static int cppc_get_cpu_power(struct device *cpu_dev, | ^~~~~~~~~~~~~~~~~~ Move the Energy Model related functions into specific guards. This allows to fix the warning and prevent doing extra work when the Energy Model is not present. Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Reported-by: Shaokun Zhang Signed-off-by: Pierre Gondois Tested-by: Shaokun Zhang Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e5a1c7939fc9..824f205e4428 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -462,6 +462,14 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return delay_us; } +#else +static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) +{ + return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; +} +#endif + +#if defined(CONFIG_ARM64) && defined(CONFIG_ENERGY_MODEL) static DEFINE_PER_CPU(unsigned int, efficiency_class); @@ -645,20 +653,11 @@ static void populate_efficiency_class(void) } #else - -static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) -{ - return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; -} static void populate_efficiency_class(void) { } -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) -{ -} #endif - static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) { struct cppc_cpudata *cpu_data; -- Gitee From 3c3b817514da41ae32ad72d338ee09ca7757a456 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Sat, 21 May 2022 11:24:38 +0800 Subject: [PATCH 62/78] cpufreq: CPPC: Fix build error without CONFIG_ACPI_CPPC_CPUFREQ_FIE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.19-rc1 commit a3f083e04a87662559bbadddc02f4739b78e743a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=a3f083e04a87662559bbadddc02f4739b78e743a ---------------------------------------------------------------------- If CONFIG_ACPI_CPPC_CPUFREQ_FIE is not set, building fails: drivers/cpufreq/cppc_cpufreq.c: In function ‘populate_efficiency_class’: drivers/cpufreq/cppc_cpufreq.c:584:2: error: ‘cppc_cpufreq_driver’ undeclared (first use in this function); did you mean ‘cpufreq_driver’? cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; ^~~~~~~~~~~~~~~~~~~ cpufreq_driver Make declare of cppc_cpufreq_driver out of CONFIG_ACPI_CPPC_CPUFREQ_FIE to fix this. Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Signed-off-by: Zheng Bin Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 824f205e4428..adf2a2bdacea 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -65,6 +65,8 @@ static enum { FIE_DISABLED } fie_disabled = FIE_UNSET; +static struct cpufreq_driver cppc_cpufreq_driver; + #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE module_param(fie_disabled, int, 0444); MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); @@ -81,7 +83,6 @@ struct cppc_freq_invariance { static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); static struct kthread_worker *kworker_fie; -static struct cpufreq_driver cppc_cpufreq_driver; static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, struct cppc_perf_fb_ctrs *fb_ctrs_t0, struct cppc_perf_fb_ctrs *fb_ctrs_t1); -- Gitee From 40104aa87ab6c7b0d5915856a2440ad6b695983f Mon Sep 17 00:00:00 2001 From: huwentao Date: Fri, 17 Oct 2025 18:15:56 +0800 Subject: [PATCH 63/78] openeuler_defconfig: Enable CONFIG_ACPI_CPPC_CPUFREQ_FIE hulk inclusion category: featrue bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U -------------------------------- Enable CONFIG_ACPI_CPPC_CPUFREQ_FIE in openeuler_defconfig. Signed-off-by: huwentao --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9d58c2d8eb5e..b5c7f7ec7b6c 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -669,6 +669,7 @@ CONFIG_CPU_FREQ_GOV_SEEP=m # # CONFIG_CPUFREQ_DT is not set CONFIG_ACPI_CPPC_CPUFREQ=y +CONFIG_ACPI_CPPC_CPUFREQ_FIE=y CONFIG_ARM_SCPI_CPUFREQ=m # CONFIG_ARM_QCOM_CPUFREQ_HW is not set # end of CPU Frequency scaling -- Gitee From cf278191e5e2aa43d460cefec7b1925f78cb43b2 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 8 Nov 2022 10:01:03 -0700 Subject: [PATCH 64/78] cpufreq: ACPI: Remove unused variables 'acpi_cpufreq_online' and 'ret' mainline inclusion from mainline-v6.2-rc1 commit cab75e1c8e42ebb4bb386247a928af6ab412e82b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=cab75e1c8e42ebb4bb386247a928af6ab412e82b ---------------------------------------------------------------------- Clang warns: drivers/cpufreq/acpi-cpufreq.c:970:24: error: variable 'ret' is uninitialized when used here [-Werror,-Wuninitialized] acpi_cpufreq_online = ret; ^~~ drivers/cpufreq/acpi-cpufreq.c:960:9: note: initialize the variable 'ret' to silence this warning int ret; ^ = 0 1 error generated. Both ret and acpi_cpufreq_online are now unused so they can be safely removed, clearing up the warning. Fixes: 13fdbc8b8da6 ("cpufreq: ACPI: Defer setting boost MSRs") Link: https://github.com/ClangBuiltLinux/linux/issues/1757 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 41a3c81614d4..1d5d0af10ffd 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -950,12 +950,8 @@ static struct cpufreq_driver acpi_cpufreq_driver = { .attr = acpi_cpufreq_attr, }; -static enum cpuhp_state acpi_cpufreq_online; - static void __init acpi_cpufreq_boost_init(void) { - int ret; - if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA))) { pr_debug("Boost capabilities not present in the processor\n"); return; @@ -963,8 +959,6 @@ static void __init acpi_cpufreq_boost_init(void) acpi_cpufreq_driver.set_boost = set_boost; acpi_cpufreq_driver.boost_enabled = boost_state(0); - - acpi_cpufreq_online = ret; } static int __init acpi_cpufreq_init(void) -- Gitee From 797e3ba153e83bcdbc14bfa5c18d4f3c3863f1b7 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 1 Aug 2024 20:26:08 +0800 Subject: [PATCH 65/78] cpufreq: Allow drivers to advertise boost enabled mainline inclusion from mainline-v6.10-rc7 commit 102fa9c4b439ca3bd93d13fb53f5b7592d96a109 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=102fa9c4b439ca3bd93d13fb53f5b7592d96a109 -------------------------------- The behavior introduced in commit f37a4d6b4a2c ("cpufreq: Fix per-policy boost behavior on SoCs using cpufreq_boost_set_sw()") sets up the boost policy incorrectly when boost has been enabled by the platform firmware initially even if a driver sets the policy up. This is because policy_has_boost_freq() assumes that there is a frequency table set up by the driver and that the boost frequencies are advertised in that table. This assumption doesn't work for acpi-cpufreq or amd-pstate. Only use this check to enable boost if it's not already enabled instead of also disabling it if alreayd enabled. Fixes: f37a4d6b4a2c ("cpufreq: Fix per-policy boost behavior on SoCs using cpufreq_boost_set_sw()") Link: https://patch.msgid.link/20240626204723.6237-1-mario.limonciello@amd.com Reviewed-by: Sibi Sankar Reviewed-by: Dhruva Gole Acked-by: Viresh Kumar Reviewed-by: Gautham R. Shenoy Suggested-by: Viresh Kumar Suggested-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello Cc: All applicable Signed-off-by: Rafael J. Wysocki Signed-off-by: Lin Yujun Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index de1cc7a3f26c..eaed6763d9f5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1413,7 +1413,8 @@ static int cpufreq_online(unsigned int cpu) } /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - policy->boost_enabled = cpufreq_boost_enabled() && policy_has_boost_freq(policy); + if (cpufreq_boost_enabled() && policy_has_boost_freq(policy)) + policy->boost_enabled = true; /* * The initialization has succeeded and the policy is online. -- Gitee From 97348b0b4e6d9f8ef9ea67eae886ac6f68173dae Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Sat, 28 Dec 2024 06:49:46 +0000 Subject: [PATCH 66/78] cpufreq: CPPC: Fix wrong return value in cppc_get_cpu_cost() mainline inclusion from mainline-v6.13-rc1 commit be392aa80f1e5b0b65ccc2a540b9304fefcfe3d8 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=be392aa80f1e5b0b65ccc2a540b9304fefcfe3d8 -------------------------------- cppc_get_cpu_cost() return 0 if the policy is NULL. Then in em_compute_costs(), the later zero check for cost is not valid as cost is uninitialized. As Quentin pointed out, kernel energy model core check the return value of get_cost() first, so if the callback failed it should tell the core. Return -EINVAL to fix it. Fixes: 1a1374bb8c59 ("cpufreq: CPPC: Fix possible null-ptr-deref for cppc_get_cpu_cost()") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/c4765377-7830-44c2-84fa-706b6e304e10@stanley.mountain/ Signed-off-by: Jinjie Ruan Suggested-by: Quentin Perret Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin Signed-off-by: Jinjie Ruan Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index adf2a2bdacea..7c35408c4ff9 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -592,7 +592,7 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, policy = cpufreq_cpu_get_raw(cpu_dev->id); if (!policy) - return 0; + return -EINVAL; cpu_data = policy->driver_data; perf_caps = &cpu_data->perf_caps; -- Gitee From 51c8ae0fb9b0dc5957a4cfe8e24fdec626362de4 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 6 Nov 2024 09:01:11 +0800 Subject: [PATCH 67/78] cpufreq: CPPC: Fix wrong return value in cppc_get_cpu_power() mainline inclusion from mainline-v6.13-rc1 commit b51eb0874d8170028434fbd259e80b78ed9b8eca category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=b51eb0874d8170028434fbd259e80b78ed9b8eca ---------------------------------------------------------------------- cppc_get_cpu_power() return 0 if the policy is NULL. Then in em_create_perf_table(), the later zero check for power is not valid as power is uninitialized. As Quentin pointed out, kernel energy model core check the return value of active_power() first, so if the callback failed it should tell the core. So return -EINVAL to fix it. Fixes: a78e72075642 ("cpufreq: CPPC: Fix possible null-ptr-deref for cpufreq_cpu_get_raw()") Signed-off-by: Jinjie Ruan Suggested-by: Quentin Perret Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 7c35408c4ff9..ec5d341c1420 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -522,7 +522,7 @@ static int cppc_get_cpu_power(struct device *cpu_dev, policy = cpufreq_cpu_get_raw(cpu_dev->id); if (!policy) - return 0; + return -EINVAL; cpu_data = policy->driver_data; perf_caps = &cpu_data->perf_caps; -- Gitee From 49809bb9f49d5c4757ddff49c445ca6b8207a4ff Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Thu, 23 Mar 2023 20:39:24 +0800 Subject: [PATCH 68/78] riscv: export cpu/freq invariant to scheduler mainline inclusion from mainline-v6.4-rc1 commit c4b52d8b6c1de1e6359bef2d1394d5917940b3dc category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXTY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=c4b52d8b6c1de1e6359bef2d1394d5917940b3dc ---------------------------------------------------------------------- RISC-V now manages CPU topology using arch_topology which provides CPU capacity and frequency related interfaces to access the cpu/freq invariant in possible heterogeneous or DVFS-enabled platforms. Here adds topology.h file to export the arch_topology interfaces for replacing the scheduler's constant-based cpu/freq invariant accounting. Signed-off-by: Song Shuai Reviewed-by: Andrew Jones Reviewed-by: Ley Foon Tan Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230323123924.3032174-1-suagrfillet@gmail.com [Palmer: Fix the whitespace issues.] Signed-off-by: Palmer Dabbelt Signed-off-by: huwentao --- arch/riscv/include/asm/topology.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 arch/riscv/include/asm/topology.h diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h new file mode 100644 index 000000000000..727e8d163a3b --- /dev/null +++ b/arch/riscv/include/asm/topology.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_TOPOLOGY_H +#define _ASM_RISCV_TOPOLOGY_H + +#include + +/* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick +#define arch_set_freq_scale topology_set_freq_scale +#define arch_scale_freq_capacity topology_get_freq_scale +#define arch_scale_freq_invariant topology_scale_freq_invariant + +/* Replace task scheduler's default cpu-invariant accounting */ +#define arch_scale_cpu_capacity topology_get_cpu_scale + +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + +#include +#endif /* _ASM_RISCV_TOPOLOGY_H */ -- Gitee From 4abaa97104f2c27117bf4d65afe98a67a01cadd5 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:10 +0000 Subject: [PATCH 69/78] arch_topology: Introduce thermal pressure update function mainline inclusion from mainline-v5.17-rc1 commit c214f124161d446b340597e7c968e0a2dc149142 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXTY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=c214f124161d446b340597e7c968e0a2dc149142 ---------------------------------------------------------------------- The thermal pressure is a mechanism which is used for providing information about reduced CPU performance to the scheduler. Usually code has to convert the value from frequency units into capacity units, which are understandable by the scheduler. Create a common conversion code which can be just used via a handy API. Internally, the topology_update_thermal_pressure() operates on frequency in MHz and max CPU frequency is taken from 'freq_factor' (per-cpu). Signed-off-by: Lukasz Luba Reviewed-by: Thara Gopinath Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- arch/arm/include/asm/topology.h | 1 + arch/arm64/include/asm/topology.h | 1 + drivers/base/arch_topology.c | 43 ++++++++++++++++++++++++++++++- include/linux/arch_topology.h | 3 +++ include/linux/sched/topology.h | 7 +++++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 470299ee2fba..f1eafacc9a30 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,7 @@ /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 5ebe51ef6d6c..7379878579ab 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -37,6 +37,7 @@ void update_freq_counters_refs(void); /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index ca6e77ae43d0..7aef6651f3b3 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -25,6 +25,7 @@ static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; +static DEFINE_PER_CPU(u32, freq_factor) = 1; static bool supports_scale_freq_counters(const struct cpumask *cpus) { @@ -167,6 +168,47 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); } +/** + * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * @cpus : The related CPUs for which capacity has been reduced + * @capped_freq : The maximum allowed frequency that CPUs can run at + * + * Update the value of thermal pressure for all @cpus in the mask. The + * cpumask should include all (online+offline) affected CPUs, to avoid + * operating on stale data when hot-plug is used for some CPUs. The + * @capped_freq reflects the currently allowed max CPUs frequency due to + * thermal capping. It might be also a boost frequency value, which is bigger + * than the internal 'freq_factor' max frequency. In such case the pressure + * value should simply be removed, since this is an indication that there is + * no thermal throttling. The @capped_freq must be provided in kHz. + */ +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq) +{ + unsigned long max_capacity, capacity; + u32 max_freq; + int cpu; + + cpu = cpumask_first(cpus); + max_capacity = arch_scale_cpu_capacity(cpu); + max_freq = per_cpu(freq_factor, cpu); + + /* Convert to MHz scale which is used in 'freq_factor' */ + capped_freq /= 1000; + + /* + * Handle properly the boost frequencies, which should simply clean + * the thermal pressure value. + */ + if (max_freq <= capped_freq) + capacity = max_capacity; + else + capacity = mult_frac(max_capacity, capped_freq, max_freq); + + arch_set_thermal_pressure(cpus, max_capacity - capacity); +} +EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); + static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -224,7 +266,6 @@ static void update_topology_flags_workfn(struct work_struct *work) arch_rebuild_cpu_topology(); } -static DEFINE_PER_CPU(u32, freq_factor) = 1; static u32 *raw_capacity; static int free_raw_capacity(void) diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index dea46128b2da..8739bf45766b 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -63,6 +63,9 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) void topology_set_thermal_pressure(const struct cpumask *cpus, unsigned long th_pressure); +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq); + struct cpu_topology { int thread_id; int core_id; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 431ff2c6bf67..ae0faeee1a5b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -313,6 +313,13 @@ void arch_set_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_update_thermal_pressure +static __always_inline +void arch_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_frequency) +{ } +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); -- Gitee From 50cbdd00582153e2441f929af0855258f9c43acc Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:43 +0800 Subject: [PATCH 70/78] sched/topology: Add a new arch_scale_freq_ref() method mainline inclusion from mainline-v6.8-rc6 commit 9942cb22ea458c34fa17b73d143ea32d4df1caca category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9942cb22ea458c34fa17b73d143ea32d4df1caca ---------------------------------------------------------------------- Create a new method to get a unique and fixed max frequency. Currently cpuinfo.max_freq or the highest (or last) state of performance domain are used as the max frequency when computing the frequency for a level of utilization, but: - cpuinfo_max_freq can change at runtime. boost is one example of such change. - cpuinfo.max_freq and last item of the PD can be different leading to different results between cpufreq and energy model. We need to save the reference frequency that has been used when computing the CPUs capacity and use this fixed and coherent value to convert between frequency and CPU's capacity. In fact, we already save the frequency that has been used when computing the capacity of each CPU. We extend the precision to save kHz instead of MHz currently and we modify the type to be aligned with other variables used when converting frequency to capacity and the other way. [ mingo: Minor edits. ] Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20231211104855.558096-2-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- arch/arm/include/asm/topology.h | 1 + arch/arm64/include/asm/topology.h | 1 + arch/riscv/include/asm/topology.h | 1 + drivers/base/arch_topology.c | 30 +++++++++++++++--------------- include/linux/arch_topology.h | 7 +++++++ include/linux/sched/topology.h | 8 ++++++++ 6 files changed, 33 insertions(+), 15 deletions(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index f1eafacc9a30..cdc0f5e347b7 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -13,6 +13,7 @@ #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref #endif /* Replace task scheduler's default cpu-invariant accounting */ diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 7379878579ab..d6c730e2a923 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -23,6 +23,7 @@ void update_freq_counters_refs(void); #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref #ifdef CONFIG_ACPI_CPPC_LIB #define arch_init_invariance_cppc topology_init_cpu_capacity_cppc diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h index 727e8d163a3b..76e6b9079e86 100644 --- a/arch/riscv/include/asm/topology.h +++ b/arch/riscv/include/asm/topology.h @@ -9,6 +9,7 @@ #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 7aef6651f3b3..3bc8b007f1bd 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -21,11 +21,14 @@ #include #include #include +#include + static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; -static DEFINE_PER_CPU(u32, freq_factor) = 1; +DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1; +EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref); static bool supports_scale_freq_counters(const struct cpumask *cpus) { @@ -178,9 +181,9 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, * operating on stale data when hot-plug is used for some CPUs. The * @capped_freq reflects the currently allowed max CPUs frequency due to * thermal capping. It might be also a boost frequency value, which is bigger - * than the internal 'freq_factor' max frequency. In such case the pressure - * value should simply be removed, since this is an indication that there is - * no thermal throttling. The @capped_freq must be provided in kHz. + * than the internal 'capacity_freq_ref' max frequency. In such case the + * pressure value should simply be removed, since this is an indication that + * there is no thermal throttling. The @capped_freq must be provided in kHz. */ void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq) @@ -191,10 +194,7 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, cpu = cpumask_first(cpus); max_capacity = arch_scale_cpu_capacity(cpu); - max_freq = per_cpu(freq_factor, cpu); - - /* Convert to MHz scale which is used in 'freq_factor' */ - capped_freq /= 1000; + max_freq = arch_scale_freq_ref(cpu); /* * Handle properly the boost frequencies, which should simply clean @@ -287,13 +287,13 @@ void topology_normalize_cpu_scale(void) capacity_scale = 1; for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity_scale = max(capacity, capacity_scale); } pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale); for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); @@ -329,15 +329,15 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) cpu_node, raw_capacity[cpu]); /* - * Update freq_factor for calculating early boot cpu capacities. + * Update capacity_freq_ref for calculating early boot CPU capacities. * For non-clk CPU DVFS mechanism, there's no way to get the * frequency value now, assuming they are running at the same - * frequency (by keeping the initial freq_factor value). + * frequency (by keeping the initial capacity_freq_ref value). */ cpu_clk = of_clk_get(cpu_node, 0); if (!PTR_ERR_OR_ZERO(cpu_clk)) { - per_cpu(freq_factor, cpu) = - clk_get_rate(cpu_clk) / 1000; + per_cpu(capacity_freq_ref, cpu) = + clk_get_rate(cpu_clk) / HZ_PER_KHZ; clk_put(cpu_clk); } } else { @@ -419,7 +419,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); for_each_cpu(cpu, policy->related_cpus) - per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000; + per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq; if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 8739bf45766b..c02f645fd47d 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); +DECLARE_PER_CPU(unsigned long, capacity_freq_ref); + +static inline unsigned long topology_get_freq_ref(int cpu) +{ + return per_cpu(capacity_freq_ref, cpu); +} + DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index ae0faeee1a5b..7a8efe57b119 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -320,6 +320,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); -- Gitee From 9035b3abf823ffb92c84354eeea06e233db53282 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:44 +0800 Subject: [PATCH 71/78] cpufreq: Use the fixed and coherent frequency for scaling capacity mainline inclusion from mainline-v6.8-rc6 commit 599457ba15403037b489fe536266a3d5f9efaed7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=599457ba15403037b489fe536266a3d5f9efaed7 ---------------------------------------------------------------------- cpuinfo.max_freq can change at runtime because of boost as an example. This implies that the value could be different from the frequency that has been used to compute the capacity of a CPU. The new arch_scale_freq_ref() returns a fixed and coherent frequency that can be used to compute the capacity for a given frequency. [ Also fix a arch_set_freq_scale() newline style wart in . ] Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231211104855.558096-3-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index eaed6763d9f5..35a0c8e62d46 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -449,7 +449,7 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy, arch_set_freq_scale(policy->related_cpus, policy->cur, - policy->cpuinfo.max_freq); + arch_scale_freq_ref(policy->cpu)); spin_lock(&policy->transition_lock); policy->transition_ongoing = false; @@ -2188,7 +2188,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, policy->cur = freq; arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); + arch_scale_freq_ref(policy->cpu)); cpufreq_stats_record_transition(policy, freq); if (trace_cpu_frequency_enabled()) { -- Gitee From 5a7da807568512b3c8d6cfec919a150143c2fed7 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:45 +0800 Subject: [PATCH 72/78] cpufreq/schedutil: Use a fixed reference frequency mainline inclusion from mainline-v6.8-rc6 commit b3edde44e5d4504c23a176819865cd603fd16d6c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b3edde44e5d4504c23a176819865cd603fd16d6c ---------------------------------------------------------------------- cpuinfo.max_freq can change at runtime because of boost as an example. This implies that the value could be different than the one that has been used when computing the capacity of a CPU. The new arch_scale_freq_ref() returns a fixed and coherent reference frequency that can be used when computing a frequency based on utilization. Use this arch_scale_freq_ref() when available and fallback to policy otherwise. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Dietmar Eggemann Acked-by: Rafael J. Wysocki Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20231211104855.558096-4-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- kernel/sched/cpufreq_schedutil.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 405577e99cb2..175db52f16e0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -134,6 +134,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, } } +/** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) +{ + unsigned int freq = arch_scale_freq_ref(policy->cpu); + + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + return policy->cur; +} + /** * get_next_freq - Compute a new frequency for a given cpufreq policy. * @sg_policy: schedutil policy object to compute the new frequency for. @@ -160,9 +182,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? - policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) -- Gitee From c48660cf1e2a358b8b8da5d6c218d743e9de4a0e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:46 +0800 Subject: [PATCH 73/78] energy_model: Use a fixed reference frequency mainline inclusion from mainline-v6.8-rc6 commit 15cbbd1d317e07b4e5c6aca5d4c5579539a82784 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15cbbd1d317e07b4e5c6aca5d4c5579539a82784 ---------------------------------------------------------------------- The last item of a performance domain is not always the performance point that has been used to compute CPU's capacity. This can lead to different target frequency compared with other part of the system like schedutil and would result in wrong energy estimation. A new arch_scale_freq_ref() is available to return a fixed and coherent frequency reference that can be used when computing the CPU's frequency for an level of utilization. Use this function to get this reference frequency. Energy model is never used without defining arch_scale_freq_ref() but can be compiled. Define a default arch_scale_freq_ref() returning 0 in such case. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20231211104855.558096-5-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- include/linux/energy_model.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 47506c394841..257dab79caa8 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -136,7 +136,7 @@ void em_dev_unregister_perf_domain(struct device *dev); static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util) { - unsigned long freq, scale_cpu; + unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; int i, cpu; @@ -147,8 +147,8 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ps = &pd->table[pd->nr_perf_states - 1]; - freq = map_util_freq(max_util, ps->frequency, scale_cpu); + ref_freq = arch_scale_freq_ref(cpu); + freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the -- Gitee From 607f6ddc86a1193545fb54b5a1213fe4c08d8b89 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 11 Dec 2023 11:48:53 +0100 Subject: [PATCH 74/78] cpufreq/cppc: Move and rename cppc_cpufreq_{perf_to_khz|khz_to_perf}() stable inclusion from stable-v6.6.59 commit 33e89c16cea0882bb05c585fa13236b730dd0efa category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=33e89c16cea0882bb05c585fa13236b730dd0efa -------------------------------- [ Upstream commit 50b813b147e9eb6546a1fc49d4e703e6d23691f2 ] Move and rename cppc_cpufreq_perf_to_khz() and cppc_cpufreq_khz_to_perf() to use them outside cppc_cpufreq in topology_init_cpu_capacity_cppc(). Modify the interface to use struct cppc_perf_caps *caps instead of struct cppc_cpudata *cpu_data as we only use the fields of cppc_perf_caps. cppc_cpufreq was converting the lowest and nominal freq from MHz to kHz before using them. We move this conversion inside cppc_perf_to_khz and cppc_khz_to_perf to make them generic and usable outside cppc_cpufreq. No functional change Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Pierre Gondois Acked-by: Rafael J. Wysocki Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20231211104855.558096-6-vincent.guittot@linaro.org Stable-dep-of: d93df29bdab1 ("cpufreq: CPPC: fix perf_to_khz/khz_to_perf conversion exception") Signed-off-by: Sasha Levin Signed-off-by: Wen Zhiwei Signed-off-by: huwentao --- drivers/acpi/cppc_acpi.c | 116 +++++++++++++++++++++++++ drivers/cpufreq/cppc_cpufreq.c | 152 +++++---------------------------- include/acpi/cppc_acpi.h | 2 + 3 files changed, 138 insertions(+), 132 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index c2477bd58efc..989ee23ec2b0 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -39,6 +39,9 @@ #include #include #include +#include +#include +#include #include @@ -1759,3 +1762,116 @@ unsigned int cppc_get_transition_latency(int cpu_num) return latency_ns; } EXPORT_SYMBOL_GPL(cppc_get_transition_latency); + +/* Minimum struct length needed for the DMI processor entry we want */ +#define DMI_ENTRY_PROCESSOR_MIN_LENGTH 48 + +/* Offset in the DMI processor structure for the max frequency */ +#define DMI_PROCESSOR_MAX_SPEED 0x14 + +/* Callback function used to retrieve the max frequency from DMI */ +static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) +{ + const u8 *dmi_data = (const u8 *)dm; + u16 *mhz = (u16 *)private; + + if (dm->type == DMI_ENTRY_PROCESSOR && + dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) { + u16 val = (u16)get_unaligned((const u16 *) + (dmi_data + DMI_PROCESSOR_MAX_SPEED)); + *mhz = val > *mhz ? val : *mhz; + } +} + +/* Look up the max frequency in DMI */ +static u64 cppc_get_dmi_max_khz(void) +{ + u16 mhz = 0; + + dmi_walk(cppc_find_dmi_mhz, &mhz); + + /* + * Real stupid fallback value, just in case there is no + * actual value set. + */ + mhz = mhz ? mhz : 1; + + return KHZ_PER_MHZ * mhz; +} + +/* + * If CPPC lowest_freq and nominal_freq registers are exposed then we can + * use them to convert perf to freq and vice versa. The conversion is + * extrapolated as an affine function passing by the 2 points: + * - (Low perf, Low freq) + * - (Nominal perf, Nominal freq) + */ +unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf) +{ + s64 retval, offset = 0; + static u64 max_khz; + u64 mul, div; + + if (caps->lowest_freq && caps->nominal_freq) { + /* Avoid the special case when nominal_freq is equal to lowest_freq */ + if (caps->nominal_freq == caps->lowest_freq) { + mul = caps->nominal_freq; + div = caps->nominal_perf; + } else { + mul = caps->nominal_freq - caps->lowest_freq; + div = caps->nominal_perf - caps->lowest_perf; + } + mul *= KHZ_PER_MHZ; + offset = caps->nominal_freq * KHZ_PER_MHZ - + div64_u64(caps->nominal_perf * mul, div); + } else { + if (!max_khz) + max_khz = cppc_get_dmi_max_khz(); + mul = max_khz; + div = caps->highest_perf; + } + + retval = offset + div64_u64(perf * mul, div); + if (retval >= 0) + return retval; + return 0; +} +EXPORT_SYMBOL_GPL(cppc_perf_to_khz); + +unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int freq) +{ + s64 retval, offset = 0; + static u64 max_khz; + u64 mul, div; + + if (caps->lowest_freq && caps->nominal_freq) { + /* Avoid special case when nominal_freq is equal to lowest_freq */ + if (caps->nominal_freq == caps->lowest_freq) { + mul = caps->nominal_perf; + div = caps->nominal_freq; + } else { + mul = caps->nominal_perf - caps->lowest_perf; + div = caps->nominal_freq - caps->lowest_freq; + } + /* + * We don't need to convert to kHz for computing offset and can + * directly use nominal_freq and lowest_freq as the div64_u64 + * will remove the frequency unit. + */ + offset = caps->nominal_perf - + div64_u64(caps->nominal_freq * mul, div); + /* But we need it for computing the perf level. */ + div *= KHZ_PER_MHZ; + } else { + if (!max_khz) + max_khz = cppc_get_dmi_max_khz(); + mul = caps->highest_perf; + div = max_khz; + } + + retval = offset + div64_u64(freq * mul, div); + if (retval >= 0) + return retval; + return 0; +} +EXPORT_SYMBOL_GPL(cppc_khz_to_perf); diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index ec5d341c1420..39436127cb2d 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -27,12 +26,6 @@ #include -/* Minimum struct length needed for the DMI processor entry we want */ -#define DMI_ENTRY_PROCESSOR_MIN_LENGTH 48 - -/* Offest in the DMI processor structure for the max frequency */ -#define DMI_PROCESSOR_MAX_SPEED 0x14 - static bool boost_supported; struct cppc_workaround_oem_info { @@ -294,108 +287,9 @@ static inline void cppc_freq_invariance_exit(void) } #endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */ -/* Callback function used to retrieve the max frequency from DMI */ -static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) -{ - const u8 *dmi_data = (const u8 *)dm; - u16 *mhz = (u16 *)private; - - if (dm->type == DMI_ENTRY_PROCESSOR && - dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) { - u16 val = (u16)get_unaligned((const u16 *) - (dmi_data + DMI_PROCESSOR_MAX_SPEED)); - *mhz = val > *mhz ? val : *mhz; - } -} - -/* Look up the max frequency in DMI */ -static u64 cppc_get_dmi_max_khz(void) -{ - u16 mhz = 0; - - dmi_walk(cppc_find_dmi_mhz, &mhz); - - /* - * Real stupid fallback value, just in case there is no - * actual value set. - */ - mhz = mhz ? mhz : 1; - - return (1000 * mhz); -} - -/* - * If CPPC lowest_freq and nominal_freq registers are exposed then we can - * use them to convert perf to freq and vice versa. The conversion is - * extrapolated as an affine function passing by the 2 points: - * - (Low perf, Low freq) - * - (Nominal perf, Nominal freq) - */ -static unsigned int cppc_cpufreq_perf_to_khz(struct cppc_cpudata *cpu_data, - unsigned int perf) -{ - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - s64 retval, offset = 0; - static u64 max_khz; - u64 mul, div; - - if (caps->lowest_freq && caps->nominal_freq) { - /* Avoid the special case when nominal_freq is equal to lowest_freq */ - if (caps->nominal_freq == caps->lowest_freq) { - mul = caps->nominal_freq; - div = caps->nominal_perf; - } else { - mul = caps->nominal_freq - caps->lowest_freq; - div = caps->nominal_perf - caps->lowest_perf; - } - offset = caps->nominal_freq - div64_u64(caps->nominal_perf * mul, div); - } else { - if (!max_khz) - max_khz = cppc_get_dmi_max_khz(); - mul = max_khz; - div = caps->highest_perf; - } - - retval = offset + div64_u64(perf * mul, div); - if (retval >= 0) - return retval; - return 0; -} - -static unsigned int cppc_cpufreq_khz_to_perf(struct cppc_cpudata *cpu_data, - unsigned int freq) -{ - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - s64 retval, offset = 0; - static u64 max_khz; - u64 mul, div; - - if (caps->lowest_freq && caps->nominal_freq) { - /* Avoid special case when nominal_freq is equal to lowest_freq */ - if (caps->nominal_freq == caps->lowest_freq) { - mul = caps->nominal_perf; - div = caps->nominal_freq; - } else { - mul = caps->nominal_perf - caps->lowest_perf; - div = caps->nominal_freq - caps->lowest_freq; - } - offset = caps->nominal_perf - div64_u64(caps->nominal_freq * mul, div); - } else { - if (!max_khz) - max_khz = cppc_get_dmi_max_khz(); - mul = caps->highest_perf; - div = max_khz; - } - - retval = offset + div64_u64(freq * mul, div); - if (retval >= 0) - return retval; - return 0; -} - static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) + unsigned int target_freq, + unsigned int relation) { struct cppc_cpudata *cpu_data = policy->driver_data; unsigned int cpu = policy->cpu; @@ -403,7 +297,7 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, u32 desired_perf; int ret = 0; - desired_perf = cppc_cpufreq_khz_to_perf(cpu_data, target_freq); + desired_perf = cppc_khz_to_perf(&cpu_data->perf_caps, target_freq); /* Return if it is exactly the same perf */ if (desired_perf == cpu_data->perf_ctrls.desired_perf) return ret; @@ -534,7 +428,7 @@ static int cppc_get_cpu_power(struct device *cpu_dev, min_step = min_cap / CPPC_EM_CAP_STEP; max_step = max_cap / CPPC_EM_CAP_STEP; - perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + perf_prev = cppc_khz_to_perf(perf_caps, *KHz); step = perf_prev / perf_step; if (step > max_step) @@ -554,8 +448,8 @@ static int cppc_get_cpu_power(struct device *cpu_dev, perf = step * perf_step; } - *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); - perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + *KHz = cppc_perf_to_khz(perf_caps, perf); + perf_check = cppc_khz_to_perf(perf_caps, *KHz); step_check = perf_check / perf_step; /* @@ -565,8 +459,8 @@ static int cppc_get_cpu_power(struct device *cpu_dev, */ while ((*KHz == prev_freq) || (step_check != step)) { perf++; - *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); - perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + *KHz = cppc_perf_to_khz(perf_caps, perf); + perf_check = cppc_khz_to_perf(perf_caps, *KHz); step_check = perf_check / perf_step; } @@ -598,7 +492,7 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, perf_caps = &cpu_data->perf_caps; max_cap = arch_scale_cpu_capacity(cpu_dev->id); - perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, KHz); + perf_prev = cppc_khz_to_perf(perf_caps, KHz); perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; step = perf_prev / perf_step; @@ -683,10 +577,6 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) goto free_mask; } - /* Convert the lowest and nominal freq from MHz to KHz */ - cpu_data->perf_caps.lowest_freq *= 1000; - cpu_data->perf_caps.nominal_freq *= 1000; - return cpu_data; free_mask: @@ -725,18 +615,18 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) * Set min to lowest nonlinear perf to avoid any efficiency penalty (see * Section 8.4.7.1.1.5 of ACPI 6.1 spec) */ - policy->min = cppc_cpufreq_perf_to_khz(cpu_data, - caps->lowest_nonlinear_perf); - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, policy->boost_enabled ? - caps->highest_perf : caps->nominal_perf); + policy->min = cppc_perf_to_khz(caps, + caps->lowest_nonlinear_perf); + policy->max = cppc_perf_to_khz(caps, policy->boost_enabled ? + caps->highest_perf : caps->nominal_perf); /* * Set cpuinfo.min_freq to Lowest to make the full range of performance * available if userspace wants to use any perf between lowest & lowest * nonlinear perf */ - policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, - caps->lowest_perf); + policy->cpuinfo.min_freq = cppc_perf_to_khz(caps, + caps->lowest_perf); policy->cpuinfo.max_freq = policy->max; policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); @@ -770,7 +660,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) boost_supported = true; /* Set policy->cur to norm now. */ - policy->cur = cppc_cpufreq_perf_to_khz(cpu_data, caps->nominal_perf); + policy->cur = cppc_perf_to_khz(caps, caps->nominal_perf); cpu_data->perf_ctrls.desired_perf = caps->nominal_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); @@ -894,7 +784,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) &fb_ctrs.fb_ctrs_t0, &fb_ctrs.fb_ctrs_t1); - return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); + return cppc_perf_to_khz(&cpu_data->perf_caps, delivered_perf); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) @@ -909,11 +799,9 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) } if (state) - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - caps->highest_perf); + policy->max = cppc_perf_to_khz(caps, caps->highest_perf); else - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - caps->nominal_perf); + policy->max = cppc_perf_to_khz(caps, caps->nominal_perf); policy->cpuinfo.max_freq = policy->max; ret = freq_qos_update_request(policy->max_freq_req, policy->max); @@ -1077,7 +965,7 @@ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) if (ret < 0) return 0; - return cppc_cpufreq_perf_to_khz(cpu_data, desired_perf); + return cppc_perf_to_khz(&cpu_data->perf_caps, desired_perf); } static void cppc_check_hisi_workaround(void) diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 5d24633e3177..bb21c00f0d5a 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -144,6 +144,8 @@ extern int cppc_set_epp(int cpu, u64 epp_val); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); extern bool cppc_perf_ctrs_in_pcc(void); +extern unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf); +extern unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int freq); extern bool acpi_cpc_valid(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); -- Gitee From cad879258ca4bcbd65c25b5530dc793e0d760062 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:47 +0800 Subject: [PATCH 75/78] cpufreq/cppc: Set the frequency used for computing the capacity mainline inclusion from mainline-v6.8-rc6 commit 5477fa249b56c59c3baa1b237bf083cffa64c84a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5477fa249b56c59c3baa1b237bf083cffa64c84a ---------------------------------------------------------------------- Save the frequency associated to the performance that has been used when initializing the capacity of CPUs. Also, cppc cpufreq driver can register an artificial energy model. In such case, it needs the frequency for this compute capacity. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Pierre Gondois Acked-by: Sudeep Holla Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20231211104855.558096-7-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/base/arch_topology.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3bc8b007f1bd..51a341c346f1 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -358,6 +358,7 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) void topology_init_cpu_capacity_cppc(void) { + u64 capacity, capacity_scale = 0; struct cppc_perf_caps perf_caps; int cpu; @@ -374,6 +375,10 @@ void topology_init_cpu_capacity_cppc(void) (perf_caps.highest_perf >= perf_caps.nominal_perf) && (perf_caps.highest_perf >= perf_caps.lowest_perf)) { raw_capacity[cpu] = perf_caps.highest_perf; + capacity_scale = max_t(u64, capacity_scale, raw_capacity[cpu]); + + per_cpu(capacity_freq_ref, cpu) = cppc_perf_to_khz(&perf_caps, raw_capacity[cpu]); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%u (raw).\n", cpu, raw_capacity[cpu]); continue; @@ -384,7 +389,15 @@ void topology_init_cpu_capacity_cppc(void) goto exit; } - topology_normalize_cpu_scale(); + for_each_possible_cpu(cpu) { + capacity = raw_capacity[cpu]; + capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, + capacity_scale); + topology_set_cpu_scale(cpu, capacity); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", + cpu, topology_get_cpu_scale(cpu)); + } + schedule_work(&update_topology_flags_work); pr_debug("cpu_capacity: cpu_capacity initialization done\n"); -- Gitee From 80f1d32dd4d5868b9332feed11998fa0b0bfc46c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:48 +0800 Subject: [PATCH 76/78] arm64/amu: Use capacity_ref_freq() to set AMU ratio mainline inclusion from mainline-v6.8-rc6 commit 1f023007f5e782bda19ad9104830c404fd622c5d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1f023007f5e782bda19ad9104830c404fd622c5d ---------------------------------------------------------------------- Use the new capacity_ref_freq() method to set the ratio that is used by AMU for computing the arch_scale_freq_capacity(). This helps to keep everything aligned using the same reference for computing CPUs capacity. The default value of the ratio (stored in per_cpu(arch_max_freq_scale)) ensures that arch_scale_freq_capacity() returns max capacity until it is set to its correct value with the cpu capacity and capacity_ref_freq(). Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Sudeep Holla Acked-by: Will Deacon Link: https://lore.kernel.org/r/20231211104855.558096-8-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 26 +++++++++++++------------- drivers/base/arch_topology.c | 12 +++++++++++- include/linux/arch_topology.h | 1 + 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index a320b03fcced..e626a4985c48 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -170,7 +170,12 @@ int __init parse_acpi_topology(void) #undef pr_fmt #define pr_fmt(fmt) "AMU: " fmt -static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale); +/* + * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until + * the CPU capacity and its associated frequency have been correctly + * initialized. + */ +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); static DEFINE_PER_CPU(u64, arch_const_cycles_prev); static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; @@ -200,14 +205,14 @@ static inline bool freq_counters_valid(int cpu) return true; } -static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) +void freq_inv_set_max_ratio(int cpu, u64 max_rate) { - u64 ratio; + u64 ratio, ref_rate = arch_timer_get_rate(); if (unlikely(!max_rate || !ref_rate)) { - pr_debug("CPU%d: invalid maximum or reference frequency.\n", + WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n", cpu); - return -EINVAL; + return; } /* @@ -227,12 +232,10 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) ratio = div64_u64(ratio, max_rate); if (!ratio) { WARN_ONCE(1, "Reference frequency too low.\n"); - return -EINVAL; + return; } - per_cpu(arch_max_freq_scale, cpu) = (unsigned long)ratio; - - return 0; + WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio); } static void amu_scale_freq_tick(void) @@ -283,10 +286,7 @@ static void amu_fie_setup(const struct cpumask *cpus) return; for_each_cpu(cpu, cpus) { - if (!freq_counters_valid(cpu) || - freq_inv_set_max_ratio(cpu, - cpufreq_get_hw_max_freq(cpu) * 1000ULL, - arch_timer_get_rate())) + if (!freq_counters_valid(cpu)) return; } diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 51a341c346f1..7b28f2416e00 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -353,6 +353,10 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) return !ret; } +void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate) +{ +} + #ifdef CONFIG_ACPI_CPPC_LIB #include @@ -390,6 +394,9 @@ void topology_init_cpu_capacity_cppc(void) } for_each_possible_cpu(cpu) { + freq_inv_set_max_ratio(cpu, + per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); + capacity = raw_capacity[cpu]; capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); @@ -431,8 +438,11 @@ init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); - for_each_cpu(cpu, policy->related_cpus) + for_each_cpu(cpu, policy->related_cpus) { per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq; + freq_inv_set_max_ratio(cpu, + per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); + } if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index c02f645fd47d..984a20559a4d 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -103,6 +103,7 @@ void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); +void freq_inv_set_max_ratio(int cpu, u64 max_rate); #ifdef CONFIG_HOTPLUG_SMT bool topology_is_primary_thread(unsigned int cpu); -- Gitee From 00ab162423c2eea882db3df13cc199eeec8e8e74 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 17 Jan 2024 20:05:45 +0100 Subject: [PATCH 77/78] topology: Set capacity_freq_ref in all cases mainline inclusion from mainline-v6.9-rc6 commit 98323e9d70172f1b46d1cadb20d6c54abf62870d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=98323e9d70172f1b46d1cadb20d6c54abf62870d ---------------------------------------------------------------------- If "capacity-dmips-mhz" is not set, raw_capacity is null and we skip the normalization step which includes setting per_cpu capacity_freq_ref. Always register the notifier but skip the capacity normalization if raw_capacity is null. Fixes: 9942cb22ea45 ("sched/topology: Add a new arch_scale_freq_ref() method") Signed-off-by: Vincent Guittot Acked-by: Sudeep Holla Tested-by: Pierre Gondois Tested-by: Mark Brown Tested-by: Paul Barker Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20240117190545.596057-1-vincent.guittot@linaro.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/base/arch_topology.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 7b28f2416e00..6050bff683fc 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -426,9 +426,6 @@ init_cpu_capacity_callback(struct notifier_block *nb, struct cpufreq_policy *policy = data; int cpu; - if (!raw_capacity) - return 0; - if (val != CPUFREQ_CREATE_POLICY) return 0; @@ -445,9 +442,11 @@ init_cpu_capacity_callback(struct notifier_block *nb, } if (cpumask_empty(cpus_to_visit)) { - topology_normalize_cpu_scale(); - schedule_work(&update_topology_flags_work); - free_raw_capacity(); + if (raw_capacity) { + topology_normalize_cpu_scale(); + schedule_work(&update_topology_flags_work); + free_raw_capacity(); + } pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); } @@ -467,7 +466,7 @@ static int __init register_cpufreq_notifier(void) * On ACPI-based systems skip registering cpufreq notifier as cpufreq * information is not needed for cpu capacity initialization. */ - if (!acpi_disabled || !raw_capacity) + if (!acpi_disabled) return -EINVAL; if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) -- Gitee From 9697a172bebe28d49d3c070fda98ae6bf5af2a27 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Fri, 13 Jun 2025 15:27:51 +0800 Subject: [PATCH 78/78] arm64: topology: Setup amu fie when cpu hotplugging driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT ---------------------------------------------------------------------- Amu fie was set up by a cpufreq policy notifier after the policy was created. This caused some problems: 1. The cpus related to the same policy would all fail to set up amu fie if one of them couldn't pass the freq_counters_valid() check. 2. The cpus fail to set up amu fie would never have a chance to set up again. When boot with maxcpu=1 restrict, the support amu flags of the offline cpus would never be setup. After that, when cpufreq policy was being created, the online cpu might set up amu fie fail because the other cpus related to the same policy couldn't pass the freq_counters_valid() check. Hotplug the offline cpus, since the policy was already created, amu_fie_setup() would never be called again. All cpus couldn't setup amu fie in this situation. After commit 1f023007f5e7 ("arm64/amu: Use capacity_ref_freq() to set AMU ratio"), the max_freq stores in policy data is never needed when setting up amu fie. This indicates that the setting up of amu fie does not depend on the policy any more. So each cpu can set up amu fie separately during hotplug and the problems above will be solved. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 53 +++++++++++------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index e626a4985c48..893d31fdd374 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -277,52 +276,28 @@ static struct scale_freq_data amu_sfd = { .set_freq_scale = amu_scale_freq_tick, }; -static void amu_fie_setup(const struct cpumask *cpus) +static void amu_fie_setup(unsigned int cpu) { - int cpu; - - /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_test_cpu(cpu, amu_fie_cpus)) return; - for_each_cpu(cpu, cpus) { - if (!freq_counters_valid(cpu)) - return; - } + if (!freq_counters_valid(cpu)) + return; - cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); + cpumask_set_cpu(cpu, amu_fie_cpus); topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); - pr_debug("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(cpus)); + pr_debug("CPUs[%u]: counters will be used for FIE.", cpu); } -static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, - void *data) +static int cpuhp_topology_online(unsigned int cpu) { - struct cpufreq_policy *policy = data; - - if (val == CPUFREQ_CREATE_POLICY) - amu_fie_setup(policy->related_cpus); - - /* - * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU - * counters don't have any dependency on cpufreq driver once we have - * initialized AMU support and enabled invariance. The AMU counters will - * keep on working just fine in the absence of the cpufreq driver, and - * for the CPUs for which there are no counters available, the last set - * value of arch_freq_scale will remain valid as that is the frequency - * those CPUs are running at. - */ + amu_fie_setup(cpu); return 0; } -static struct notifier_block init_amu_fie_notifier = { - .notifier_call = init_amu_fie_callback, -}; - static int __init init_amu_fie(void) { int ret; @@ -330,12 +305,16 @@ static int __init init_amu_fie(void) if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) return -ENOMEM; - ret = cpufreq_register_notifier(&init_amu_fie_notifier, - CPUFREQ_POLICY_NOTIFIER); - if (ret) + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "arm64/topology:online", + cpuhp_topology_online, + NULL); + if (ret < 0) { free_cpumask_var(amu_fie_cpus); + return ret; + } - return ret; + return 0; } core_initcall(init_amu_fie); -- Gitee