From df8b37ff5cec844ae9554c8264d56616a61ad916 Mon Sep 17 00:00:00 2001
From: Jie Zhan <zhanjie9@hisilicon.com>
Date: Tue, 22 Aug 2023 20:48:37 +0800
Subject: [PATCH 1/5] cpufreq: Support per-policy performance boost

[Upstream commit 218a06a79d9a98a96ef46bb003d4d8adb0962056]
The boost control currently applies to the whole system.  However, users
may prefer to boost a subset of cores in order to provide prioritized
performance to workloads running on the boosted cores.

Enable per-policy boost by adding a 'boost' sysfs interface under each
policy path.  This can be found at:

	/sys/devices/system/cpu/cpufreq/policy<*>/boost

Same to the global boost switch, writing 1/0 to the per-policy 'boost'
enables/disables boost on a cpufreq policy respectively.

The user view of global and per-policy boost controls should be:

1. Enabling global boost initially enables boost on all policies, and
per-policy boost can then be enabled or disabled individually, given that
the platform does support so.

2. Disabling global boost makes the per-policy boost interface illegal.

Signed-off-by: Jie Zhan <zhanjie9@hisilicon.com>
Reviewed-by: Wei Xu <xuwei5@hisilicon.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 43 +++++++++++++++++++++++++++++++++++++++
 include/linux/cpufreq.h   |  3 +++
 2 files changed, 46 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 04dfa34096cc..7b1e37cc7166 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -77,6 +77,7 @@ static void cpufreq_governor_limits(struct cpufreq_policy *policy);
 static int cpufreq_set_policy(struct cpufreq_policy *policy,
 			      struct cpufreq_governor *new_gov,
 			      unsigned int new_pol);
+static bool cpufreq_boost_supported(void);
 
 /**
  * Two notifier lists: the "policy" list is involved in the
@@ -605,6 +606,40 @@ static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr,
 }
 define_one_global_rw(boost);
 
+static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", policy->boost_enabled);
+}
+
+static ssize_t store_local_boost(struct cpufreq_policy *policy,
+				 const char *buf, size_t count)
+{
+	int ret, enable;
+
+	ret = kstrtoint(buf, 10, &enable);
+	if (ret || enable < 0 || enable > 1)
+		return -EINVAL;
+
+	if (!cpufreq_driver->boost_enabled)
+		return -EINVAL;
+
+	if (policy->boost_enabled == enable)
+		return count;
+
+	cpus_read_lock();
+	ret = cpufreq_driver->set_boost(policy, enable);
+	cpus_read_unlock();
+
+	if (ret)
+		return ret;
+
+	policy->boost_enabled = enable;
+
+	return count;
+}
+
+static struct freq_attr local_boost = __ATTR(boost, 0644, show_local_boost, store_local_boost);
+
 static struct cpufreq_governor *find_governor(const char *str_governor)
 {
 	struct cpufreq_governor *t;
@@ -1045,6 +1080,12 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
 			return ret;
 	}
 
+	if (cpufreq_boost_supported()) {
+		ret = sysfs_create_file(&policy->kobj, &local_boost.attr);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
@@ -2564,6 +2605,8 @@ int cpufreq_boost_trigger_state(int state)
 		ret = cpufreq_driver->set_boost(policy, state);
 		if (ret)
 			goto err_reset_state;
+
+		policy->boost_enabled = state;
 	}
 	put_online_cpus();
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 81b9a6be2e02..bf47a225b8f6 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -125,6 +125,9 @@ struct cpufreq_policy {
 	 */
 	bool			dvfs_possible_from_any_cpu;
 
+	/* Per policy boost enabled flag. */
+	bool			boost_enabled;
+
 	 /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
 	unsigned int cached_target_freq;
 	int cached_resolved_idx;
-- 
Gitee


From 8f527ca5f5c975daa5c998c3aa0982f7558ad404 Mon Sep 17 00:00:00 2001
From: Lifeng Zheng <zhenglifeng1@huawei.com>
Date: Tue, 11 Mar 2025 11:24:04 +0800
Subject: [PATCH 2/5] cpufreq: Introduce a more generic way to set default
 per-policy boost flag

[Upstream commit dd016f379ebc2d43a9405742d1a6066577509bd7]
In cpufreq_online() of cpufreq.c, the per-policy boost flag is already
set to mirror the cpufreq_driver boost during init but using freq_table
to judge if the policy has boost frequency. There are two drawbacks to
this approach:

 1. It doesn't work for the cpufreq drivers that do not use a frequency
    table. For now, acpi-cpufreq and amd-pstate have to enable boost in
    policy initialization. And cppc_cpufreq never set policy to boost
    when going online no matter what the cpufreq_driver boost flag is.

 2. If the CPU goes offline when cpufreq_driver boost is enabled and
    then goes online when cpufreq_driver boost is disabled, the
    per-policy boost flag will incorrectly remain true.

Running set_boost at the end of the online process is a more generic way
for all cpufreq drivers.

Signed-off-by: Lifeng Zheng <zhenglifeng1@huawei.com>
Link: https://patch.msgid.link/20250117101457.1530653-3-zhenglifeng1@huawei.com
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 7b1e37cc7166..b01c6de60f0a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1542,6 +1542,18 @@ static int cpufreq_online(unsigned int cpu)
 	if (cpufreq_thermal_control_enabled(cpufreq_driver))
 		policy->cdev = of_cpufreq_cooling_register(policy);
 
+	/* Let the per-policy boost flag mirror the cpufreq_driver boost during init */
+	if (policy->boost_enabled != cpufreq_boost_enabled()) {
+		policy->boost_enabled = cpufreq_boost_enabled();
+		ret = cpufreq_driver->set_boost(policy, policy->boost_enabled);
+		if (ret) {
+			/* If the set_boost fails, the online operation is not affected */
+			pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu,
+				policy->boost_enabled ? "enable" : "disable");
+			policy->boost_enabled = !policy->boost_enabled;
+		}
+	}
+
 	pr_debug("initialization complete\n");
 
 	return 0;
-- 
Gitee


From 65b76102b93d5a6b33e6785a6a467af297cf2e7a Mon Sep 17 00:00:00 2001
From: Lifeng Zheng <zhenglifeng1@huawei.com>
Date: Tue, 11 Mar 2025 11:24:05 +0800
Subject: [PATCH 3/5] cpufreq: CPPC: Fix wrong max_freq in policy
 initialization

[Upstream commit 03d8b4e76266e11662c5e544854b737843173e2d]
In policy initialization, policy->max and policy->cpuinfo.max_freq are
always set to the value calculated from caps->nominal_perf.

This will cause the frequency stay on base frequency even if the policy
is already boosted when a CPU is going online.

Fix this by using policy->boost_enabled to determine which value should
be set.

Signed-off-by: Lifeng Zheng <zhenglifeng1@huawei.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://patch.msgid.link/20250117101457.1530653-4-zhenglifeng1@huawei.com
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index d07d38e9faed..689ff142ee4b 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -307,7 +307,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	 * Section 8.4.7.1.1.5 of ACPI 6.1 spec)
 	 */
 	policy->min = cppc_cpufreq_perf_to_khz(cpu, cpu->perf_caps.lowest_nonlinear_perf);
-	policy->max = cppc_cpufreq_perf_to_khz(cpu, cpu->perf_caps.nominal_perf);
+	policy->max = cppc_cpufreq_perf_to_khz(cpu, policy->boost_enabled ?
+                                                cpu->perf_caps.highest_perf : cpu->perf_caps.nominal_perf);
 
 	/*
 	 * Set cpuinfo.min_freq to Lowest to make the full range of performance
@@ -315,7 +316,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	 * nonlinear perf
 	 */
 	policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu, cpu->perf_caps.lowest_perf);
-	policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu, cpu->perf_caps.nominal_perf);
+	policy->cpuinfo.max_freq = policy->max;
 
 	policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu_num);
 	policy->shared_type = cpu->shared_type;
-- 
Gitee


From edce2f97f0817c0fa4bd342b00ffd3e3744a3cea Mon Sep 17 00:00:00 2001
From: Aboorva Devarajan <aboorvad@linux.ibm.com>
Date: Tue, 11 Mar 2025 11:24:07 +0800
Subject: [PATCH 4/5] cpufreq: prevent NULL dereference in cpufreq_online()

[Upstream commit 0813fd2e14ca6ecd4e6ba005a9766f08e26020d7]
Ensure cpufreq_driver->set_boost is non-NULL before using it in
cpufreq_online() to prevent a potential NULL pointer dereference.

Reported-by: Gautam Menghani <gautam@linux.ibm.com>
Closes: https://lore.kernel.org/all/c9e56c5f54cc33338762c94e9bed7b5a0d5de812.camel@linux.ibm.com/
Fixes: da59223d340c ("cpufreq: Introduce a more generic way to set default per-policy boost flag")
Suggested-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Link: https://patch.msgid.link/20250205181347.2079272-1-aboorvad@linux.ibm.com
[ rjw: Minor edits in the subject and changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b01c6de60f0a..57a820ad1206 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1543,7 +1543,8 @@ static int cpufreq_online(unsigned int cpu)
 		policy->cdev = of_cpufreq_cooling_register(policy);
 
 	/* Let the per-policy boost flag mirror the cpufreq_driver boost during init */
-	if (policy->boost_enabled != cpufreq_boost_enabled()) {
+	if (cpufreq_driver->set_boost &&
+	    policy->boost_enabled != cpufreq_boost_enabled()) {
 		policy->boost_enabled = cpufreq_boost_enabled();
 		ret = cpufreq_driver->set_boost(policy, policy->boost_enabled);
 		if (ret) {
-- 
Gitee


From 8e76eb71b4ed47d89caa4ca44bead5e53de2b671 Mon Sep 17 00:00:00 2001
From: Jie Zhan <zhanjie9@hisilicon.com>
Date: Mon, 21 Apr 2025 20:34:35 +0800
Subject: [PATCH 5/5] cpufreq: governor: Fix negative 'idle_time' handling in
 dbs_update()

[Upstream commit 3698dd6b139dc37b35a9ad83d9330c1f99666c02]
We observed an issue that the CPU frequency can't raise up with a 100% CPU
load when NOHZ is off and the 'conservative' governor is selected.

'idle_time' can be negative if it's obtained from get_cpu_idle_time_jiffy()
when NOHZ is off.  This was found and explained in commit 9485e4ca0b48
("cpufreq: governor: Fix handling of special cases in dbs_update()").

However, commit 7592019634f8 ("cpufreq: governors: Fix long idle detection
logic in load calculation") introduced a comparison between 'idle_time' and
'samling_rate' to detect a long idle interval.  While 'idle_time' is
converted to int before comparison, it's actually promoted to unsigned
again when compared with an unsigned 'sampling_rate'.  Hence, this leads to
wrong idle interval detection when it's in fact 100% busy and sets
policy_dbs->idle_periods to a very large value.  'conservative' adjusts the
frequency to minimum because of the large 'idle_periods', such that the
frequency can't raise up.  'Ondemand' doesn't use policy_dbs->idle_periods
so it fortunately avoids the issue.

Correct negative 'idle_time' to 0 before any use of it in dbs_update().

Fixes: 7592019634f8 ("cpufreq: governors: Fix long idle detection logic in load calculation")
Signed-off-by: Jie Zhan <zhanjie9@hisilicon.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Link: https://patch.msgid.link/20250213035510.2402076-1-zhanjie9@hisilicon.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_governor.c | 45 +++++++++++++++---------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 4bb054d0cb43..01d19d9ac0fe 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -145,7 +145,23 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
 		time_elapsed = update_time - j_cdbs->prev_update_time;
 		j_cdbs->prev_update_time = update_time;
 
-		idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
+		/*
+		 * cur_idle_time could be smaller than j_cdbs->prev_cpu_idle if
+		 * it's obtained from get_cpu_idle_time_jiffy() when NOHZ is
+		 * off, where idle_time is calculated by the difference between
+		 * time elapsed in jiffies and "busy time" obtained from CPU
+		 * statistics.  If a CPU is 100% busy, the time elapsed and busy
+		 * time should grow with the same amount in two consecutive
+		 * samples, but in practice there could be a tiny difference,
+		 * making the accumulated idle time decrease sometimes.  Hence,
+		 * in this case, idle_time should be regarded as 0 in order to
+		 * make the further process correct.
+		 */
+		if (cur_idle_time > j_cdbs->prev_cpu_idle)
+			idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
+		else
+			idle_time = 0;
+
 		j_cdbs->prev_cpu_idle = cur_idle_time;
 
 		if (ignore_nice) {
@@ -162,7 +178,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
 			 * calls, so the previous load value can be used then.
 			 */
 			load = j_cdbs->prev_load;
-		} else if (unlikely((int)idle_time > 2 * sampling_rate &&
+		} else if (unlikely(idle_time > 2 * sampling_rate &&
 				    j_cdbs->prev_load)) {
 			/*
 			 * If the CPU had gone completely idle and a task has
@@ -189,30 +205,15 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
 			load = j_cdbs->prev_load;
 			j_cdbs->prev_load = 0;
 		} else {
-			if (time_elapsed >= idle_time) {
+			if (time_elapsed > idle_time)
 				load = 100 * (time_elapsed - idle_time) / time_elapsed;
-			} else {
-				/*
-				 * That can happen if idle_time is returned by
-				 * get_cpu_idle_time_jiffy().  In that case
-				 * idle_time is roughly equal to the difference
-				 * between time_elapsed and "busy time" obtained
-				 * from CPU statistics.  Then, the "busy time"
-				 * can end up being greater than time_elapsed
-				 * (for example, if jiffies_64 and the CPU
-				 * statistics are updated by different CPUs),
-				 * so idle_time may in fact be negative.  That
-				 * means, though, that the CPU was busy all
-				 * the time (on the rough average) during the
-				 * last sampling interval and 100 can be
-				 * returned as the load.
-				 */
-				load = (int)idle_time < 0 ? 100 : 0;
-			}
+			else
+				load = 0;
+
 			j_cdbs->prev_load = load;
 		}
 
-		if (unlikely((int)idle_time > 2 * sampling_rate)) {
+		if (unlikely(idle_time > 2 * sampling_rate)) {
 			unsigned int periods = idle_time / sampling_rate;
 
 			if (periods < idle_periods)
-- 
Gitee