From ed57324d5ee1ac607972b4426558ed9a9424e13d Mon Sep 17 00:00:00 2001 From: LeoLiu-oc Date: Tue, 21 Oct 2025 10:50:01 +0800 Subject: [PATCH] x86/delay: Update support for Zhaoxin PAUSEOPT instruction zhaoxin inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID2S2C CVE: NA ------------------- The original ZXPAUSE instruction has been renamed to its official name, PAUSEOPT. Moreover, the original patches have been optimized and updated. Therefore, the original patches need to be withdrawn so that they can be replaced with the new ones later. PAUSEOPT instructs the processor to enter an implementation-dependent optimized state. The instruction execution wakes up when the time-stamp counter reaches or exceeds the implicit EDX:EAX 64-bit input value. The instruction execution also wakes up due to the expiration of the operating system time limit or by an external interrupt. PAUSEOPT is available on processors with X86_FEATURE_PAUSEOPT. PAUSEOPT allows the processor to enter a lightweight power/performance optimized state (C0.1 state) for a period specified by the instruction or until the system time limit. The MSR_PAUSEOPT_CONTROL MSR allows the OS to enable/disable C0.2 on the processor and to set the maximum time the processor can reside in C0.1 or C0.2. By default C0.2 is disabled. A sysfs interface to adjust the time and the C0.2 enablement is provided in a follow-up change. 
Reviewed-by: Felix Zhang Tested-by: Lyle Li Signed-off-by: LeoLiu-oc --- arch/x86/Kconfig.assembler | 5 + arch/x86/include/asm/cpufeatures.h | 4 +- arch/x86/include/asm/delay.h | 2 +- arch/x86/include/asm/msr-index.h | 8 +- arch/x86/include/asm/mwait.h | 17 +- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/{zxpause.c => pauseopt.c} | 156 +++++++----------- arch/x86/kernel/time.c | 4 +- arch/x86/lib/delay.c | 16 +- tools/arch/x86/include/asm/cpufeatures.h | 4 +- tools/arch/x86/include/asm/msr-index.h | 9 +- 11 files changed, 95 insertions(+), 132 deletions(-) rename arch/x86/kernel/cpu/{zxpause.c => pauseopt.c} (40%) diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index 16d0b022d6fff..1195554bb92c6 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -29,3 +29,8 @@ config AS_WRUSS def_bool $(as-instr64,wrussq %rax$(comma)(%rbx)) help Supported by binutils >= 2.31 and LLVM integrated assembler + +config AS_PAUSEOPT + def_bool $(as-instr,pauseopt) + help + Supported by binutils >= xxx-TBD and LLVM integrated assembler xxx-TBD diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1688c2c667fb2..bcb7955e56de4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -498,8 +498,8 @@ #define X86_FEATURE_HYGON_SM3 (29*32 + 1) /* "sm3" SM3 instructions */ #define X86_FEATURE_HYGON_SM4 (29*32 + 2) /* "sm4" SM4 instructions */ -/* VIA/Cyrix/Centaur/Zhaoxin-defined CPU features, CPUID level 0xC0000006, word 21 */ -#define X86_FEATURE_ZXPAUSE (30*32 + 0) /* Zhaoxin ZXPAUSE */ +/* Zhaoxin/Centaur-defined CPU features, CPUID level 0xC0000006, word 30 */ +#define X86_FEATURE_PAUSEOPT (30*32 + 0) /* Zhaoxin PAUSEOPT */ /* * BUG word(s) diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 4dbb3fea67fb5..c844077f19b6f 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -7,7 +7,7 @@ void __init 
use_tsc_delay(void); void __init use_tpause_delay(void); -void __init use_zxpause_delay(void); +void __init use_pauseopt_delay(void); void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9860a3cb69d09..d5685ac32a4ea 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -75,22 +75,18 @@ #define MSR_IA32_UMWAIT_CONTROL 0xe1 #define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0) #define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1) - -#define MSR_ZX_PAUSE_CONTROL 0x187f -#define MSR_ZX_PAUSE_CONTROL_C02_DISABLE BIT(0) -#define MSR_ZX_PAUSE_CONTROL_RESERVED BIT(1) - /* * The time field is bit[31:2], but representing a 32bit value with * bit[1:0] zero. */ #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) +#define MSR_PAUSEOPT_CONTROL 0x187f /* * The time field is bit[31:2], but representing a 32bit value with * bit[1:0] zero. */ -#define MSR_ZX_PAUSE_CONTROL_TIME_MASK (~0x03U) +#define MSR_PAUSEOPT_CONTROL_TIME_MASK (~0x03U) /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ #define MSR_IA32_CORE_CAPS 0x000000cf diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 6d4e5805dd0a4..7d7afb8676099 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -26,7 +26,7 @@ #define TPAUSE_C01_STATE 1 #define TPAUSE_C02_STATE 0 -#define ZXPAUSE_C01_STATE 1 +#define PAUSEOPT_P01_STATE 1 static __always_inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) @@ -151,16 +151,14 @@ static inline void __tpause(u32 ecx, u32 edx, u32 eax) } /* - * Caller can specify whether to enter C0.1 (low latency, less - * power saving) or C0.2 state (saves more power, but longer wakeup - * latency). This may be overridden by the ZX_PAUSE_CONTROL MSR - * which can force requests for C0.2 to be downgraded to C0.1. + * Caller can specify to enter P0.1 (low latency, less power saving). 
*/ -static inline void __zxpause(u32 ecx, u32 edx, u32 eax) +static inline void __pauseopt(u32 ecx, u32 edx, u32 eax) { - /* "zxpause %ecx, %edx, %eax;" */ -#ifdef CONFIG_AS_ZXPAUSE - asm volatile("zxpause %%ecx\n" + /* "pauseopt %ecx, %edx, %eax;" */ +#ifdef CONFIG_AS_PAUSEOPT + asm volatile( + "pauseopt\n" : : "c"(ecx), "d"(edx), "a"(eax)); #else @@ -169,4 +167,5 @@ static inline void __zxpause(u32 ecx, u32 edx, u32 eax) : "c"(ecx), "d"(edx), "a"(eax)); #endif } + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index b96c8413bbc32..807c523dfdecc 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -25,7 +25,7 @@ obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-y += umwait.o -obj-y += zxpause.o +obj-y += pauseopt.o obj-$(CONFIG_PROC_FS) += proc.o obj-y += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/zxpause.c b/arch/x86/kernel/cpu/pauseopt.c similarity index 40% rename from arch/x86/kernel/cpu/zxpause.c rename to arch/x86/kernel/cpu/pauseopt.c index 7f55f5d9e8c0c..58a490da7b3e5 100644 --- a/arch/x86/kernel/cpu/zxpause.c +++ b/arch/x86/kernel/cpu/pauseopt.c @@ -6,41 +6,37 @@ #include #include -#define ZXPAUSE_C02_ENABLE 0 - -#define ZXPAUSE_CTRL_VAL(max_time, c02_disable) \ - (((max_time) & MSR_ZX_PAUSE_CONTROL_TIME_MASK) | \ - ((c02_disable) & MSR_ZX_PAUSE_CONTROL_C02_DISABLE)) +#define PAUSEOPT_CTRL_VAL(max_time) (((max_time) & MSR_PAUSEOPT_CONTROL_TIME_MASK)) /* - * Cache ZX_PAUSE_CONTROL MSR. This is a systemwide control. By default, - * zxpause max time is 100000 in TSC-quanta and C0.2 is enabled + * Cache PAUSEOPT_CONTROL MSR. This is a systemwide control. By default, + * pauseopt max time is 100000 in TSC-quanta and P0.1 is enabled. 
*/ -static u32 zxpause_control_cached = ZXPAUSE_CTRL_VAL(100000, ZXPAUSE_C02_ENABLE); +static u32 pauseopt_control_cached = PAUSEOPT_CTRL_VAL(100000); /* - * Cache the original ZX_PAUSE_CONTROL MSR value which is configured by + * Cache the original PAUSEOPT_CONTROL MSR value which is configured by * hardware or BIOS before kernel boot. */ -static u32 orig_zxpause_control_cached __ro_after_init; +static u32 orig_pauseopt_control_cached __ro_after_init; /* - * Serialize access to zxpause_control_cached and ZX_PAUSE_CONTROL MSR in + * Serialize access to pauseopt_control_cached and PAUSEOPT_CONTROL MSR in * the sysfs write functions. */ -static DEFINE_MUTEX(zxpause_lock); +static DEFINE_MUTEX(pauseopt_lock); -static void zxpause_update_control_msr(void *unused) +static void pauseopt_update_control_msr(void *unused) { lockdep_assert_irqs_disabled(); - wrmsr(MSR_ZX_PAUSE_CONTROL, READ_ONCE(zxpause_control_cached), 0); + wrmsr(MSR_PAUSEOPT_CONTROL, READ_ONCE(pauseopt_control_cached), 0); } /* * The CPU hotplug callback sets the control MSR to the global control * value. * - * Disable interrupts so the read of zxpause_control_cached and the WRMSR + * Disable interrupts so the read of pauseopt_control_cached and the WRMSR * are protected against a concurrent sysfs write. Otherwise the sysfs * write could update the cached value after it had been read on this CPU * and issue the IPI before the old value had been written. The IPI would @@ -51,10 +47,10 @@ static void zxpause_update_control_msr(void *unused) * value or the IPI is updating this CPU to the new control value after * interrupts have been reenabled. 
*/ -static int zxpause_cpu_online(unsigned int cpu) +static int pauseopt_cpu_online(unsigned int cpu) { local_irq_disable(); - zxpause_update_control_msr(NULL); + pauseopt_update_control_msr(NULL); local_irq_enable(); return 0; } @@ -63,21 +59,21 @@ static int zxpause_cpu_online(unsigned int cpu) * The CPU hotplug callback sets the control MSR to the original control * value. */ -static int zxpause_cpu_offline(unsigned int cpu) +static int pauseopt_cpu_offline(unsigned int cpu) { /* * This code is protected by the CPU hotplug already and - * orig_zxpause_control_cached is never changed after it caches - * the original control MSR value in zxpause_init(). So there + * orig_pauseopt_control_cached is never changed after it caches + * the original control MSR value in pauseopt_init(). So there * is no race condition here. */ - wrmsr(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached, 0); + wrmsr(MSR_PAUSEOPT_CONTROL, orig_pauseopt_control_cached, 0); return 0; } /* - * On resume, restore ZX_PAUSE_CONTROL MSR on the boot processor which + * On resume, restore PAUSEOPT_CONTROL MSR on the boot processor which * is the only active CPU at this time. The MSR is set up on the APs via the * CPU hotplug callback. * @@ -86,81 +82,51 @@ static int zxpause_cpu_offline(unsigned int cpu) * trust the firmware nor does it matter if the same value is written * again. */ -static void zxpause_syscore_resume(void) +static void pauseopt_syscore_resume(void) { - zxpause_update_control_msr(NULL); + pauseopt_update_control_msr(NULL); } -static struct syscore_ops zxpause_syscore_ops = { - .resume = zxpause_syscore_resume, +static struct syscore_ops pauseopt_syscore_ops = { + .resume = pauseopt_syscore_resume, }; /* sysfs interface */ -/* - * When bit 0 in ZX_PAUSE_CONTROL MSR is 1, C0.2 is disabled. - * Otherwise, C0.2 is enabled. 
- */ -static inline bool zxpause_ctrl_c02_enabled(u32 ctrl) -{ - return !(ctrl & MSR_ZX_PAUSE_CONTROL_C02_DISABLE); -} - -static inline u32 zxpause_ctrl_max_time(u32 ctrl) +static inline u32 pauseopt_ctrl_max_time(u32 ctrl) { - return ctrl & MSR_ZX_PAUSE_CONTROL_TIME_MASK; + return ctrl & MSR_PAUSEOPT_CONTROL_TIME_MASK; } -static inline void zxpause_update_control(u32 maxtime, bool c02_enable) +static inline void pauseopt_update_control(u32 maxtime) { - u32 ctrl = maxtime & MSR_ZX_PAUSE_CONTROL_TIME_MASK; - - if (!c02_enable) - ctrl |= MSR_ZX_PAUSE_CONTROL_C02_DISABLE; + u32 ctrl = maxtime & MSR_PAUSEOPT_CONTROL_TIME_MASK; - WRITE_ONCE(zxpause_control_cached, ctrl); + WRITE_ONCE(pauseopt_control_cached, ctrl); /* Propagate to all CPUs */ - on_each_cpu(zxpause_update_control_msr, NULL, 1); + on_each_cpu(pauseopt_update_control_msr, NULL, 1); } static ssize_t -enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf) +enable_p01_show(struct device *dev, struct device_attribute *attr, char *buf) { - u32 ctrl = READ_ONCE(zxpause_control_cached); + u32 ret; - return sprintf(buf, "%d\n", zxpause_ctrl_c02_enabled(ctrl)); -} + if (boot_cpu_has(X86_FEATURE_PAUSEOPT)) + ret = 1; + else + ret = 0; -static ssize_t enable_c02_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - bool c02_enable; - u32 ctrl; - int ret; - - ret = kstrtobool(buf, &c02_enable); - if (ret) - return ret; - - mutex_lock(&zxpause_lock); - - ctrl = READ_ONCE(zxpause_control_cached); - if (c02_enable != zxpause_ctrl_c02_enabled(ctrl)) - zxpause_update_control(ctrl, c02_enable); - - mutex_unlock(&zxpause_lock); - - return count; + return sprintf(buf, "%d\n", ret); } -static DEVICE_ATTR_RW(enable_c02); +static DEVICE_ATTR_RO(enable_p01); static ssize_t max_time_show(struct device *kobj, struct device_attribute *attr, char *buf) { - u32 ctrl = READ_ONCE(zxpause_control_cached); + u32 ctrl = READ_ONCE(pauseopt_control_cached); - return 
sprintf(buf, "%u\n", zxpause_ctrl_max_time(ctrl)); + return sprintf(buf, "%u\n", pauseopt_ctrl_max_time(ctrl)); } static ssize_t max_time_store(struct device *kobj, @@ -175,49 +141,49 @@ static ssize_t max_time_store(struct device *kobj, return ret; /* bits[1:0] must be zero */ - if (max_time & ~MSR_ZX_PAUSE_CONTROL_TIME_MASK) + if (max_time & ~MSR_PAUSEOPT_CONTROL_TIME_MASK) return -EINVAL; - mutex_lock(&zxpause_lock); + mutex_lock(&pauseopt_lock); - ctrl = READ_ONCE(zxpause_control_cached); - if (max_time != zxpause_ctrl_max_time(ctrl)) - zxpause_update_control(max_time, zxpause_ctrl_c02_enabled(ctrl)); + ctrl = READ_ONCE(pauseopt_control_cached); + if (max_time != pauseopt_ctrl_max_time(ctrl)) + pauseopt_update_control(max_time); - mutex_unlock(&zxpause_lock); + mutex_unlock(&pauseopt_lock); return count; } static DEVICE_ATTR_RW(max_time); -static struct attribute *zxpause_attrs[] = { - &dev_attr_enable_c02.attr, +static struct attribute *pauseopt_attrs[] = { + &dev_attr_enable_p01.attr, &dev_attr_max_time.attr, NULL }; -static struct attribute_group zxpause_attr_group = { - .attrs = zxpause_attrs, - .name = "zxpause_control", +static struct attribute_group pauseopt_attr_group = { + .attrs = pauseopt_attrs, + .name = "pauseopt_control", }; -static int __init zxpause_init(void) +static int __init pauseopt_init(void) { struct device *dev; int ret; - if (!boot_cpu_has(X86_FEATURE_ZXPAUSE)) + if (!boot_cpu_has(X86_FEATURE_PAUSEOPT)) return -ENODEV; /* * Cache the original control MSR value before the control MSR is - * changed. This is the only place where orig_zxpause_control_cached + * changed. This is the only place where orig_pauseopt_control_cached * is modified. 
*/ - rdmsrl(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached); + rdmsrl(MSR_PAUSEOPT_CONTROL, orig_pauseopt_control_cached); - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "zxpause:online", - zxpause_cpu_online, zxpause_cpu_offline); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "pauseopt:online", + pauseopt_cpu_online, pauseopt_cpu_offline); if (ret < 0) { /* * On failure, the control MSR on all CPUs has the @@ -226,13 +192,17 @@ static int __init zxpause_init(void) return ret; } - register_syscore_ops(&zxpause_syscore_ops); + register_syscore_ops(&pauseopt_syscore_ops); /* - * Add zxpause control interface. Ignore failure, so at least the + * Add pauseopt control interface. Ignore failure, so at least the * default values are set up in case the machine manages to boot. */ dev = bus_get_dev_root(&cpu_subsys); - return sysfs_create_group(&dev->kobj, &zxpause_attr_group); + if (dev) { + ret = sysfs_create_group(&dev->kobj, &pauseopt_attr_group); + put_device(dev); + } + return ret; } -device_initcall(zxpause_init); +device_initcall(pauseopt_init); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 6a6c8bd7843c6..99355d27415ee 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -85,8 +85,8 @@ static __init void x86_late_time_init(void) if (static_cpu_has(X86_FEATURE_WAITPKG)) use_tpause_delay(); - if (static_cpu_has(X86_FEATURE_ZXPAUSE)) - use_zxpause_delay(); + if (static_cpu_has(X86_FEATURE_PAUSEOPT)) + use_pauseopt_delay(); } /* diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 3946badbd78fd..96bf5b3baacd3 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -118,12 +118,12 @@ static void delay_halt_tpause(u64 start, u64 cycles) } /* - * On ZHAOXIN the ZXPAUSE instruction waits until any of: + * On ZHAOXIN the PAUSEOPT instruction waits until any of: * 1) the delta of TSC counter exceeds the value provided in EDX:EAX - * 2) global timeout in ZX_PAUSE_CONTROL is exceeded + * 2) global timeout in 
PAUSEOPT_CONTROL is exceeded * 3) an external interrupt occurs */ -static void delay_halt_zxpause(u64 unused, u64 cycles) +static void delay_halt_pauseopt(u64 unused, u64 cycles) { u64 until = cycles; u32 eax, edx; @@ -131,11 +131,7 @@ static void delay_halt_zxpause(u64 unused, u64 cycles) eax = lower_32_bits(until); edx = upper_32_bits(until); - /* - * Hard code the deeper (C0.1) sleep state because exit latency is - * small compared to the "microseconds" that usleep() will delay. - */ - __zxpause(ZXPAUSE_C01_STATE, edx, eax); + __pauseopt(PAUSEOPT_P01_STATE, edx, eax); } /* @@ -204,9 +200,9 @@ void __init use_tpause_delay(void) delay_fn = delay_halt; } -void __init use_zxpause_delay(void) +void __init use_pauseopt_delay(void) { - delay_halt_fn = delay_halt_zxpause; + delay_halt_fn = delay_halt_pauseopt; delay_fn = delay_halt; } diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 3c3af4eb58773..73f37fd032769 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -443,8 +443,8 @@ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ -/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 20 */ -#define X86_FEATURE_ZXPAUSE (30*32 + 0) /* ZHAOXIN ZXPAUSE */ +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 30 */ +#define X86_FEATURE_PAUSEOPT (30*32 + 0) /* ZHAOXIN PAUSEOPT */ /* * BUG word(s) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 474fd815148e8..29f30c8beed33 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -72,22 +72,19 @@ #define MSR_IA32_UMWAIT_CONTROL 0xe1 #define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0) #define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1) - -#define MSR_ZX_PAUSE_CONTROL 0x187f -#define 
MSR_ZX_PAUSE_CONTROL_C02_DISABLE BIT(0) -#define MSR_ZX_PAUSE_CONTROL_RESERVED BIT(1) - /* * The time field is bit[31:2], but representing a 32bit value with * bit[1:0] zero. */ #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) +#define MSR_PAUSEOPT_CONTROL 0x187f + /* * The time field is bit[31:2], but representing a 32bit value with * bit[1:0] zero. */ -#define MSR_ZX_PAUSE_CONTROL_TIME_MASK (~0x03U) +#define MSR_PAUSEOPT_CONTROL_TIME_MASK (~0x03U) /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ #define MSR_IA32_CORE_CAPS 0x000000cf -- Gitee