From 581829c24ea5e021ace4593bbd5c48a7034e6ffc Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:32:40 +0800 Subject: [PATCH 01/10] x86/cpu: Add new VMX feature, Tertiary VM-Execution control mainline inclusion from mainline-v6.0-rc1 commit 465932db25f3664893b66152c7b190afd28c32db category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=465932db25f3664893b66152c7b190afd28c32db Intel-SIG: commit 465932db25f3 ("x86/cpu: Add new VMX feature, Tertiary VM-Execution control") ------------------------------------- x86/cpu: Add new VMX feature, Tertiary VM-Execution control A new 64-bit control field "tertiary processor-based VM-execution controls", is defined [1]. It's controlled by bit 17 of the primary processor-based VM-execution controls. Different from its brother VM-execution fields, this tertiary VM- execution controls field is 64 bit. So it occupies 2 vmx_feature_leafs, TERTIARY_CTLS_LOW and TERTIARY_CTLS_HIGH. Its companion VMX capability reporting MSR,MSR_IA32_VMX_PROCBASED_CTLS3 (0x492), is also semantically different from its brothers, whose 64 bits consist of all allow-1, rather than 32-bit allow-0 and 32-bit allow-1 [1][2]. Therefore, its init_vmx_capabilities() is a little different from others. [1] ISE 6.2 "VMCS Changes" https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html [2] SDM Vol3. Appendix A.3 Reviewed-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153240.11549-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmxfeatures.h | 3 ++- arch/x86/kernel/cpu/feat_ctl.c | 9 ++++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c1be8eaf4304..0d5a8dd4f5e6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -941,6 +941,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 9915990fd8cf..0d43a41310af 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -5,7 +5,7 @@ /* * Defines VMX CPU feature bits */ -#define NVMXINTS 3 /* N 32-bit words worth of info */ +#define NVMXINTS 5 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -43,6 +43,7 @@ #define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */ #define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */ #define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */ +#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */ #define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ #define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */ diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index da696eb4821a..993697e71854 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -15,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -22,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); -- Gitee From fcfbb76a81cc28f2efb04c914d717d1ef53f9d2f Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:33:18 +0800 Subject: [PATCH 02/10] KVM: VMX: Extend BUILD_CONTROLS_SHADOW macro to support 64-bit variation mainline inclusion from mainline-v6.0-rc1 commit ed3905ba60384ab8c73b421c3618375e58080a9a category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ed3905ba60384ab8c73b421c3618375e58080a9a Intel-SIG: commit ed3905ba6038 ("KVM: VMX: Extend BUILD_CONTROLS_SHADOW macro to support 64-bit variation") ------------------------------------- KVM: VMX: Extend BUILD_CONTROLS_SHADOW macro to support 64-bit variation The Tertiary VM-Exec Control, different from previous control fields, is 64 bit. So extend BUILD_CONTROLS_SHADOW() by adding a 'bit' parameter, to support both 32 bit and 64 bit fields' auxiliary functions building. Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153318.11595-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/x86/kvm/vmx/vmx.h | 52 +++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1848fef1f96e..2937296a945e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -404,31 +404,35 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return vmx->loaded_vmcs->controls_shadow.lname; \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) { -- Gitee From df66c9d17b69a0e4487fb46a2e0c15ed4fd7d6c2 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:34:00 +0800 Subject: [PATCH 03/10] KVM: VMX: Detect Tertiary VM-Execution control when setup VMCS config mainline inclusion from mainline-v6.0-rc1 commit 1ad4e5438c67a01620ed67cea959de89f4430515 category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1ad4e5438c67a01620ed67cea959de89f4430515 Intel-SIG: commit 1ad4e5438c67 ("KVM: VMX: Detect Tertiary VM-Execution control when setup VMCS config") ------------------------------------- KVM: VMX: Detect Tertiary VM-Execution control when setup VMCS config Check VMX features on tertiary execution control in VMCS config setup. Sub-features in tertiary execution control to be enabled are adjusted according to hardware capabilities although no sub-feature is enabled in this patch. EVMCSv1 doesn't support tertiary VM-execution control, so disable it when EVMCSv1 is in use. And define the auxiliary functions for Tertiary control field here, using the new BUILD_CONTROLS_SHADOW(). Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153400.11642-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/kvm/vmx/capabilities.h | 7 +++++++ arch/x86/kvm/vmx/evmcs.c | 2 ++ arch/x86/kvm/vmx/evmcs.h | 1 + arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 29 ++++++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 1 + 7 files changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index c6f028bac3ff..4c9c4df1c74d 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -31,6 +31,7 @@ #define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING) #define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING) #define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING) +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS) #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) @@ -219,6 +220,8 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 7c47c7f7911b..e52e355b99ab 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -60,6 +60,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -133,6 +134,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 5b68034ec5f9..a406a7a59ffd 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -299,8 +299,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 011929a63823..fecaa026b78d 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 571d9ad80a59..48a71ddec218 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -48,6 +48,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6bd6cd341dc3..e1298128d6c3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2550,6 +2550,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2558,6 +2567,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -2579,7 +2589,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2652,6 +2663,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, "1-setting enable VPID VM-execution control\n"); } + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = 0; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); + } + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; @@ -2739,6 +2757,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -4330,6 +4349,11 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a @@ -4496,6 +4520,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) secondary_exec_controls_set(vmx, vmx->secondary_exec_control); } + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2937296a945e..f3ac7929c99a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -433,6 +433,7 @@ BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) { -- Gitee From a0e47d7af172ee694014a5121f3bdd6da40dabf3 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:34:41 +0800 Subject: [PATCH 04/10] KVM: VMX: Report tertiary_exec_control field in dump_vmcs() mainline inclusion from mainline-v6.0-rc1 commit 0b85baa5f46de1c6ad6e4b987905df041f2f80f0 category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0b85baa5f46de1c6ad6e4b987905df041f2f80f0 Intel-SIG: commit 0b85baa5f46d ("KVM: VMX: Report tertiary_exec_control field in dump_vmcs()") ------------------------------------- KVM: VMX: Report tertiary_exec_control field in dump_vmcs() Add tertiary_exec_control field report in dump_vmcs(). Meanwhile, reorganize the dump output of VMCS category as follows. Before change: *** Control State *** PinBased=0x000000ff CPUBased=0xb5a26dfa SecondaryExec=0x061037eb EntryControls=0000d1ff ExitControls=002befff After change: *** Control State *** CPUBased=0xb5a26dfa SecondaryExec=0x061037eb TertiaryExec=0x0000000000000010 PinBased=0x000000ff EntryControls=0000d1ff ExitControls=002befff Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153441.11687-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e1298128d6c3..63a04de140d6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6028,6 +6028,7 @@ void dump_vmcs(void) { u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; if (!dump_invalid_vmcs) { @@ -6040,9 +6041,16 @@ void dump_vmcs(void) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", @@ -6125,9 +6133,10 @@ void dump_vmcs(void) vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), -- Gitee From 9d63e2af3b168eb4f68407c90bbc40e98776f59c Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:35:16 +0800 Subject: [PATCH 05/10] KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode mainline inclusion from mainline-v6.0-rc1 commit 5413bcba7ed57206178d60ee03dd5bb3a460e645 category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5413bcba7ed57206178d60ee03dd5bb3a460e645 Intel-SIG: commit 5413bcba7ed5 ("KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode") ------------------------------------- KVM: x86: Add support for vICR APIC-write VM-Exits in x2APIC mode Upcoming Intel CPUs will support virtual x2APIC MSR writes to the vICR, i.e. will trap and generate an APIC-write VM-Exit instead of intercepting the WRMSR. Add support for handling "nodecode" x2APIC writes, which were previously impossible. Note, x2APIC MSR writes are 64 bits wide. Signed-off-by: Zeng Guang Message-Id: <20220419153516.11739-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/x86/kvm/lapic.c | 13 ++++++++++--- arch/x86/kvm/lapic.h | 5 +++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4ac75a0ec62b..ef0ccca76324 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -67,6 +67,7 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static inline int apic_test_vector(int vec, void *bitmap) { @@ -2170,15 +2171,21 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = 0; + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val = 0; /* hw has done the conditional check and inst decode */ offset &= 0xff0; - kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val); + /* exception dealing with 64bit data on vICR in x2apic mode */ + if ((offset == APIC_ICR) && apic_x2apic_mode(apic)) { + val = kvm_lapic_get_reg64(apic, offset); + kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(val>>32)); + } else + kvm_lapic_reg_read(apic, offset, 4, &val); /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + kvm_lapic_reg_write(apic, offset, (u32)val); } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4fb86e3a9dd3..49e85dea5e3f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -162,6 +162,11 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) return *((u32 *) (apic->regs + reg_off)); } +static inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg_off) +{ + return *((u64 *) (apic->regs + reg_off)); +} + static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { *((u32 *) (regs + reg_off)) = val; -- Gitee From 428afadf2b5627fb2dbc4e1ec60628156b663f83 Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:36:04 +0800 Subject: [PATCH 06/10] KVM: VMX: Clean up vmx_refresh_apicv_exec_ctrl() mainline inclusion from mainline-v6.0-rc1 commit f08a06c9a35706349f74b7a18deefe3f89f73e8e category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f08a06c9a35706349f74b7a18deefe3f89f73e8e Intel-SIG: commit f08a06c9a357 ("KVM: VMX: Clean up vmx_refresh_apicv_exec_ctrl()") ------------------------------------- KVM: VMX: Clean up vmx_refresh_apicv_exec_ctrl() Remove the condition check cpu_has_secondary_exec_ctrls(). Calling vmx_refresh_apicv_exec_ctrl() premises secondary controls activated and VMCS fields related to APICv valid as well. If it's invoked in wrong circumstance at the worst case, VMX operation will report VMfailValid error without further harmful impact and just functions as if all the secondary controls were 0. Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419153604.11786-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/x86/kvm/vmx/vmx.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 63a04de140d6..605267375786 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4308,16 +4308,15 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - } + + if (kvm_vcpu_apicv_active(vcpu)) + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + else + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(vcpu); -- Gitee From 36c9eea0fc0c2f028b924564f9675a424fab83e6 Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:44:09 +0800 Subject: [PATCH 07/10] KVM: Move kvm_arch_vcpu_precreate() under kvm->lock mainline inclusion from mainline-v6.0-rc1 commit 1d5e740d518e02cea46325b3d37135bf9c08982a category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1d5e740d518e02cea46325b3d37135bf9c08982a Intel-SIG: commit 1d5e740d518e ("KVM: Move kvm_arch_vcpu_precreate() under kvm->lock") ------------------------------------- KVM: Move kvm_arch_vcpu_precreate() under kvm->lock kvm_arch_vcpu_precreate() targets to handle arch specific VM resource to be prepared prior to the actual creation of vCPU. For example, x86 platform may need do per-VM allocation based on max_vcpu_ids at the first vCPU creation. It probably leads to concurrency control on this allocation as multiple vCPU creation could happen simultaneously. From the architectual point of view, it's necessary to execute kvm_arch_vcpu_precreate() under protect of kvm->lock. Currently only arm64, x86 and s390 have non-nop implementations at the stage of vCPU pre-creation. Remove the lock acquiring in s390's design and make sure all architecture can run kvm_arch_vcpu_precreate() safely under kvm->lock without recrusive lock issue. Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419154409.11842-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/s390/kvm/kvm-s390.c | 2 -- arch/x86/kvm/x86.c | 2 +- virt/kvm/kvm_main.c | 10 ++++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d8e9239c24ff..8b35774d3319 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2944,9 +2944,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (!sclp.has_esca || !sclp.has_64bscao) return false; - mutex_lock(&kvm->lock); rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); - mutex_unlock(&kvm->lock); return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 033fcb0e11ec..c0d397fcf049 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9925,7 +9925,7 @@ static void fx_init(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a5c107bbf022..93cd86267eba 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3180,13 +3180,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) return -EINVAL; } + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) { + mutex_unlock(&kvm->lock); + return r; + } + kvm->created_vcpus++; mutex_unlock(&kvm->lock); - r = kvm_arch_vcpu_precreate(kvm, id); - if (r) - goto vcpu_decrement; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { r = -ENOMEM; -- Gitee From c731a5704a1f6ad2a82e3e9b692492d30fc2007e Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:44:44 +0800 Subject: [PATCH 08/10] KVM: x86: Allow userspace to set maximum VCPU id for VM mainline inclusion from mainline-v6.0-rc1 commit 35875316384b71d23dc2a45a969732fc8cab16af category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=35875316384b71d23dc2a45a969732fc8cab16af Intel-SIG: commit 35875316384b ("KVM: x86: Allow userspace to set maximum VCPU id for VM") ------------------------------------- KVM: x86: Allow userspace to set maximum VCPU id for VM Introduce new max_vcpu_ids in KVM for x86 architecture. Userspace can assign maximum possible vcpu id for current VM session using KVM_CAP_MAX_VCPU_ID of KVM_ENABLE_CAP ioctl(). This is done for x86 only because the sole use case is to guide memory allocation for PID-pointer table, a structure needed to enable VMX IPI. By default, max_vcpu_ids set as KVM_MAX_VCPU_ID. Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Zeng Guang Message-Id: <20220419154444.11888-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng Signed-off-by: Jason Zeng --- Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d542bd99a9a4..19438a5c942b 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6220,6 +6220,27 @@ default. See Documentation/x86/sgx.rst for more details. +7.23 KVM_CAP_MAX_VCPU_ID +------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] - maximum APIC ID value set for current VM +:Returns: 0 on success, -EINVAL if args[0] is beyond KVM_MAX_VCPU_ID + supported in KVM or if it has been set. + +This capability allows userspace to specify maximum possible APIC ID +assigned for current VM session prior to the creation of vCPUs, saving +memory for data structures indexed by the APIC ID. Userspace is able +to calculate the limit to APIC ID values from designated +CPU topology. + +The value can be changed only until KVM_ENABLE_CAP is set to a nonzero +value or until a vCPU is created. Upon creation of the first vCPU, +if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM +uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as +the maximum APIC ID. + 8. Other capabilities. ====================== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ecc24e335ff..1b08ba66c088 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1017,6 +1017,12 @@ struct kvm_arch { struct list_head tdp_mmu_roots; /* List of struct tdp_mmu_pages not being used as roots */ struct list_head tdp_mmu_pages; + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_ID may lead to significant memory waste. + */ + u32 max_vcpu_ids; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0d397fcf049..bee4c895276e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5304,6 +5304,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; } #endif + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_ID) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -9929,6 +9943,12 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_ID; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + return 0; } -- Gitee From 30535bca7ccc517258a5d5042a3d1e5f633e4459 Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Fri, 22 Apr 2022 21:44:56 +0800 Subject: [PATCH 09/10] kvm: selftests: Add KVM_CAP_MAX_VCPU_ID cap test mainline inclusion from mainline-v6.0-rc1 commit 753dcf7a8686a750fa6aa4b4ca42c6945fc75ac1 category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=753dcf7a8686a750fa6aa4b4ca42c6945fc75ac1 Intel-SIG: commit 753dcf7a8686 ("kvm: selftests: Add KVM_CAP_MAX_VCPU_ID cap test") ------------------------------------- kvm: selftests: Add KVM_CAP_MAX_VCPU_ID cap test Basic test coverage of KVM_CAP_MAX_VCPU_ID cap. This capability can be enabled before vCPU creation and only allowed to set once. if assigned vcpu id is beyond KVM_CAP_MAX_VCPU_ID capability, vCPU creation will fail. Signed-off-by: Zeng Guang Message-Id: <20220422134456.26655-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../testing/selftests/kvm/include/kvm_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 27 ++++++++++ .../kvm/x86_64/max_vcpuid_cap_test.c | 54 +++++++++++++++++++ 5 files changed, 84 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 7a2c242b7152..d4768897eda6 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -9,6 +9,7 @@ /x86_64/evmcs_test /x86_64/kvm_pv_test /x86_64/hyperv_cpuid +/x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/platform_info_test /x86_64/set_sregs_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 3d14ef77755e..25618728dc4a 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -59,6 +59,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test +TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 7d29aa786959..85b621383fa1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -62,6 +62,7 @@ enum vm_mem_backing_src_type { }; int kvm_check_cap(long cap); +int vm_check_cap(struct kvm_vm *vm, long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_enable_cap *cap); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 49805fd16fdf..d9de68b44783 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -63,6 +63,33 @@ int kvm_check_cap(long cap) return ret; } +/* VM Check Capability + * + * Input Args: + * vm - Virtual Machine + * cap - Capability + * + * Output Args: None + * + * Return: + * On success, the Value corresponding to the capability (KVM_CAP_*) + * specified by the value of cap. On failure a TEST_ASSERT failure + * is produced. + * + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. + */ +int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret; + + ret = ioctl(vm->fd, KVM_CHECK_EXTENSION, cap); + TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION VM IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + return ret; +} + /* VM Enable Capability * * Input Args: diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c new file mode 100644 index 000000000000..3f6c1ad86cc6 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * maximum APIC ID capability tests + * + * Copyright (C) 2022, Intel, Inc. + * + * Tests for getting/setting maximum APIC ID capability + */ + +#include "kvm_util.h" +#include "../lib/kvm_util_internal.h" + +#define MAX_VCPU_ID 2 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_enable_cap cap = { 0 }; + int ret; + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + + /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ + ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); + + /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = ret + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID" + "beyond KVM cap!\n"); + + /* Set KVM_CAP_MAX_VCPU_ID */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = MAX_VCPU_ID; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret == 0, + "Unexpected failure to enable KVM_CAP_MAX_VCPU_ID!\n"); + + /* Try to set KVM_CAP_MAX_VCPU_ID again */ + cap.args[0] = MAX_VCPU_ID + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID again\n"); + + /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/ + ret = ioctl(vm->fd, KVM_CREATE_VCPU, MAX_VCPU_ID); + TEST_ASSERT(ret < 0, + "Unexpected success in creating a vCPU with VCPU ID out of range\n"); + + kvm_vm_free(vm); + return 0; +} -- Gitee From ba83444cc8aa7750028339e662e77564dee29499 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Tue, 19 Apr 2022 23:45:10 +0800 Subject: [PATCH 10/10] KVM: VMX: enable IPI virtualization mainline inclusion from mainline-v6.0-rc1 commit d588bb9be1da6aa750aa64875fe57369db983d8b category: feature feature: IPI Virtualization bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5ODSC CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d588bb9be1da6aa750aa64875fe57369db983d8b Intel-SIG: commit d588bb9be1da ("KVM: VMX: enable IPI virtualization") ------------------------------------- KVM: VMX: enable IPI virtualization With IPI virtualization enabled, the processor emulates writes to APIC registers that would send IPIs. The processor sets the bit corresponding to the vector in target vCPU's PIR and may send a notification (IPI) specified by NDST and NV fields in target vCPU's Posted-Interrupt Descriptor (PID). It is similar to what IOMMU engine does when dealing with posted interrupt from devices. A PID-pointer table is used by the processor to locate the PID of a vCPU with the vCPU's APIC ID. The table size depends on maximum APIC ID assigned for current VM session from userspace. Allocating memory for PID-pointer table is deferred to vCPU creation, because irqchip mode and VM-scope maximum APIC ID is settled at that point. KVM can skip PID-pointer table allocation if !irqchip_in_kernel(). Like VT-d PI, if a vCPU goes to blocked state, VMM needs to switch its notification vector to wakeup vector. This can ensure that when an IPI for blocked vCPUs arrives, VMM can get control and wake up blocked vCPUs. And if a VCPU is preempted, its posted interrupt notification is suppressed. Note that IPI virtualization can only virualize physical-addressing, flat mode, unicast IPIs. Sending other IPIs would still cause a trap-like APIC-write VM-exit and need to be handled by VMM. Signed-off-by: Chao Gao Signed-off-by: Zeng Guang Message-Id: <20220419154510.11938-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/vmx.h | 8 +++ arch/x86/include/asm/vmxfeatures.h | 2 + arch/x86/kvm/vmx/capabilities.h | 6 +++ arch/x86/kvm/vmx/posted_intr.c | 4 +- arch/x86/kvm/vmx/posted_intr.h | 2 + arch/x86/kvm/vmx/vmx.c | 82 ++++++++++++++++++++++++++++-- arch/x86/kvm/vmx/vmx.h | 7 +++ arch/x86/kvm/x86.c | 2 +- 9 files changed, 106 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1b08ba66c088..b8ce51554cc2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1122,6 +1122,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 4c9c4df1c74d..aaf3812063bb 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -75,6 +75,11 @@ #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. + */ +#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT) + #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) #define PIN_BASED_VIRTUAL_NMIS VMCS_CONTROL_BIT(VIRTUAL_NMIS) @@ -157,6 +162,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -222,6 +228,8 @@ enum vmcs_field { TSC_MULTIPLIER_HIGH = 0x00002033, TERTIARY_VM_EXEC_CONTROL = 0x00002034, TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 0d43a41310af..af446ba041db 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -85,4 +85,6 @@ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ +/* Tertiary Processor-Based VM-Execution Controls, word 3 */ +#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index e52e355b99ab..1fc2d1415889 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -13,6 +13,7 @@ extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; extern bool __read_mostly enable_apicv; +extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 @@ -279,6 +280,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5f8acd2faa7c..9689541fa027 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -89,7 +89,7 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!(vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm))) return; /* Set SN when the vCPU is preempted */ @@ -147,7 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!(vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm))) return 0; WARN_ON(irqs_disabled()); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 0bdc41391c5b..39d92d52caa9 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,6 +5,8 @@ #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 605267375786..1cce20417d96 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -104,6 +104,9 @@ module_param(fasteoi, bool, S_IRUGO); bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -2664,7 +2667,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, } if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { - u64 opt3 = 0; + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, MSR_IA32_VMX_PROCBASED_CTLS3); @@ -4042,6 +4045,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -4309,14 +4314,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (kvm_vcpu_apicv_active(vcpu)) + if (kvm_vcpu_apicv_active(vcpu)) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(vcpu); @@ -4350,7 +4360,16 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) { - return vmcs_config.cpu_based_3rd_exec_ctrl; + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; } /* @@ -4493,6 +4512,35 @@ static void ept_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 /* @@ -4501,6 +4549,9 @@ static void ept_set_mmio_spte_mask(void) */ static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4534,7 +4585,12 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; @@ -7226,6 +7282,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->ept_pointer = INVALID_PAGE; + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -7889,6 +7949,13 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_unsetup = hardware_unsetup, @@ -7899,7 +7966,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, @@ -8101,6 +8170,9 @@ static __init int hardware_setup(void) vmx_x86_ops.sync_pir_to_irr = NULL; } + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f3ac7929c99a..31c7b8f489f6 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -357,6 +357,8 @@ struct kvm_vmx { enum ept_pointers_status ept_pointers_match; spinlock_t ept_pointer_lock; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -553,4 +555,9 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(void); +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bee4c895276e..ccc9d019637a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9949,7 +9949,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return 0; + return kvm_x86_ops.vcpu_precreate ? kvm_x86_ops.vcpu_precreate(kvm) : 0; } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) -- Gitee