From 0e48bc71ce1325b9b379a151775d5c7355e84066 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Jun 2025 15:45:37 +0800 Subject: [PATCH 01/84] arm64: mm: Add top-level dispatcher for internal mem_encrypt API mainline inclusion from mainline-v6.12-rc1 commit e7bafbf7177750e6643941473b343ed72fc5a100 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7bafbf7177750e6643941473b343ed72fc5a100 -------------------------------- Implementing the internal mem_encrypt API for arm64 depends entirely on the Confidential Computing environment in which the kernel is running. Introduce a simple dispatcher so that backend hooks can be registered depending upon the environment in which the kernel finds itself. Reviewed-by: Catalin Marinas Reviewed-by: Steven Price Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-4-will@kernel.org Signed-off-by: Will Deacon Conflicts: arch/arm64/Kconfig arch/arm64/include/asm/set_memory.h [Only context conflicts] Signed-off-by: Cai Xinchen Signed-off-by: Xu Raoqing --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/mem_encrypt.h | 15 +++++++++ arch/arm64/include/asm/set_memory.h | 1 + arch/arm64/mm/Makefile | 2 +- arch/arm64/mm/mem_encrypt.c | 50 ++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/mem_encrypt.h create mode 100644 arch/arm64/mm/mem_encrypt.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aff99b41981d..089fc18ea851 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -32,6 +32,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..b0c9a86b13a4 --- /dev/null +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_MEM_ENCRYPT_H +#define __ASM_MEM_ENCRYPT_H + +struct arm64_mem_crypt_ops { + int (*encrypt)(unsigned long addr, int numpages); + int (*decrypt)(unsigned long addr, int numpages); +}; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 0f740b781187..917761feeffd 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -3,6 +3,7 @@ #ifndef _ASM_ARM64_SET_MEMORY_H #define _ASM_ARM64_SET_MEMORY_H +#include #include bool can_set_direct_map(void); diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d..b35415be1382 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ - ioremap.o mmap.o pgd.o mmu.o \ + ioremap.o mmap.o pgd.o mem_encrypt.o mmu.o \ context.o proc.o pageattr.o fixmap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c new file mode 100644 index 
000000000000..ee3c0ab04384 --- /dev/null +++ b/arch/arm64/mm/mem_encrypt.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Implementation of the memory encryption/decryption API. + * + * Since the low-level details of the operation depend on the + * Confidential Computing environment (e.g. pKVM, CCA, ...), this just + * acts as a top-level dispatcher to whatever hooks may have been + * registered. + * + * Author: Will Deacon + * Copyright (C) 2024 Google LLC + * + * "Hello, boils and ghouls!" + */ + +#include +#include +#include +#include + +#include + +static const struct arm64_mem_crypt_ops *crypt_ops; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops) +{ + if (WARN_ON(crypt_ops)) + return -EBUSY; + + crypt_ops = ops; + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->encrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->decrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); -- Gitee From f5ca7195fe4445205dfe3f9b8eed12209f877b26 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Jun 2025 15:45:38 +0800 Subject: [PATCH 02/84] arm64: mm: Add confidential computing hook to ioremap_prot() mainline inclusion from mainline-v6.12-rc1 commit c86fa3470c1026e9f63a93e8885ea51ef99fae35 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c86fa3470c1026e9f63a93e8885ea51ef99fae35 -------------------------------- Confidential Computing environments such as pKVM and Arm's CCA distinguish between shared (i.e. emulated) and private (i.e. assigned) MMIO regions. Introduce a hook into our implementation of ioremap_prot() so that MMIO regions can be shared if necessary. Reviewed-by: Catalin Marinas Reviewed-by: Steven Price Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-6-will@kernel.org Signed-off-by: Will Deacon Signed-off-by: Cai Xinchen Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/io.h | 4 ++++ arch/arm64/mm/ioremap.c | 23 ++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 3b694511b98f..9f92f92ad62d 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -139,6 +139,10 @@ extern void __memset_io(volatile void __iomem *, int, size_t); * I/O memory mapping functions. 
*/ +typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, + pgprot_t *prot); +int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); + #define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 3f34dc886f81..47859e84755b 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -4,10 +4,22 @@ #include #include +static ioremap_prot_hook_t ioremap_prot_hook; + +int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) +{ + if (WARN_ON(ioremap_prot_hook)) + return -EBUSY; + + ioremap_prot_hook = hook; + return 0; +} + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; + pgprot_t pgprot = __pgprot(prot); /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) @@ -17,7 +29,16 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) return NULL; - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + /* + * If a hook is registered (e.g. for confidential computing + * purposes), call that now and barf if it fails. + */ + if (unlikely(ioremap_prot_hook) && + WARN_ON(ioremap_prot_hook(phys_addr, size, &pgprot))) { + return NULL; + } + + return generic_ioremap_prot(phys_addr, size, pgprot); } EXPORT_SYMBOL(ioremap_prot); -- Gitee From fb5f86dfa91bdbdc17027f660494fc82829ef47a Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:39 +0800 Subject: [PATCH 03/84] arm64: rsi: Add RSI definitions mainline inclusion from mainline-v6.13-rc1 commit b880a80011f56880f32bde47fc6af313359f926b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b880a80011f56880f32bde47fc6af313359f926b -------------------------------- The RMM (Realm Management Monitor) provides functionality that can be accessed by a realm guest through SMC (Realm Services Interface) calls. The SMC definitions are based on DEN0137[1] version 1.0-rel0. [1] https://developer.arm.com/documentation/den0137/1-0rel0/ Acked-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-2-steven.price@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- arch/arm64/include/asm/rsi_cmds.h | 139 +++++++++++++++++++++ arch/arm64/include/asm/rsi_smc.h | 193 ++++++++++++++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 arch/arm64/include/asm/rsi_cmds.h create mode 100644 arch/arm64/include/asm/rsi_smc.h diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h new file mode 100644 index 000000000000..2fcf351b5634 --- /dev/null +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. 
+ */ + +#ifndef __ASM_RSI_CMDS_H +#define __ASM_RSI_CMDS_H + +#include + +#include + +#define RSI_GRANULE_SHIFT 12 +#define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) + +enum ripas { + RSI_RIPAS_EMPTY = 0, + RSI_RIPAS_RAM = 1, + RSI_RIPAS_DESTROYED = 2, + RSI_RIPAS_DEV = 3, +}; + +static inline unsigned long rsi_request_version(unsigned long req, + unsigned long *out_lower, + unsigned long *out_higher) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_ABI_VERSION, req, 0, 0, 0, 0, 0, 0, &res); + + if (out_lower) + *out_lower = res.a1; + if (out_higher) + *out_higher = res.a2; + + return res.a0; +} + +static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_REALM_CONFIG, virt_to_phys(cfg), + 0, 0, 0, 0, 0, 0, &res); + return res.a0; +} + +static inline long rsi_set_addr_range_state(phys_addr_t start, + phys_addr_t end, + enum ripas state, + unsigned long flags, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_SET, start, end, state, + flags, 0, 0, 0, &res); + + if (top) + *top = res.a1; + + if (res.a2 != RSI_ACCEPT) + return -EPERM; + + return res.a0; +} + +/** + * rsi_attestation_token_init - Initialise the operation to retrieve an + * attestation token. + * + * @challenge: The challenge data to be used in the attestation token + * generation. + * @size: Size of the challenge data in bytes. + * + * Initialises the attestation token generation and returns an upper bound + * on the attestation token size that can be used to allocate an adequate + * buffer. The caller is expected to subsequently call + * rsi_attestation_token_continue() to retrieve the attestation token data on + * the same CPU. + * + * Returns: + * On success, returns the upper limit of the attestation report size. + * Otherwise, -EINVAL + */ +static inline long +rsi_attestation_token_init(const u8 *challenge, unsigned long size) +{ + struct arm_smccc_1_2_regs regs = { 0 }; + + /* The challenge must be at least 32bytes and at most 64bytes */ + if (!challenge || size < 32 || size > 64) + return -EINVAL; + + regs.a0 = SMC_RSI_ATTESTATION_TOKEN_INIT; + memcpy(®s.a1, challenge, size); + arm_smccc_1_2_smc(®s, ®s); + + if (regs.a0 == RSI_SUCCESS) + return regs.a1; + + return -EINVAL; +} + +/** + * rsi_attestation_token_continue - Continue the operation to retrieve an + * attestation token. + * + * @granule: {I}PA of the Granule to which the token will be written. + * @offset: Offset within Granule to start of buffer in bytes. + * @size: The size of the buffer. + * @len: The number of bytes written to the buffer. + * + * Retrieves up to a RSI_GRANULE_SIZE worth of token data per call. The caller + * is expected to call rsi_attestation_token_init() before calling this + * function to retrieve the attestation token. + * + * Return: + * * %RSI_SUCCESS - Attestation token retrieved successfully. + * * %RSI_INCOMPLETE - Token generation is not complete. + * * %RSI_ERROR_INPUT - A parameter was not valid. + * * %RSI_ERROR_STATE - Attestation not in progress. 
+ */ +static inline unsigned long rsi_attestation_token_continue(phys_addr_t granule, + unsigned long offset, + unsigned long size, + unsigned long *len) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RSI_ATTESTATION_TOKEN_CONTINUE, + granule, offset, size, 0, &res); + + if (len) + *len = res.a1; + return res.a0; +} + +#endif /* __ASM_RSI_CMDS_H */ diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h new file mode 100644 index 000000000000..6cb070eca9e9 --- /dev/null +++ b/arch/arm64/include/asm/rsi_smc.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_RSI_SMC_H_ +#define __ASM_RSI_SMC_H_ + +#include + +/* + * This file describes the Realm Services Interface (RSI) Application Binary + * Interface (ABI) for SMC calls made from within the Realm to the RMM and + * serviced by the RMM. + */ + +/* + * The major version number of the RSI implementation. This is increased when + * the binary format or semantics of the SMC calls change. + */ +#define RSI_ABI_VERSION_MAJOR UL(1) + +/* + * The minor version number of the RSI implementation. This is increased when + * a bug is fixed, or a feature is added without breaking binary compatibility. + */ +#define RSI_ABI_VERSION_MINOR UL(0) + +#define RSI_ABI_VERSION ((RSI_ABI_VERSION_MAJOR << 16) | \ + RSI_ABI_VERSION_MINOR) + +#define RSI_ABI_VERSION_GET_MAJOR(_version) ((_version) >> 16) +#define RSI_ABI_VERSION_GET_MINOR(_version) ((_version) & 0xFFFF) + +#define RSI_SUCCESS UL(0) +#define RSI_ERROR_INPUT UL(1) +#define RSI_ERROR_STATE UL(2) +#define RSI_INCOMPLETE UL(3) +#define RSI_ERROR_UNKNOWN UL(4) + +#define SMC_RSI_FID(n) ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + n) + +/* + * Returns RSI version. + * + * arg1 == Requested interface revision + * ret0 == Status / error + * ret1 == Lower implemented interface revision + * ret2 == Higher implemented interface revision + */ +#define SMC_RSI_ABI_VERSION SMC_RSI_FID(0x190) + +/* + * Read feature register. + * + * arg1 == Feature register index + * ret0 == Status / error + * ret1 == Feature register value + */ +#define SMC_RSI_FEATURES SMC_RSI_FID(0x191) + +/* + * Read measurement for the current Realm. + * + * arg1 == Index, which measurements slot to read + * ret0 == Status / error + * ret1 == Measurement value, bytes: 0 - 7 + * ret2 == Measurement value, bytes: 8 - 15 + * ret3 == Measurement value, bytes: 16 - 23 + * ret4 == Measurement value, bytes: 24 - 31 + * ret5 == Measurement value, bytes: 32 - 39 + * ret6 == Measurement value, bytes: 40 - 47 + * ret7 == Measurement value, bytes: 48 - 55 + * ret8 == Measurement value, bytes: 56 - 63 + */ +#define SMC_RSI_MEASUREMENT_READ SMC_RSI_FID(0x192) + +/* + * Extend Realm Extensible Measurement (REM) value. + * + * arg1 == Index, which measurements slot to extend + * arg2 == Size of realm measurement in bytes, max 64 bytes + * arg3 == Measurement value, bytes: 0 - 7 + * arg4 == Measurement value, bytes: 8 - 15 + * arg5 == Measurement value, bytes: 16 - 23 + * arg6 == Measurement value, bytes: 24 - 31 + * arg7 == Measurement value, bytes: 32 - 39 + * arg8 == Measurement value, bytes: 40 - 47 + * arg9 == Measurement value, bytes: 48 - 55 + * arg10 == Measurement value, bytes: 56 - 63 + * ret0 == Status / error + */ +#define SMC_RSI_MEASUREMENT_EXTEND SMC_RSI_FID(0x193) + +/* + * Initialize the operation to retrieve an attestation token. 
+ * + * arg1 == Challenge value, bytes: 0 - 7 + * arg2 == Challenge value, bytes: 8 - 15 + * arg3 == Challenge value, bytes: 16 - 23 + * arg4 == Challenge value, bytes: 24 - 31 + * arg5 == Challenge value, bytes: 32 - 39 + * arg6 == Challenge value, bytes: 40 - 47 + * arg7 == Challenge value, bytes: 48 - 55 + * arg8 == Challenge value, bytes: 56 - 63 + * ret0 == Status / error + * ret1 == Upper bound of token size in bytes + */ +#define SMC_RSI_ATTESTATION_TOKEN_INIT SMC_RSI_FID(0x194) + +/* + * Continue the operation to retrieve an attestation token. + * + * arg1 == The IPA of token buffer + * arg2 == Offset within the granule of the token buffer + * arg3 == Size of the granule buffer + * ret0 == Status / error + * ret1 == Length of token bytes copied to the granule buffer + */ +#define SMC_RSI_ATTESTATION_TOKEN_CONTINUE SMC_RSI_FID(0x195) + +#ifndef __ASSEMBLY__ + +struct realm_config { + union { + struct { + unsigned long ipa_bits; /* Width of IPA in bits */ + unsigned long hash_algo; /* Hash algorithm */ + }; + u8 pad[0x200]; + }; + union { + u8 rpv[64]; /* Realm Personalization Value */ + u8 pad2[0xe00]; + }; + /* + * The RMM requires the configuration structure to be aligned to a 4k + * boundary, ensure this happens by aligning this structure. + */ +} __aligned(0x1000); + +#endif /* __ASSEMBLY__ */ + +/* + * Read configuration for the current Realm. + * + * arg1 == struct realm_config addr + * ret0 == Status / error + */ +#define SMC_RSI_REALM_CONFIG SMC_RSI_FID(0x196) + +/* + * Request RIPAS of a target IPA range to be changed to a specified value. + * + * arg1 == Base IPA address of target region + * arg2 == Top of the region + * arg3 == RIPAS value + * arg4 == flags + * ret0 == Status / error + * ret1 == Top of modified IPA range + * ret2 == Whether the Host accepted or rejected the request + */ +#define SMC_RSI_IPA_STATE_SET SMC_RSI_FID(0x197) + +#define RSI_NO_CHANGE_DESTROYED UL(0) +#define RSI_CHANGE_DESTROYED UL(1) + +#define RSI_ACCEPT UL(0) +#define RSI_REJECT UL(1) + +/* + * Get RIPAS of a target IPA range. + * + * arg1 == Base IPA of target region + * arg2 == End of target IPA region + * ret0 == Status / error + * ret1 == Top of IPA region which has the reported RIPAS value + * ret2 == RIPAS value + */ +#define SMC_RSI_IPA_STATE_GET SMC_RSI_FID(0x198) + +/* + * Make a Host call. + * + * arg1 == IPA of host call structure + * ret0 == Status / error + */ +#define SMC_RSI_HOST_CALL SMC_RSI_FID(0x199) + +#endif /* __ASM_RSI_SMC_H_ */ -- Gitee From 04abb97fea751e059ed1d602a5e094c9fbb95e12 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:40 +0800 Subject: [PATCH 04/84] arm64: Detect if in a realm and set RIPAS RAM mainline inclusion from mainline-v6.13-rc1 commit c077711f718be7cebcc8b987eac2ebfd17447e9f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c077711f718be7cebcc8b987eac2ebfd17447e9f -------------------------------- Detect that the VM is a realm guest by the presence of the RSI interface. This is done after PSCI has been initialised so that we can check the SMCCC conduit before making any RSI calls. If in a realm then iterate over all memory ensuring that it is marked as RIPAS RAM. The loader is required to do this for us, however if some memory is missed this will cause the guest to receive a hard to debug external abort at some random point in the future. So for a belt-and-braces approach set all memory to RIPAS RAM. 
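At the RSI level that conversion is a loop over SMC_RSI_IPA_STATE_SET calls: the RMM may complete only part of a request, reporting its progress in "top", and the host may reject the change outright. A minimal sketch of such a loop, using the rsi_cmds.h wrappers from the previous patch (the helper name and error policy here are illustrative assumptions; the kernel's real version is rsi_set_memory_range() in the diff below):

	/* Illustrative sketch only, not part of the patch. */
	static int to_ripas_ram(phys_addr_t start, phys_addr_t end)
	{
		phys_addr_t top;

		while (start != end) {
			/* returns non-zero on error or host rejection */
			if (rsi_set_addr_range_state(start, end, RSI_RIPAS_RAM,
						     RSI_NO_CHANGE_DESTROYED, &top))
				return -EINVAL;
			if (top <= start || top > end)
				return -EINVAL;	/* no forward progress */
			start = top;		/* resume where the RMM stopped */
		}
		return 0;
	}
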
Any failure here implies that the RAM regions passed to Linux are incorrect so panic() promptly to make the situation clear. Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-3-steven.price@arm.com Signed-off-by: Catalin Marinas Conflicts: arch/arm64/kernel/Makefile arch/arm64/kernel/setup.c [Only context conflicts] Signed-off-by: Cai Xinchen Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/rsi.h | 66 +++++++++++++++++++++++++++++++ arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/rsi.c | 76 ++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 3 ++ 4 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/rsi.h create mode 100644 arch/arm64/kernel/rsi.c diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h new file mode 100644 index 000000000000..acba065eb00e --- /dev/null +++ b/arch/arm64/include/asm/rsi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 ARM Ltd. + */ + +#ifndef __ASM_RSI_H_ +#define __ASM_RSI_H_ + +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(rsi_present); + +void __init arm64_rsi_init(void); + +static inline bool is_realm_world(void) +{ + return static_branch_unlikely(&rsi_present); +} + +static inline int rsi_set_memory_range(phys_addr_t start, phys_addr_t end, + enum ripas state, unsigned long flags) +{ + unsigned long ret; + phys_addr_t top; + + while (start != end) { + ret = rsi_set_addr_range_state(start, end, state, flags, &top); + if (ret || top < start || top > end) + return -EINVAL; + start = top; + } + + return 0; +} + +/* + * Convert the specified range to RAM. Do not use this if you rely on the + * contents of a page that may already be in RAM state. + */ +static inline int rsi_set_memory_range_protected(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_CHANGE_DESTROYED); +} + +/* + * Convert the specified range to RAM. Do not convert any pages that may have + * been DESTROYED, without our permission. + */ +static inline int rsi_set_memory_range_protected_safe(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_NO_CHANGE_DESTROYED); +} + +static inline int rsi_set_memory_range_shared(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_EMPTY, + RSI_CHANGE_DESTROYED); +} +#endif /* __ASM_RSI_H_ */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index d95b3d6b471a..c6a5be77d77d 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -34,7 +34,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ syscall.o proton-pack.o idreg-override.o idle.o \ - patching.o + patching.o rsi.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..c5758317dfed --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. 
+ */
+
+#include <linux/jump_label.h>
+#include <linux/memblock.h>
+#include <linux/psci.h>
+#include <asm/rsi.h>
+
+DEFINE_STATIC_KEY_FALSE_RO(rsi_present);
+EXPORT_SYMBOL(rsi_present);
+
+static bool rsi_version_matches(void)
+{
+	unsigned long ver_lower, ver_higher;
+	unsigned long ret = rsi_request_version(RSI_ABI_VERSION,
+						&ver_lower,
+						&ver_higher);
+
+	if (ret == SMCCC_RET_NOT_SUPPORTED)
+		return false;
+
+	if (ret != RSI_SUCCESS) {
+		pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n",
+		       RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR,
+		       RSI_ABI_VERSION_GET_MAJOR(ver_lower),
+		       RSI_ABI_VERSION_GET_MINOR(ver_lower),
+		       RSI_ABI_VERSION_GET_MAJOR(ver_higher),
+		       RSI_ABI_VERSION_GET_MINOR(ver_higher));
+		return false;
+	}
+
+	pr_info("RME: Using RSI version %lu.%lu\n",
+		RSI_ABI_VERSION_GET_MAJOR(ver_lower),
+		RSI_ABI_VERSION_GET_MINOR(ver_lower));
+
+	return true;
+}
+
+static void __init arm64_rsi_setup_memory(void)
+{
+	u64 i;
+	phys_addr_t start, end;
+
+	/*
+	 * Iterate over the available memory ranges and convert the state to
+	 * protected memory. We should take extra care to ensure that we DO NOT
+	 * permit any "DESTROYED" pages to be converted to "RAM".
+	 *
+	 * panic() is used because if the attempt to switch the memory to
+	 * protected has failed here, then future accesses to the memory are
+	 * simply going to be reflected as a SEA (Synchronous External Abort)
+	 * which we can't handle. Bailing out early prevents the guest limping
+	 * on and dying later.
+	 */
+	for_each_mem_range(i, &start, &end) {
+		if (rsi_set_memory_range_protected_safe(start, end)) {
+			panic("Failed to set memory range to protected: %pa-%pa",
+			      &start, &end);
+		}
+	}
+}
+
+void __init arm64_rsi_init(void)
+{
+	if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC)
+		return;
+	if (!rsi_version_matches())
+		return;
+
+	arm64_rsi_setup_memory();
+
+	static_branch_enable(&rsi_present);
+}
+
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 040b0175334c..a5711b5bd77f 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -43,6 +43,7 @@
 #include
 #include
 #include
+#include <asm/rsi.h>
 #include
 #include
 #include
@@ -371,6 +372,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
 	else
 		psci_acpi_init();
 
+	arm64_rsi_init();
+
 	init_bootcpu_ops();
 	smp_init_cpus();
 	smp_build_mpidr_hash();
--
Gitee

From 0ea3129ed87ec36615bcb4cc3478b90e7e02e895 Mon Sep 17 00:00:00 2001
From: Steven Price
Date: Tue, 24 Jun 2025 15:45:41 +0800
Subject: [PATCH 05/84] arm64: realm: Query IPA size from the RMM

mainline inclusion
from mainline-v6.13-rc1
commit 399306954996be58ac20b4b29f6334e3d55a2ce7
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=399306954996be58ac20b4b29f6334e3d55a2ce7

--------------------------------

The top bit of the configured IPA size is used as an attribute to
control whether the address is protected or shared. Query the
configuration from the RMM to ascertain which bit this is.
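As a worked example (the width here is an assumption, not a requirement): if the RMM reports config.ipa_bits == 48, the attribute is IPA bit 47, which the patch below caches as prot_ns_shared:

	unsigned long prot_ns_shared = BIT(48 - 1);	/* 0x0000800000000000 */

	phys_addr_t prot_ipa   = 0x80000000UL;		     /* protected IPA  */
	phys_addr_t shared_ipa = prot_ipa | prot_ns_shared; /* its shared alias,
							      * 0x0000800080000000 */
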
Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-4-steven.price@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- arch/arm64/include/asm/pgtable-prot.h | 4 ++++ arch/arm64/kernel/rsi.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index eed814b00a38..b8646e82db87 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -65,8 +65,12 @@ #include #include +#include extern bool arm64_use_ng_mappings; +extern unsigned long prot_ns_shared; + +#define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index c5758317dfed..cea8f0d39591 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -8,6 +8,11 @@ #include #include +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); @@ -68,6 +73,9 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); arm64_rsi_setup_memory(); -- Gitee From e1a3a14f94453f63427419a1578cc30b6b629cc6 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:42 +0800 Subject: [PATCH 06/84] arm64: rsi: Add support for checking whether an MMIO is protected mainline inclusion from mainline-v6.13-rc1 commit 371589437616fbb03590d8ff505f8a4c95c8a031 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=371589437616fbb03590d8ff505f8a4c95c8a031 -------------------------------- On Arm CCA, with RMM-v1.0, all MMIO regions are shared. However, in the future, an Arm CCA-v1.0 compliant guest may be run in a lesser privileged partition in the Realm World (with Arm CCA-v1.1 Planes feature). In this case, some of the MMIO regions may be emulated by a higher privileged component in the Realm world, i.e, protected. Thus the guest must decide today, whether a given MMIO region is shared vs Protected and create the stage1 mapping accordingly. On Arm CCA, this detection is based on the "IPA State" (RIPAS == RIPAS_IO). Provide a helper to run this check on a given range of MMIO. Also, provide a arm64 helper which may be hooked in by other solutions. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-5-steven.price@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- arch/arm64/include/asm/io.h | 8 ++++++++ arch/arm64/include/asm/rsi.h | 2 ++ arch/arm64/include/asm/rsi_cmds.h | 21 +++++++++++++++++++++ arch/arm64/kernel/rsi.c | 26 ++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 9f92f92ad62d..8f3bbf168ec7 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Generic IO read/write. These perform native-endian accesses. 
@@ -186,4 +187,11 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap +static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) +{ + if (unlikely(is_realm_world())) + return __arm64_is_protected_mmio(phys_addr, size); + return false; +} + #endif /* __ASM_IO_H */ diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index acba065eb00e..188cbb9b23f5 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -14,6 +14,8 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); + static inline bool is_realm_world(void) { return static_branch_unlikely(&rsi_present); diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index 2fcf351b5634..e6a211001bd3 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -45,6 +45,27 @@ static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) return res.a0; } +static inline unsigned long rsi_ipa_state_get(phys_addr_t start, + phys_addr_t end, + enum ripas *state, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_GET, + start, end, 0, 0, 0, 0, 0, + &res); + + if (res.a0 == RSI_SUCCESS) { + if (top) + *top = res.a1; + if (state) + *state = res.a2; + } + + return res.a0; +} + static inline long rsi_set_addr_range_state(phys_addr_t start, phys_addr_t end, enum ripas state, diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index cea8f0d39591..7e7934c4fca0 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -67,6 +67,32 @@ static void __init arm64_rsi_setup_memory(void) } } +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? */ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas != RSI_RIPAS_DEV) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(__arm64_is_protected_mmio); + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) -- Gitee From dcf82b88698f6e7ec1c560811c2cbcf4be36c1e7 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:43 +0800 Subject: [PATCH 07/84] arm64: rsi: Map unprotected MMIO as decrypted mainline inclusion from mainline-v6.13-rc1 commit 3c6c706139564f74ec48229378873c1d930a8bc8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3c6c706139564f74ec48229378873c1d930a8bc8 -------------------------------- Instead of marking every MMIO as shared, check if the given region is "Protected" and apply the permissions accordingly. 
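For illustration, a caller-side view of the helper this patch adds (the wrapper function is hypothetical; in practice a plain ioremap() does the right thing once the hook from the next patch is registered):

	/* Sketch only: inspect a device region before mapping it. */
	static void __iomem *map_device_region(phys_addr_t base, size_t size)
	{
		if (arm64_is_protected_mmio(base, size))
			pr_debug("%pa: emulated by a trusted Realm component\n", &base);
		else
			pr_debug("%pa: shared with (and emulated by) the host\n", &base);

		return ioremap(base, size);
	}
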
Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-6-steven.price@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- arch/arm64/kernel/rsi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 7e7934c4fca0..3e0c83e2296f 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,8 @@ #include #include #include + +#include #include static struct realm_config config; @@ -93,6 +95,16 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) } EXPORT_SYMBOL(__arm64_is_protected_mmio); +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (__arm64_is_protected_mmio(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) @@ -103,6 +115,9 @@ void __init arm64_rsi_init(void) return; prot_ns_shared = BIT(config.ipa_bits - 1); + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); -- Gitee From b52627cd41109d95ce5fbacf867401fddad5d77a Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:44 +0800 Subject: [PATCH 08/84] efi: arm64: Map Device with Prot Shared mainline inclusion from mainline-v6.13-rc1 commit 491db21d8256992ab9fe11c42744eb3044315d14 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=491db21d8256992ab9fe11c42744eb3044315d14 -------------------------------- Device mappings need to be emulated by the VMM so must be mapped shared with the host. 
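Until the arm64 pgprot_encrypted()/pgprot_decrypted() definitions arrive later in this series, the generic fallbacks in linux/pgtable.h make these helpers no-ops; with the arm64 definitions they clear/set PROT_NS_SHARED. A sketch of the intended effect on the EFI MMIO protection value, assuming a 48-bit IPA space:

	pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE);

	prot = pgprot_decrypted(prot);	/* set PROT_NS_SHARED (top IPA bit):
					 * the host VMM can emulate accesses */
	prot = pgprot_encrypted(prot);	/* clear it again: accesses stay in
					 * the protected IPA space */
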
Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-7-steven.price@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- arch/arm64/kernel/efi.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 2b478ca356b0..76cf3e63422a 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -32,8 +32,16 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; -- Gitee From fda8d44fa8ddf9f299f616e744e26f717e7c8e8e Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 24 Jun 2025 15:45:45 +0800 Subject: [PATCH 09/84] arm64: Enforce bounce buffers for realm DMA mainline inclusion from mainline-v6.13-rc1 commit fbf979a01375704fa87c559763209c658593b6f8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fbf979a01375704fa87c559763209c658593b6f8 -------------------------------- Within a realm guest it's not possible for a device emulated by the VMM to access arbitrary guest memory. So force the use of bounce buffers to ensure that the memory the emulated devices are accessing is in memory which is explicitly shared with the host. This adds a call to swiotlb_update_mem_attributes() which calls set_memory_decrypted() to ensure the bounce buffer memory is shared with the host. For non-realm guests or hosts this is a no-op. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-8-steven.price@arm.com Signed-off-by: Catalin Marinas Conflicts: arch/arm64/mm/init.c [The commit 65033574ade9 ("arm64: swiotlb: Reduce the default size if no ZONE_DMA bouncing needed") is not merged resulting in context conflicts. Virtcca commit caaefd56addf ("mm: enable swiotlb alloc for cvm share mem") call swiotlb_cvm_update_mem_attributes after swiotlb_init resulting in context conflicts.] 
Signed-off-by: Cai Xinchen Signed-off-by: Xu Raoqing --- arch/arm64/kernel/rsi.c | 1 + arch/arm64/mm/init.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 3e0c83e2296f..a23c0a7154d2 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1463dc657a98..f840dd539caf 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -495,12 +496,19 @@ void __init bootmem_init(void) */ void __init mem_init(void) { + unsigned int flags = SWIOTLB_VERBOSE; bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit); + if (is_realm_world()) { + swiotlb = true; + flags |= SWIOTLB_FORCE; + } + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC)) swiotlb = true; - swiotlb_init(swiotlb, SWIOTLB_VERBOSE); + swiotlb_init(swiotlb, flags); + swiotlb_update_mem_attributes(); #ifdef CONFIG_PSWIOTLB /* enable pswiotlb default */ -- Gitee From 601d993db5995b78c534d1ab7746aadc27f3d7fd Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 24 Jun 2025 15:45:46 +0800 Subject: [PATCH 10/84] arm64: mm: Avoid TLBI when marking pages as valid mainline inclusion from mainline-v6.13-rc1 commit 0e9cb5995b2539a332fe65ada6a28a6be55f6e40 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0e9cb5995b2539a332fe65ada6a28a6be55f6e40 -------------------------------- When __change_memory_common() is purely setting the valid bit on a PTE (e.g. via the set_memory_valid() call) there is no need for a TLBI as either the entry isn't changing (the valid bit was already set) or the entry was invalid and so should not have been cached in the TLB. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-9-steven.price@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- arch/arm64/mm/pageattr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0a62f458c5cb..ac46516e929c 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -60,7 +60,13 @@ static int __change_memory_common(unsigned long start, unsigned long size, ret = apply_to_page_range(&init_mm, start, size, change_page_range, &data); - flush_tlb_kernel_range(start, start + size); + /* + * If the memory is being made valid without changing any other bits + * then a TLBI isn't required as a non-valid entry cannot be cached in + * the TLB. 
+ */ + if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask)) + flush_tlb_kernel_range(start, start + size); return ret; } -- Gitee From 2dec29b57f1a7202e694277996e2e5790db43738 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:47 +0800 Subject: [PATCH 11/84] arm64: Enable memory encrypt for Realms mainline inclusion from mainline-v6.13-rc1 commit 42be24a4178fe51e6f47d91d8621b2f53820f88b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=42be24a4178fe51e6f47d91d8621b2f53820f88b -------------------------------- Use the memory encryption APIs to trigger a RSI call to request a transition between protected memory and shared memory (or vice versa) and updating the kernel's linear map of modified pages to flip the top bit of the IPA. This requires that block mappings are not used in the direct map for realm guests. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-10-steven.price@arm.com Signed-off-by: Catalin Marinas Conflicts: arch/arm64/Kconfig [Only context conflicts] Signed-off-by: Cai Xinchen Signed-off-by: Xu Raoqing --- arch/arm64/Kconfig | 3 + arch/arm64/include/asm/mem_encrypt.h | 9 +++ arch/arm64/include/asm/pgtable.h | 5 ++ arch/arm64/include/asm/set_memory.h | 3 + arch/arm64/kernel/rsi.c | 16 +++++ arch/arm64/mm/pageattr.c | 90 +++++++++++++++++++++++++++- 6 files changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 089fc18ea851..7452254e7a5c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -20,6 +20,7 @@ config ARM64 select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CC_PLATFORM select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -40,6 +41,8 @@ config ARM64 select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_STACKWALK select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index b0c9a86b13a4..f8f78f622dd2 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -2,6 +2,8 @@ #ifndef __ASM_MEM_ENCRYPT_H #define __ASM_MEM_ENCRYPT_H +#include + struct arm64_mem_crypt_ops { int (*encrypt)(unsigned long addr, int numpages); int (*decrypt)(unsigned long addr, int numpages); @@ -12,4 +14,11 @@ int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); +int realm_register_memory_enc_ops(void); + +static inline bool force_dma_unencrypted(struct device *dev) +{ + return is_realm_world(); +} + #endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 34cf8dcfa0b0..f6575f6f8c2d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -552,6 +552,11 @@ static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, #define pgprot_nx(prot) \ __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN) +#define 
pgprot_decrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED) +#define pgprot_encrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, 0) + /* * Mark the prot value as uncacheable and unbufferable. */ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 917761feeffd..37774c793006 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -15,4 +15,7 @@ int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + #endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index a23c0a7154d2..3031f25c32ef 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -7,8 +7,10 @@ #include #include #include +#include #include +#include #include static struct realm_config config; @@ -19,6 +21,17 @@ EXPORT_SYMBOL(prot_ns_shared); DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + static bool rsi_version_matches(void) { unsigned long ver_lower, ver_higher; @@ -119,6 +132,9 @@ void __init arm64_rsi_init(void) if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) return; + if (realm_register_memory_enc_ops()) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index ac46516e929c..8a81c59c9eff 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -5,10 +5,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -23,14 +25,16 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED bool can_set_direct_map(void) { /* - * rodata_full and DEBUG_PAGEALLOC require linear map to be - * mapped at page granularity, so that it is possible to + * rodata_full, DEBUG_PAGEALLOC and a Realm guest all require linear + * map to be mapped at page granularity, so that it is possible to * protect/unprotect single pages. * * KFENCE pool requires page-granular mapping if initialized late. + * + * Realms need to make pages shared/protected at page granularity. 
*/ return rodata_full || debug_pagealloc_enabled() || - arm64_kfence_can_set_direct_map(); + arm64_kfence_can_set_direct_map() || is_realm_world(); } static int change_page_range(pte_t *ptep, unsigned long addr, void *data) @@ -198,6 +202,86 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +static int __set_memory_enc_dec(unsigned long addr, + int numpages, + bool encrypt) +{ + unsigned long set_prot = 0, clear_prot = 0; + phys_addr_t start, end; + int ret; + + if (!is_realm_world()) + return 0; + + if (!__is_lm_address(addr)) + return -EINVAL; + + start = __virt_to_phys(addr); + end = start + numpages * PAGE_SIZE; + + if (encrypt) + clear_prot = PROT_NS_SHARED; + else + set_prot = PROT_NS_SHARED; + + /* + * Break the mapping before we make any changes to avoid stale TLB + * entries or Synchronous External Aborts caused by RIPAS_EMPTY + */ + ret = __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(set_prot), + __pgprot(clear_prot | PTE_VALID)); + + if (ret) + return ret; + + if (encrypt) + ret = rsi_set_memory_range_protected(start, end); + else + ret = rsi_set_memory_range_shared(start, end); + + if (ret) + return ret; + + return __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_VALID), + __pgprot(0)); +} + +static int realm_set_memory_encrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, true); + + /* + * If the request to change state fails, then the only sensible cause + * of action for the caller is to leak the memory + */ + WARN(ret, "Failed to encrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static int realm_set_memory_decrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, false); + + WARN(ret, "Failed to decrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static const struct arm64_mem_crypt_ops realm_crypt_ops = { + .encrypt = realm_set_memory_encrypted, + .decrypt = realm_set_memory_decrypted, +}; + +int realm_register_memory_enc_ops(void) +{ + return arm64_mem_crypt_ops_register(&realm_crypt_ops); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { -- Gitee From d64ef1c1976dd195ad58dd2c177e4863ecf83323 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 24 Jun 2025 15:45:48 +0800 Subject: [PATCH 12/84] irqchip/gic-v3-its: Share ITS tables with a non-trusted hypervisor mainline inclusion from mainline-v6.13-rc1 commit b08e2f42e86b5848add254da45b56fc672e2bced category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b08e2f42e86b5848add254da45b56fc672e2bced -------------------------------- Within a realm guest the ITS is emulated by the host. This means the allocations must have been made available to the host by a call to set_memory_decrypted(). Introduce an allocation function which performs this extra call. For the ITT use a custom genpool-based allocator that calls set_memory_decrypted() for each page allocated, but then suballocates the size needed for each ITT. Note that there is no mechanism implemented to return pages from the genpool, but it is unlikely that the peak number of devices will be much larger than the normal level - so this isn't expected to be an issue. 
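The pattern, reduced to a sketch (all names here are illustrative; the real allocator is in the diff below): pay the page allocation and set_memory_decrypted() cost once per page, then hand out sub-page ITT chunks from a genpool.

	static struct gen_pool *shared_pool;	/* gen_pool_create(order, -1) at init */

	static void *shared_alloc(size_t size)	/* sub-page sizes only */
	{
		unsigned long addr = gen_pool_alloc(shared_pool, size);

		while (!addr) {
			struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);

			if (!page)
				return NULL;
			/* on failure the page state is unknown: leak it */
			if (set_memory_decrypted((unsigned long)page_address(page), 1))
				return NULL;
			gen_pool_add(shared_pool, (unsigned long)page_address(page),
				     PAGE_SIZE, -1);
			addr = gen_pool_alloc(shared_pool, size);
		}
		return (void *)addr;
	}
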
Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/all/20241002141630.433502-2-steven.price@arm.com Conflicts: drivers/irqchip/irq-gic-v3-its.c [Because the hisi inclusion commit 8207b85ea7207 ("kvm: hisi: print error for IPIV") and the commit 7b39fd06a3912 ("irqchip/gic-v3-its: Alloc/Free device id from pools for virtual devices"), The context conflicts.] Signed-off-by: Cai Xinchen Signed-off-by: Xu Raoqing --- drivers/irqchip/irq-gic-v3-its.c | 138 +++++++++++++++++++++++++------ 1 file changed, 115 insertions(+), 23 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 13cdb85b5d9b..37a9f7692bc2 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -27,6 +29,7 @@ #include #include #include +#include #include #include @@ -167,6 +170,7 @@ struct its_device { struct its_node *its; struct event_lpi_map event_map; void *itt; + u32 itt_sz; u32 nr_ites; u32 device_id; bool shared; @@ -202,6 +206,87 @@ static DEFINE_IDA(its_vpeid_ida); #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) +static struct page *its_alloc_pages_node(int node, gfp_t gfp, + unsigned int order) +{ + struct page *page; + int ret = 0; + + page = alloc_pages_node(node, gfp, order); + + if (!page) + return NULL; + + ret = set_memory_decrypted((unsigned long)page_address(page), + 1 << order); + /* + * If set_memory_decrypted() fails then we don't know what state the + * page is in, so we can't free it. Instead we leak it. + * set_memory_decrypted() will already have WARNed. + */ + if (ret) + return NULL; + + return page; +} + +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order) +{ + return its_alloc_pages_node(NUMA_NO_NODE, gfp, order); +} + +static void its_free_pages(void *addr, unsigned int order) +{ + /* + * If the memory cannot be encrypted again then we must leak the pages. + * set_memory_encrypted() will already have WARNed. + */ + if (set_memory_encrypted((unsigned long)addr, 1 << order)) + return; + free_pages((unsigned long)addr, order); +} + +static struct gen_pool *itt_pool; + +static void *itt_alloc_pool(int node, int size) +{ + unsigned long addr; + struct page *page; + + if (size >= PAGE_SIZE) { + page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size)); + + return page ? page_address(page) : NULL; + } + + do { + addr = gen_pool_alloc(itt_pool, size); + if (addr) + break; + + page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 1); + if (!page) + break; + + gen_pool_add(itt_pool, (unsigned long)page_address(page), PAGE_SIZE, node); + } while (!addr); + + return (void *)addr; +} + +static void itt_free_pool(void *addr, int size) +{ + if (!addr) + return; + + if (size >= PAGE_SIZE) { + its_free_pages(addr, get_order(size)); + return; + } + + gen_pool_free(itt_pool, (unsigned long)addr, size); +} + /* * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we * always have vSGIs mapped. 
@@ -2201,7 +2286,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) { struct page *prop_page; - prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ)); + prop_page = its_alloc_pages(gfp_flags, + get_order(LPI_PROPBASE_SZ)); if (!prop_page) return NULL; @@ -2212,8 +2298,7 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) static void its_free_prop_table(struct page *prop_page) { - free_pages((unsigned long)page_address(prop_page), - get_order(LPI_PROPBASE_SZ)); + its_free_pages(page_address(prop_page), get_order(LPI_PROPBASE_SZ)); } static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) @@ -2335,7 +2420,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, order = get_order(GITS_BASER_PAGES_MAX * psz); } - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); + page = its_alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); if (!page) return -ENOMEM; @@ -2348,7 +2433,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, /* 52bit PA is supported only when PageSize=64K */ if (psz != SZ_64K) { pr_err("ITS: no 52bit PA support when psz=%d\n", psz); - free_pages((unsigned long)base, order); + its_free_pages(base, order); return -ENXIO; } @@ -2404,7 +2489,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", &its->phys_base, its_base_type_string[type], val, tmp); - free_pages((unsigned long)base, order); + its_free_pages(base, order); return -ENXIO; } @@ -2543,8 +2628,7 @@ static void its_free_tables(struct its_node *its) for (i = 0; i < GITS_BASER_NR_REGS; i++) { if (its->tables[i].base) { - free_pages((unsigned long)its->tables[i].base, - its->tables[i].order); + its_free_pages(its->tables[i].base, its->tables[i].order); its->tables[i].base = NULL; } } @@ -2814,7 +2898,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id) /* Allocate memory for 2nd level table */ if (!table[idx]) { - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); + page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); if (!page) return false; @@ -2933,7 +3017,7 @@ static int allocate_vpe_l1_table(void) pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n", np, npg, psz, epp, esz); - page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); + page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); if (!page) return -ENOMEM; @@ -2979,8 +3063,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) { struct page *pend_page; - pend_page = alloc_pages(gfp_flags | __GFP_ZERO, - get_order(LPI_PENDBASE_SZ)); + pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO, get_order(LPI_PENDBASE_SZ)); if (!pend_page) return NULL; @@ -2992,7 +3075,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) static void its_free_pending_table(struct page *pt) { - free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ)); + its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ)); } /* @@ -3327,8 +3410,8 @@ static bool its_alloc_table_entry(struct its_node *its, /* Allocate memory for 2nd level table */ if (!table[idx]) { - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, - get_order(baser->psz)); + page = its_alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, + get_order(baser->psz)); if (!page) return false; @@ -3423,7 +3506,6 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, if 
(WARN_ON(!is_power_of_2(nvecs)))
 		nvecs = roundup_pow_of_two(nvecs);
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	/*
 	 * Even if the device wants a single LPI, the ITT must be
 	 * sized as a power of two (and you need at least one bit...).
 	 */
 	nr_ites = max(2, nvecs);
 	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
 	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
-	itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
+
+	itt = itt_alloc_pool(its->numa_node, sz);
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
 	if (alloc_lpis) {
 		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
 		if (lpi_map)
@@ -3443,9 +3529,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 		lpi_base = 0;
 	}
 
-	if (!dev || !itt ||  !col_map || (!lpi_map && alloc_lpis)) {
+	if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) {
 		kfree(dev);
-		kfree(itt);
+		itt_free_pool(itt, sz);
 		bitmap_free(lpi_map);
 		kfree(col_map);
 		return NULL;
@@ -3455,6 +3541,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
 
 	dev->its = its;
 	dev->itt = itt;
+	dev->itt_sz = sz;
 	dev->nr_ites = nr_ites;
 	dev->event_map.lpi_map = lpi_map;
 	dev->event_map.col_map = col_map;
@@ -3482,7 +3569,7 @@ static void its_free_device(struct its_device *its_dev)
 	list_del(&its_dev->entry);
 	raw_spin_unlock_irqrestore(&its_dev->its->lock, flags);
 	kfree(its_dev->event_map.col_map);
-	kfree(its_dev->itt);
+	itt_free_pool(its_dev->itt, its_dev->itt_sz);
 	kfree(its_dev);
 }
 
@@ -5125,8 +5212,9 @@ static int __init its_probe_one(struct its_node *its)
 		}
 	}
 
-	page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
-				get_order(ITS_CMD_QUEUE_SZ));
+	page = its_alloc_pages_node(its->numa_node,
+				    GFP_KERNEL | __GFP_ZERO,
+				    get_order(ITS_CMD_QUEUE_SZ));
 	if (!page) {
 		err = -ENOMEM;
 		goto out_unmap_sgir;
@@ -5190,7 +5278,7 @@ static int __init its_probe_one(struct its_node *its)
 out_free_tables:
 	its_free_tables(its);
 out_free_cmd:
-	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
+	its_free_pages(its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
 out_unmap_sgir:
 	if (its->sgir_base)
 		iounmap(its->sgir_base);
@@ -5672,6 +5760,10 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
 	bool has_v4_1 = false;
 	int err;
 
+	itt_pool = gen_pool_create(get_order(ITS_ITT_ALIGN), -1);
+	if (!itt_pool)
+		return -ENOMEM;
+
 	gic_rdists = rdists;
 	its_parent = parent_domain;
--
Gitee

From 0adc63232c16a8d75f06ceda65316b34eae93131 Mon Sep 17 00:00:00 2001
From: Steven Price
Date: Tue, 24 Jun 2025 15:45:49 +0800
Subject: [PATCH 13/84] irqchip/gic-v3-its: Fix over allocation in itt_alloc_pool()

mainline inclusion
from mainline-v6.13-rc1
commit bc88d44bd7e45b992cf8c2c2ffbc7bb3e24db4a7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bc88d44bd7e45b992cf8c2c2ffbc7bb3e24db4a7

--------------------------------

itt_alloc_pool() calls its_alloc_pages_node() to allocate an individual
page to add to the pool (for allocations smaller than PAGE_SIZE).
However, the final argument of its_alloc_pages_node() is the page
order, not the number of pages, so passing 1 over-allocated by a page
on every pool refill. Pass 0 to allocate a single page.

Signed-off-by: Steven Price
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/1f6e19c4-1fb9-43ab-a8a2-a465c9cff84b@arm.com
Closes: https://lore.kernel.org/r/ed65312a-245c-4fa5-91ad-5d620cab7c6b%40nvidia.com
Signed-off-by: Cai Xinchen
---
 drivers/irqchip/irq-gic-v3-its.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git
a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 37a9f7692bc2..2e924c93a403 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -264,7 +264,7 @@ static void *itt_alloc_pool(int node, int size) if (addr) break; - page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 1); + page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) break; -- Gitee From 58938a5097768502b5987d877735ed91803f8359 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 24 Jun 2025 15:45:50 +0800 Subject: [PATCH 14/84] irqchip/gic-v3-its: Rely on genpool alignment mainline inclusion from mainline-v6.13-rc1 commit e36d4165f0796536b338521ef714551be0feb706 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e36d4165f0796536b338521ef714551be0feb706 -------------------------------- its_create_device() over-allocated by ITS_ITT_ALIGN - 1 bytes to ensure that an aligned area was available within the allocation. The new genpool allocator has its min_alloc_order set to get_order(ITS_ITT_ALIGN) so all allocations from it should be appropriately aligned. Remove the over-allocation from its_create_device() and alignment from its_build_mapd_cmd(). Signed-off-by: Steven Price Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/all/20241002141630.433502-3-steven.price@arm.com Signed-off-by: Cai Xinchen --- drivers/irqchip/irq-gic-v3-its.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 2e924c93a403..080d9b8ffbd6 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -709,7 +709,6 @@ static struct its_collection *its_build_mapd_cmd(struct its_node *its, u8 size = ilog2(desc->its_mapd_cmd.dev->nr_ites); itt_addr = virt_to_phys(desc->its_mapd_cmd.dev->itt); - itt_addr = ALIGN(itt_addr, ITS_ITT_ALIGN); its_encode_cmd(cmd, GITS_CMD_MAPD); its_encode_devid(cmd, desc->its_mapd_cmd.dev->device_id); @@ -3512,7 +3511,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, */ nr_ites = max(2, nvecs); sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); - sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; + sz = max(sz, ITS_ITT_ALIGN); itt = itt_alloc_pool(its->numa_node, sz); -- Gitee From 17b49635c9643c0bd4f81e97e706b2ea16ab647f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Jun 2025 15:45:51 +0800 Subject: [PATCH 15/84] jump_label,module: Don't alloc static_key_mod for __ro_after_init keys mainline inclusion from mainline-v6.10-rc1 commit 91a1d97ef482c1e4c9d4c1c656a53b0f6b16d0ed category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=91a1d97ef482c1e4c9d4c1c656a53b0f6b16d0ed -------------------------------- When a static_key is marked ro_after_init, its state will never change (after init), therefore jump_label_update() will never need to iterate the entries, and thus module load won't actually need to track this -- avoiding the static_key::next write. Therefore, mark these keys such that jump_label_add_module() might recognise them and avoid the modification. Use the special state: 'static_key_linked(key) && !static_key_mod(key)' to denote such keys. 
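For illustration, a minimal sketch of the kind of key this optimisation targets (the key name and initcall are hypothetical; DEFINE_STATIC_KEY_FALSE_RO places the key in the __ro_after_init section):

#include <linux/init.h>
#include <linux/jump_label.h>

/* Lives in .data..ro_after_init, so it is sealed once init completes. */
static DEFINE_STATIC_KEY_FALSE_RO(my_feature_key);

static int __init my_feature_init(void)
{
	/* Still permitted: initcalls run before mark_rodata_ro(). */
	static_branch_enable(&my_feature_key);
	return 0;
}
core_initcall(my_feature_init);

/* Modules may test the key, but no static_key_mod tracking is required. */
bool my_feature_active(void)
{
	return static_branch_unlikely(&my_feature_key);
}

Module load then only has to patch the module's jump entries once against the key's now-immutable state, instead of linking a struct static_key_mod for later updates.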
jump_label_add_module() does not exist under CONFIG_JUMP_LABEL=n, so the newly-introduced jump_label_init_ro() can be defined as a nop for that configuration. [ mingo: Renamed jump_label_ro() to jump_label_init_ro() ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20240313180106.2917308-2-vschneid@redhat.com Conflicts: init/main.c [Only context conflicts] Signed-off-by: Cai Xinchen --- include/asm-generic/sections.h | 5 ++++ include/linux/jump_label.h | 3 ++ init/main.c | 1 + kernel/jump_label.c | 53 ++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index db13bb620f52..c768de6f19a9 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -180,6 +180,11 @@ static inline bool is_kernel_rodata(unsigned long addr) addr < (unsigned long)__end_rodata; } +static inline bool is_kernel_ro_after_init(unsigned long addr) +{ + return addr >= (unsigned long)__start_ro_after_init && + addr < (unsigned long)__end_ro_after_init; +} /** * is_kernel_inittext - checks if the pointer address is located in the * .init.text section diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f0a949b7c973..f5a2727ca4a9 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -216,6 +216,7 @@ extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); +extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, @@ -265,6 +266,8 @@ static __always_inline void jump_label_init(void) static_key_initialized = true; } +static __always_inline void jump_label_init_ro(void) { } + static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) diff --git a/init/main.c b/init/main.c index 725423122567..a6b581b54893 100644 --- a/init/main.c +++ b/init/main.c @@ -1416,6 +1416,7 @@ static void mark_readonly(void) * insecure pages which are W+X. 
*/ flush_module_init_free_work(); + jump_label_init_ro(); mark_rodata_ro(); rodata_test(); } else diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 554e04b25b13..101572d6a908 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -565,6 +565,45 @@ void __init jump_label_init(void) cpus_read_unlock(); } +static inline bool static_key_sealed(struct static_key *key) +{ + return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); +} + +static inline void static_key_seal(struct static_key *key) +{ + unsigned long type = key->type & JUMP_TYPE_TRUE; + key->type = JUMP_TYPE_LINKED | type; +} + +void jump_label_init_ro(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + if (WARN_ON_ONCE(!static_key_initialized)) + return; + + cpus_read_lock(); + jump_label_lock(); + + for (iter = iter_start; iter < iter_stop; iter++) { + struct static_key *iterk = jump_entry_key(iter); + + if (!is_kernel_ro_after_init((unsigned long)iterk)) + continue; + + if (static_key_sealed(iterk)) + continue; + + static_key_seal(iterk); + } + + jump_label_unlock(); + cpus_read_unlock(); +} + #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -685,6 +724,15 @@ static int jump_label_add_module(struct module *mod) static_key_set_entries(key, iter); continue; } + + /* + * If the key was sealed at init, then there's no need to keep a + * reference to its module entries - just patch them now and be + * done with it. + */ + if (static_key_sealed(key)) + goto do_poke; + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); + if (!jlm) + return -ENOMEM; @@ -710,6 +758,7 @@ static int jump_label_add_module(struct module *mod) static_key_set_linked(key); /* Only update if we've changed from our initial state */ +do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } @@ -734,6 +783,10 @@ static void jump_label_del_module(struct module *mod) if (within_module((unsigned long)key, mod)) continue; + /* No @jlm allocated because key was sealed at init. */ + if (static_key_sealed(key)) + continue; + /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; -- Gitee From d7d4ab0bb3c3078bd4c5ba661de825ca1130c9e5 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 24 Jun 2025 15:45:52 +0800 Subject: [PATCH 16/84] parisc: Delay write-protection until mark_rodata_ro() call mainline inclusion from mainline-v6.11-rc7 commit 213aa670153ed675a007c1f35c5db544b0fefc94 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=213aa670153ed675a007c1f35c5db544b0fefc94 -------------------------------- Do not write-protect the kernel read-only and __ro_after_init sections before mark_rodata_ro() is called. This fixes a boot issue on parisc which is triggered by commit 91a1d97ef482 ("jump_label,module: Don't alloc static_key_mod for __ro_after_init keys"). That commit may modify static key contents in the __ro_after_init section at bootup, so this section needs to be writable at least until mark_rodata_ro() is called.
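The resulting boot ordering can be sketched as follows (a pseudo-trace derived from the hunks below, not literal code):

/*
 * free_initmem()              parisc: remaps kernel text/data, but no
 *                             longer sets kernel_set_to_readonly
 * ...
 * mark_readonly()             init/main.c
 *     jump_label_init_ro();   may still write __ro_after_init static keys
 *     mark_rodata_ro();       parisc: now sets kernel_set_to_readonly and
 *                             write-protects __start_rodata..__end_rodata
 *     rodata_test();
 */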
Signed-off-by: Helge Deller Reported-by: matoro Reported-by: Christoph Biedl Tested-by: Christoph Biedl Link: https://lore.kernel.org/linux-parisc/096cad5aada514255cd7b0b9dbafc768@matoro.tk/#r Fixes: 91a1d97ef482 ("jump_label,module: Don't alloc static_key_mod for __ro_after_init keys") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Cai Xinchen --- arch/parisc/mm/init.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index a2a3e89f2d9a..136a7ec1b50a 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -457,7 +457,6 @@ void free_initmem(void) unsigned long kernel_end = (unsigned long)&_end; /* Remap kernel text and data, but do not touch init section yet. */ - kernel_set_to_readonly = true; map_pages(init_end, __pa(init_end), kernel_end - init_end, PAGE_KERNEL, 0); @@ -491,11 +490,18 @@ void free_initmem(void) #ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void) { - /* rodata memory was already mapped with KERNEL_RO access rights by - pagetable_init() and map_pages(). No need to do additional stuff here */ - unsigned long roai_size = __end_ro_after_init - __start_ro_after_init; + unsigned long start = (unsigned long) &__start_rodata; + unsigned long end = (unsigned long) &__end_rodata; + + pr_info("Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + + kernel_set_to_readonly = true; + map_pages(start, __pa(start), end - start, PAGE_KERNEL, 0); - pr_info("Write protected read-only-after-init data: %luk\n", roai_size >> 10); + /* force the kernel to see the new page table entries */ + flush_cache_all(); + flush_tlb_all(); } #endif -- Gitee From 367914c59dfabf89c8981e291ac7e98532c07d65 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:53 +0800 Subject: [PATCH 17/84] arm64: realm: ioremap: Allow mapping memory as encrypted maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 CVE: NA Reference: https://lore.kernel.org/all/20250613111153.1548928-2-suzuki.poulose@arm.com/ ---------------------------------------------------------------------- For ioremap(), so far we only checked if it was a device (RIPAS_DEV) to choose an encrypted vs decrypted mapping. However, we may have firmware reserved memory regions exposed to the OS (e.g., EFI Coco Secret Securityfs, ACPI CCEL). We need to make sure that anything that is RIPAS_RAM (i.e., Guest protected memory with RMM guarantees) is also mapped as encrypted. Rephrasing the above, anything that is not RIPAS_EMPTY is guaranteed to be protected by the RMM. Thus we choose encrypted mapping for anything that is not RIPAS_EMPTY. While at it, rename the helper function __arm64_is_protected_mmio => arm64_rsi_is_protected to clearly indicate that this is not an arm64 generic helper, but something to do with Realms.
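The resulting ioremap() attribute per RIPAS state is summarised below (a sketch derived from the hook in the diff; the RSI_RIPAS_* names come from the RSI interface):

/*
 * RSI_RIPAS_EMPTY     -> pgprot_decrypted()   shared / emulated by the host
 * RSI_RIPAS_DEV       -> pgprot_encrypted()   trusted device or trusted emulated MMIO
 * RSI_RIPAS_RAM       -> pgprot_encrypted()   memory protected by RMM guarantees
 * RSI_RIPAS_DESTROYED -> pgprot_encrypted()   still protected, though unusable
 */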
Cc: Sami Mujawar Cc: Will Deacon Cc: Catalin Marinas Cc: "Aneesh Kumar K.V" Cc: Steven Price Signed-off-by: Suzuki K Poulose Signed-off-by: Cai Xinchen --- arch/arm64/include/asm/io.h | 2 +- arch/arm64/include/asm/rsi.h | 2 +- arch/arm64/kernel/rsi.c | 26 ++++++++++++++++++++++---- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 8f3bbf168ec7..e9d6bcfddf35 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -190,7 +190,7 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) { if (unlikely(is_realm_world())) - return __arm64_is_protected_mmio(phys_addr, size); + return arm64_rsi_is_protected(phys_addr, size); return false; } diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index 188cbb9b23f5..5f9c8623183d 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -14,7 +14,7 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); +bool arm64_rsi_is_protected(phys_addr_t base, size_t size); static inline bool is_realm_world(void) { diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 3031f25c32ef..58860483eadd 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -83,7 +83,25 @@ static void __init arm64_rsi_setup_memory(void) } } -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world) + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY. + */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) { enum ripas ripas; phys_addr_t end, top; @@ -100,18 +118,18 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) break; if (WARN_ON(top <= base)) break; - if (ripas != RSI_RIPAS_DEV) + if (ripas == RSI_RIPAS_EMPTY) break; base = top; } return base >= end; } -EXPORT_SYMBOL(__arm64_is_protected_mmio); +EXPORT_SYMBOL(arm64_rsi_is_protected); static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) { - if (__arm64_is_protected_mmio(phys, size)) + if (arm64_rsi_is_protected(phys, size)) *prot = pgprot_encrypted(*prot); else *prot = pgprot_decrypted(*prot); -- Gitee From 60d5157db5b82abd0d35b7ebfdd0bfc937a911cd Mon Sep 17 00:00:00 2001 From: Yiwei Zhuang Date: Tue, 24 Jun 2025 15:45:54 +0800 Subject: [PATCH 18/84] rme: make sure realm guest map memory in page granularity cca inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 -------------------------------- maillist: Aneesh pointed out that this call to is_realm_world() is now too early since the decision to delay the RSI detection.
The upshot is that a realm guest which doesn't have page granularity forced for other reasons will fail to share pages with the host. At the moment I can think of a couple of options: (1) Make rodata_full a requirement for realm guests. CONFIG_RODATA_FULL_DEFAULT_ENABLED is already "default y" so this isn't a big ask. (2) Revisit the idea of detecting when running as a realm guest early. This has the advantage of also "fixing" earlycon (no need to manually specify the shared-alias of an unprotected UART). I'm currently leaning towards (1) because it's the default anyway. But if we're going to need to fix earlycon (or indeed find other similar issues) then (2) would obviously make sense. The CCA context is as follows: a CCA guest must run with page-granularity mappings. Thus, we need to make sure the NO_BLOCK_MAPPINGS and NO_CONT_MAPPINGS flags are set during paging_init(). However, at that time, is_realm_world() has not been initialized yet. Therefore, rodata=full is currently required when booting a realm guest [1]. Link: https://patchwork.kernel.org/project/kvm/patch/20241017131434.40935-10-steven.price@arm.com/ Signed-off-by: Yiwei Zhuang Signed-off-by: Cai Xinchen --- arch/arm64/kernel/rsi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 58860483eadd..be868cb5a4b4 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -143,6 +144,10 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; + if (!can_set_direct_map()) { + pr_err("RME: Cannot set the kernel direct map, consider CONFIG_RODATA_FULL_DEFAULT_ENABLED=y or rodata=full\n"); + return; + } if (WARN_ON(rsi_get_realm_config(&config))) return; prot_ns_shared = BIT(config.ipa_bits - 1); -- Gitee From 1ef6c0310e0ebb94ff50bbbc2bd475bb0780dd2b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:55 +0800 Subject: [PATCH 19/84] dma: Fix encryption bit clearing for dma_to_phys mainline inclusion from mainline-v6.15-rc1 commit c380931712d16e23f6aa90703f438330139e9731 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c380931712d16e23f6aa90703f438330139e9731 -------------------------------- phys_to_dma() sets the encryption bit on the translated DMA address. But dma_to_phys() clears the encryption bit only after the address has been translated back to the physical address, which can fail if the device uses DMA ranges. AMD SME doesn't use the DMA ranges and thus this is harmless. But as we are about to add support for other architectures, let us fix this.
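A worked example of why the ordering matters (hypothetical numbers; translate_dma_to_phys() returns (phys_addr_t)-1 when no dma_range_map entry matches):

/*
 * Assume sme_me_mask == BIT(47) and a bus_dma_region that maps
 * bus address 0x8000_0000 to CPU address 0x1_0000_0000.
 *
 * Old order - translate first, clear the encryption bit afterwards:
 *   translate_dma_to_phys(dev, BIT(47) | 0x8000_0000)
 *       -> no range matches while the C-bit is still set, so the
 *          lookup falls through to (phys_addr_t)-1
 *
 * New order - clear the encryption bit first, then translate:
 *   dma_addr = __sme_clr(BIT(47) | 0x8000_0000);    // 0x8000_0000
 *   translate_dma_to_phys(dev, dma_addr);           // 0x1_0000_0000
 */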
Reported-by: Aneesh Kumar K.V Link: https://lkml.kernel.org/r/yq5amsen9stc.fsf@kernel.org Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Reviewed-by: Robin Murphy Acked-by: Tom Lendacky Signed-off-by: Suzuki K Poulose Reviewed-by: Gavin Shan Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-2-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- include/linux/dma-direct.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 18aade195884..0c3ff9702244 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -78,12 +78,13 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; + dma_addr = __sme_clr(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else paddr = dma_addr; - return __sme_clr(paddr); + return paddr; } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ -- Gitee From aa24034b609ead58e5bfc15d6f25fa8be5b05b79 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:56 +0800 Subject: [PATCH 20/84] dma: Introduce generic dma_addr_*crypted helpers mainline inclusion from mainline-v6.15-rc1 commit b66e2ee7b6c8d45bbe4b6f6885ee27511506812c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b66e2ee7b6c8d45bbe4b6f6885ee27511506812c -------------------------------- AMD SME added __sme_set/__sme_clr primitives to modify the DMA address for encrypted/decrypted traffic. However, this doesn't fit in with other models, e.g., Arm CCA where the meanings are the opposite, i.e., "decrypted" traffic has a bit set and "encrypted" traffic has the top bit cleared. In preparation for adding support for Arm CCA DMA conversions, convert the existing primitives to more generic ones that can be provided by the backends. i.e., add helpers to 1. dma_addr_encrypted - Convert a DMA address to "encrypted" [ == __sme_set() ] 2. dma_addr_unencrypted - Convert a DMA address to "decrypted" [ None exists today ] 3. dma_addr_canonical - Clear any "encryption"/"decryption" bits from DMA address [ SME uses __sme_clr() ] and convert to a canonical DMA address. Since the original __sme_xxx helpers come from linux/mem_encrypt.h, use that as the home for the new definitions and provide dummy ones when none is provided by the architectures. With the above, phys_to_dma_unencrypted() uses the newly added dma_addr_unencrypted() helper and, to make it a bit easier to read and avoid double conversion, __phys_to_dma() is provided.
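To make the mapping concrete, this is how the two schemes line up with the new helpers (a sketch; the Arm CCA column uses the PROT_NS_SHARED "top IPA bit" added for Realms in a later patch of this series):

/*
 *                         AMD SME                 Arm CCA
 * dma_addr_encrypted():   addr | sme_me_mask      addr (no-op)
 * dma_addr_unencrypted(): addr (no-op)            addr | PROT_NS_SHARED
 * dma_addr_canonical():   addr & ~sme_me_mask     addr & ~PROT_NS_SHARED
 */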
Suggested-by: Robin Murphy Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Cc: Aneesh Kumar K.V Signed-off-by: Suzuki K Poulose Reviewed-by: Robin Murphy Reviewed-by: Gavin Shan Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-3-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- include/linux/dma-direct.h | 12 ++++++++---- include/linux/mem_encrypt.h | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 0c3ff9702244..8589b0dafdb0 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -55,14 +55,18 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev, #define phys_to_dma_unencrypted phys_to_dma #endif #else -static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, - phys_addr_t paddr) +static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (dev->dma_range_map) return translate_phys_to_dma(dev, paddr); return paddr; } +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) +{ + return dma_addr_unencrypted(__phys_to_dma(dev, paddr)); +} /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. @@ -71,14 +75,14 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, */ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return __sme_set(phys_to_dma_unencrypted(dev, paddr)); + return dma_addr_encrypted(__phys_to_dma(dev, paddr)); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; - dma_addr = __sme_clr(dma_addr); + dma_addr = dma_addr_canonical(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index ae4526389261..07584c5e36fb 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -26,11 +26,34 @@ */ #define __sme_set(x) ((x) | sme_me_mask) #define __sme_clr(x) ((x) & ~sme_me_mask) + +#define dma_addr_encrypted(x) __sme_set(x) +#define dma_addr_canonical(x) __sme_clr(x) + #else #define __sme_set(x) (x) #define __sme_clr(x) (x) #endif +/* + * dma_addr_encrypted() and dma_addr_unencrypted() are for converting a given DMA + * address to the respective type of addressing. + * + * dma_addr_canonical() is used to reverse any conversions for encrypted/decrypted + * back to the canonical address. 
+ */ +#ifndef dma_addr_encrypted +#define dma_addr_encrypted(x) (x) +#endif + +#ifndef dma_addr_unencrypted +#define dma_addr_unencrypted(x) (x) +#endif + +#ifndef dma_addr_canonical +#define dma_addr_canonical(x) (x) +#endif + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ -- Gitee From 825a3888f8e6fcdf738f87740f58967c88da7f52 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 24 Jun 2025 15:45:57 +0800 Subject: [PATCH 21/84] arm64: realm: Use aliased addresses for device DMA to shared buffers mainline inclusion from mainline-v6.15-rc1 commit 7d953a06241624ee2efb172d037a4168978f4147 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7d953a06241624ee2efb172d037a4168978f4147 -------------------------------- When a device performs DMA to a shared buffer using physical addresses (without Stage1 translation), the device must use the "{I}PA address" with the top bit set in a Realm. This is to make sure that a trusted device will be able to write to shared buffers as well as the protected buffers. Thus, a Realm must always program the full address including the "protection" bit, like AMD SME encryption bits. Enable this by providing arm64 specific dma_addr_{unencrypted, canonical} helpers for Realms. Please note that the VMM needs to similarly make sure that the SMMU Stage2 in the Non-secure world is set up accordingly to map IPA at the unprotected alias. Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Cc: Aneesh Kumar K.V Signed-off-by: Suzuki K Poulose Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Reviewed-by: Robin Murphy Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-4-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- arch/arm64/include/asm/mem_encrypt.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index f8f78f622dd2..a2a1eeb36d4b 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -21,4 +21,15 @@ static inline bool force_dma_unencrypted(struct device *dev) return is_realm_world(); } +/* + * For Arm CCA guests, canonical addresses are "encrypted", so no changes + * required for dma_addr_encrypted(). + * Unencrypted DMA buffers must be accessed via the unprotected IPA, + * i.e. with the "top IPA bit" set.
+ */ +#define dma_addr_unencrypted(x) ((x) | PROT_NS_SHARED) + +/* Clear the "top" IPA bit while converting back */ +#define dma_addr_canonical(x) ((x) & ~PROT_NS_SHARED) + #endif /* __ASM_MEM_ENCRYPT_H */ -- Gitee From 2a20baaa51883c1080eb86c96962415779b1fd3d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jun 2025 15:45:58 +0800 Subject: [PATCH 22/84] virt: coco: Add a coco/Makefile and coco/Kconfig mainline inclusion from mainline-v6.7-rc1 commit ec51ffcf263016111f090b9440a3c5a8338648e8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ec51ffcf263016111f090b9440a3c5a8338648e8 -------------------------------- In preparation for adding another coco build target, relieve drivers/virt/Makefile of the responsibility to track new compilation unit additions to drivers/virt/coco/, and do the same for drivers/virt/Kconfig. Reviewed-by: Kuppuswamy Sathyanarayanan Tested-by: Kuppuswamy Sathyanarayanan Reviewed-by: Tom Lendacky Signed-off-by: Dan Williams Conflicts: drivers/virt/Kconfig drivers/virt/Makefile drivers/virt/coco/Kconfig drivers/virt/coco/Makefile [The commit be5ee944496f8 ("driver/virt/coco: Add HYGON CSV Guest dirver.") hygon inclusion csv-guest files, adapte these patches by moving these csv-guest to coco Kconfig/Makefile] Signed-off-by: Cai Xinchen --- drivers/virt/Kconfig | 8 +------- drivers/virt/Makefile | 5 +---- drivers/virt/coco/Kconfig | 11 +++++++++++ drivers/virt/coco/Makefile | 8 ++++++++ 4 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 drivers/virt/coco/Kconfig create mode 100644 drivers/virt/coco/Makefile diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index b1c4efa00182..40129b6f0eca 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -48,12 +48,6 @@ source "drivers/virt/nitro_enclaves/Kconfig" source "drivers/virt/acrn/Kconfig" -source "drivers/virt/coco/efi_secret/Kconfig" - -source "drivers/virt/coco/sev-guest/Kconfig" - -source "drivers/virt/coco/tdx-guest/Kconfig" - -source "drivers/virt/coco/csv-guest/Kconfig" +source "drivers/virt/coco/Kconfig" endif diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile index 62681a260758..f29901bd7820 100644 --- a/drivers/virt/Makefile +++ b/drivers/virt/Makefile @@ -9,7 +9,4 @@ obj-y += vboxguest/ obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ obj-$(CONFIG_ACRN_HSM) += acrn/ -obj-$(CONFIG_EFI_SECRET) += coco/efi_secret/ -obj-$(CONFIG_SEV_GUEST) += coco/sev-guest/ -obj-$(CONFIG_INTEL_TDX_GUEST) += coco/tdx-guest/ -obj-$(CONFIG_CSV_GUEST) += coco/csv-guest/ +obj-y += coco/ diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig new file mode 100644 index 000000000000..d12219bac4e2 --- /dev/null +++ b/drivers/virt/coco/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Confidential computing related collateral +# +source "drivers/virt/coco/efi_secret/Kconfig" + +source "drivers/virt/coco/sev-guest/Kconfig" + +source "drivers/virt/coco/tdx-guest/Kconfig" + +source "drivers/virt/coco/csv-guest/Kconfig" diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile new file mode 100644 index 000000000000..e3d8acf80128 --- /dev/null +++ b/drivers/virt/coco/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Confidential computing related collateral +# +obj-$(CONFIG_EFI_SECRET) += efi_secret/ +obj-$(CONFIG_SEV_GUEST) += sev-guest/ +obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ +obj-$(CONFIG_CSV_GUEST) += 
csv-guest/ -- Gitee From b157ba05da41913198236e8f61502b1b4b315693 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jun 2025 15:45:59 +0800 Subject: [PATCH 23/84] configfs-tsm: Introduce a shared ABI for attestation reports mainline inclusion from mainline-v6.7-rc1 commit 70e6f7e2b98575621019aa40ac616be58ff984e0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=70e6f7e2b98575621019aa40ac616be58ff984e0 -------------------------------- One of the common operations of a TSM (Trusted Security Module) is to provide a way for a TVM (confidential computing guest execution environment) to take a measurement of its launch state, sign it and submit it to a verifying party. Upon successful attestation that verifies the integrity of the TVM additional secrets may be deployed. The concept is common across TSMs, but the implementations are unfortunately vendor specific. While the industry grapples with a common definition of this attestation format [1], Linux need not make this problem worse by defining a new ABI per TSM that wants to perform a similar operation. The current momentum has been to invent new ioctl-ABI per TSM per function which at best is an abdication of the kernel's responsibility to make common infrastructure concepts share common ABI. The proposal, targeted to conceptually work with TDX, SEV-SNP, COVE if not more, is to define a configfs interface to retrieve the TSM-specific blob. report=/sys/kernel/config/tsm/report/report0 mkdir $report dd if=binary_userdata_plus_nonce > $report/inblob hexdump $report/outblob This approach later allows for the standardization of the attestation blob format without needing to invent a new ABI. Once standardization happens the standard format can be emitted by $report/outblob and indicated by $report/provider, or a new attribute like "$report/tcg_coco_report" can emit the standard format alongside the vendor format. Review of previous iterations of this interface identified that there is a need to scale report generation for multiple container environments [2]. Configfs enables a model where each container can bind mount one or more report generation item instances. Still, within a container only a single thread can be manipulating a given configuration instance at a time. A 'generation' count is provided to detect conflicts between multiple threads racing to configure a report instance. The SEV-SNP concepts of "extended reports" and "privilege levels" are optionally enabled by selecting 'tsm_report_extra_type' at tsm_register() time. The expectation is that those concepts are generic enough that they may be adopted by other TSM implementations. In other words, configfs-tsm aims to address a superset of TSM specific functionality with a common ABI where attributes may appear, or not appear, based on the set of concepts the implementation supports.
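For illustration, a minimal sketch of a backend registering against this interface (the "demo" names are hypothetical; tsm_register()/tsm_unregister() and struct tsm_ops are the interfaces introduced below):

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/tsm.h>

/* Hypothetical backend: emit a vendor-format report derived from inblob. */
static int demo_report_new(struct tsm_report *report, void *data)
{
	u8 *blob = kvzalloc(TSM_INBLOB_MAX, GFP_KERNEL);

	if (!blob)
		return -ENOMEM;
	/* ...derive evidence from report->desc.inblob here... */
	report->outblob = blob;		/* freed with kvfree() by the core */
	report->outblob_len = TSM_INBLOB_MAX;
	return 0;
}

static const struct tsm_ops demo_ops = {
	.name = KBUILD_MODNAME,
	.report_new = demo_report_new,
};

static int __init demo_init(void)
{
	/* A NULL type selects tsm_report_default_type (no privlevel/auxblob). */
	return tsm_register(&demo_ops, NULL, NULL);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	tsm_unregister(&demo_ops);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");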
Link: http://lore.kernel.org/r/64961c3baf8ce_142af829436@dwillia2-xfh.jf.intel.com.notmuch [1] Link: http://lore.kernel.org/r/57f3a05e-8fcd-4656-beea-56bb8365ae64@linux.microsoft.com [2] Cc: Kuppuswamy Sathyanarayanan Cc: Dionna Amalie Glaze Cc: James Bottomley Cc: Peter Gonda Cc: Greg Kroah-Hartman Cc: Samuel Ortiz Acked-by: Greg Kroah-Hartman Acked-by: Thomas Gleixner Reviewed-by: Kuppuswamy Sathyanarayanan Tested-by: Kuppuswamy Sathyanarayanan Reviewed-by: Tom Lendacky Signed-off-by: Dan Williams Signed-off-by: Cai Xinchen --- Documentation/ABI/testing/configfs-tsm | 82 +++++ MAINTAINERS | 8 + drivers/virt/coco/Kconfig | 5 + drivers/virt/coco/Makefile | 1 + drivers/virt/coco/tsm.c | 425 +++++++++++++++++++++++++ include/linux/tsm.h | 69 ++++ 6 files changed, 590 insertions(+) create mode 100644 Documentation/ABI/testing/configfs-tsm create mode 100644 drivers/virt/coco/tsm.c create mode 100644 include/linux/tsm.h diff --git a/Documentation/ABI/testing/configfs-tsm b/Documentation/ABI/testing/configfs-tsm new file mode 100644 index 000000000000..dd24202b5ba5 --- /dev/null +++ b/Documentation/ABI/testing/configfs-tsm @@ -0,0 +1,82 @@ +What: /sys/kernel/config/tsm/report/$name/inblob +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (WO) Up to 64 bytes of user specified binary data. For replay + protection this should include a nonce, but the kernel does not + place any restrictions on the content. + +What: /sys/kernel/config/tsm/report/$name/outblob +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) Binary attestation report generated from @inblob and other + options. The format of the report is implementation specific + where the implementation is conveyed via the @provider + attribute. + +What: /sys/kernel/config/tsm/report/$name/auxblob +Date: October, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) Optional supplemental data that a TSM may emit, visibility + of this attribute depends on TSM, and may be empty if no + auxiliary data is available. + + When @provider is "sev_guest" this file contains the + "cert_table" from SEV-ES Guest-Hypervisor Communication Block + Standardization v2.03 Section 4.1.8.1 MSG_REPORT_REQ. + https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56421.pdf + +What: /sys/kernel/config/tsm/report/$name/provider +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) A name for the format-specification of @outblob like + "sev_guest" [1] or "tdx_guest" [2] in the near term, or a + common standard format in the future. + + [1]: SEV Secure Nested Paging Firmware ABI Specification + Revision 1.55 Table 22 + https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf + + [2]: Intel® Trust Domain Extensions Data Center Attestation + Primitives : Quote Generation Library and Quote Verification + Library Revision 0.8 Appendix 4,5 + https://download.01.org/intel-sgx/latest/dcap-latest/linux/docs/Intel_TDX_DCAP_Quoting_Library_API.pdf + +What: /sys/kernel/config/tsm/report/$name/generation +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) The value in this attribute increments each time @inblob or + any option is written.
Userspace can detect conflicts by + checking generation before writing to any attribute and making + sure the number of writes matches expectations after reading + @outblob, or it can prevent conflicts by creating a report + instance per requesting context. + +What: /sys/kernel/config/tsm/report/$name/privlevel +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (WO) Attribute is visible if a TSM implementation provider + supports the concept of attestation reports for TVMs running at + different privilege levels, like SEV-SNP "VMPL"; specify the + privilege level via this attribute. The minimum acceptable + value is conveyed via @privlevel_floor and the maximum + acceptable value is TSM_PRIVLEVEL_MAX (3). + +What: /sys/kernel/config/tsm/report/$name/privlevel_floor +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) Indicates the minimum permissible value that can be written + to @privlevel. diff --git a/MAINTAINERS b/MAINTAINERS index 082aac073070..15b9057e9216 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21970,6 +21970,14 @@ W: https://github.com/srcres258/linux-doc T: git git://github.com/srcres258/linux-doc.git doc-zh-tw F: Documentation/translations/zh_TW/ +TRUSTED SECURITY MODULE (TSM) ATTESTATION REPORTS +M: Dan Williams +L: linux-coco@lists.linux.dev +S: Maintained +F: Documentation/ABI/testing/configfs-tsm +F: drivers/virt/coco/tsm.c +F: include/linux/tsm.h + TTY LAYER AND SERIAL DRIVERS M: Greg Kroah-Hartman M: Jiri Slaby diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig index d12219bac4e2..2b2e2dd06c7f 100644 --- a/drivers/virt/coco/Kconfig +++ b/drivers/virt/coco/Kconfig @@ -2,6 +2,11 @@ # # Confidential computing related collateral # + +config TSM_REPORTS + select CONFIGFS_FS + tristate + source "drivers/virt/coco/efi_secret/Kconfig" source "drivers/virt/coco/sev-guest/Kconfig" diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile index e3d8acf80128..8b289aaf73b0 100644 --- a/drivers/virt/coco/Makefile +++ b/drivers/virt/coco/Makefile @@ -2,6 +2,7 @@ # # Confidential computing related collateral # +obj-$(CONFIG_TSM_REPORTS) += tsm.o obj-$(CONFIG_EFI_SECRET) += efi_secret/ obj-$(CONFIG_SEV_GUEST) += sev-guest/ obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ diff --git a/drivers/virt/coco/tsm.c b/drivers/virt/coco/tsm.c new file mode 100644 index 000000000000..d1c2db83a8ca --- /dev/null +++ b/drivers/virt/coco/tsm.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. All rights reserved. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct tsm_provider { + const struct tsm_ops *ops; + const struct config_item_type *type; + void *data; +} provider; +static DECLARE_RWSEM(tsm_rwsem); + +/** + * DOC: Trusted Security Module (TSM) Attestation Report Interface + * + * The TSM report interface is a common provider of blobs that facilitate + * attestation of a TVM (confidential computing guest) by an attestation + * service. A TSM report combines a user-defined blob (likely a public-key with + * a nonce for a key-exchange protocol) with a signed attestation report. That + * combined blob is then used to obtain secrets provided by an agent that can + * validate the attestation report. The expectation is that this interface is
The expectation is that this interface is + * invoked infrequently, however configfs allows for multiple agents to + * own their own report generation instances to generate reports as + * often as needed. + * + * The attestation report format is TSM provider specific, when / if a standard + * materializes that can be published instead of the vendor layout. Until then + * the 'provider' attribute indicates the format of 'outblob', and optionally + * 'auxblob'. + */ + +struct tsm_report_state { + struct tsm_report report; + unsigned long write_generation; + unsigned long read_generation; + struct config_item cfg; +}; + +enum tsm_data_select { + TSM_REPORT, + TSM_CERTS, +}; + +static struct tsm_report *to_tsm_report(struct config_item *cfg) +{ + struct tsm_report_state *state = + container_of(cfg, struct tsm_report_state, cfg); + + return &state->report; +} + +static struct tsm_report_state *to_state(struct tsm_report *report) +{ + return container_of(report, struct tsm_report_state, report); +} + +static int try_advance_write_generation(struct tsm_report *report) +{ + struct tsm_report_state *state = to_state(report); + + lockdep_assert_held_write(&tsm_rwsem); + + /* + * Malicious or broken userspace has written enough times for + * read_generation == write_generation by modular arithmetic without an + * interim read. Stop accepting updates until the current report + * configuration is read. + */ + if (state->write_generation == state->read_generation - 1) + return -EBUSY; + state->write_generation++; + return 0; +} + +static ssize_t tsm_report_privlevel_store(struct config_item *cfg, + const char *buf, size_t len) +{ + struct tsm_report *report = to_tsm_report(cfg); + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + /* + * The valid privilege levels that a TSM might accept, if it accepts a + * privilege level setting at all, are a max of TSM_PRIVLEVEL_MAX (see + * SEV-SNP GHCB) and a minimum of a TSM selected floor value no less + * than 0. 
+ */ + if (provider.ops->privlevel_floor > val || val > TSM_PRIVLEVEL_MAX) + return -EINVAL; + + guard(rwsem_write)(&tsm_rwsem); + rc = try_advance_write_generation(report); + if (rc) + return rc; + report->desc.privlevel = val; + + return len; +} +CONFIGFS_ATTR_WO(tsm_report_, privlevel); + +static ssize_t tsm_report_privlevel_floor_show(struct config_item *cfg, + char *buf) +{ + guard(rwsem_read)(&tsm_rwsem); + return sysfs_emit(buf, "%u\n", provider.ops->privlevel_floor); +} +CONFIGFS_ATTR_RO(tsm_report_, privlevel_floor); + +static ssize_t tsm_report_inblob_write(struct config_item *cfg, + const void *buf, size_t count) +{ + struct tsm_report *report = to_tsm_report(cfg); + int rc; + + guard(rwsem_write)(&tsm_rwsem); + rc = try_advance_write_generation(report); + if (rc) + return rc; + + report->desc.inblob_len = count; + memcpy(report->desc.inblob, buf, count); + return count; +} +CONFIGFS_BIN_ATTR_WO(tsm_report_, inblob, NULL, TSM_INBLOB_MAX); + +static ssize_t tsm_report_generation_show(struct config_item *cfg, char *buf) +{ + struct tsm_report *report = to_tsm_report(cfg); + struct tsm_report_state *state = to_state(report); + + guard(rwsem_read)(&tsm_rwsem); + return sysfs_emit(buf, "%lu\n", state->write_generation); +} +CONFIGFS_ATTR_RO(tsm_report_, generation); + +static ssize_t tsm_report_provider_show(struct config_item *cfg, char *buf) +{ + guard(rwsem_read)(&tsm_rwsem); + return sysfs_emit(buf, "%s\n", provider.ops->name); +} +CONFIGFS_ATTR_RO(tsm_report_, provider); + +static ssize_t __read_report(struct tsm_report *report, void *buf, size_t count, + enum tsm_data_select select) +{ + loff_t offset = 0; + ssize_t len; + u8 *out; + + if (select == TSM_REPORT) { + out = report->outblob; + len = report->outblob_len; + } else { + out = report->auxblob; + len = report->auxblob_len; + } + + /* + * Recall that a NULL @buf is configfs requesting the size of + * the buffer. + */ + if (!buf) + return len; + return memory_read_from_buffer(buf, count, &offset, out, len); +} + +static ssize_t read_cached_report(struct tsm_report *report, void *buf, + size_t count, enum tsm_data_select select) +{ + struct tsm_report_state *state = to_state(report); + + guard(rwsem_read)(&tsm_rwsem); + if (!report->desc.inblob_len) + return -EINVAL; + + /* + * A given TSM backend always fills in ->outblob regardless of + * whether the report includes an auxblob or not. + */ + if (!report->outblob || + state->read_generation != state->write_generation) + return -EWOULDBLOCK; + + return __read_report(report, buf, count, select); +} + +static ssize_t tsm_report_read(struct tsm_report *report, void *buf, + size_t count, enum tsm_data_select select) +{ + struct tsm_report_state *state = to_state(report); + const struct tsm_ops *ops; + ssize_t rc; + + /* try to read from the existing report if present and valid... */ + rc = read_cached_report(report, buf, count, select); + if (rc >= 0 || rc != -EWOULDBLOCK) + return rc; + + /* slow path, report may need to be regenerated... */ + guard(rwsem_write)(&tsm_rwsem); + ops = provider.ops; + if (!ops) + return -ENOTTY; + if (!report->desc.inblob_len) + return -EINVAL; + + /* did another thread already generate this report? 
*/ + if (report->outblob && + state->read_generation == state->write_generation) + goto out; + + kvfree(report->outblob); + kvfree(report->auxblob); + report->outblob = NULL; + report->auxblob = NULL; + rc = ops->report_new(report, provider.data); + if (rc < 0) + return rc; + state->read_generation = state->write_generation; +out: + return __read_report(report, buf, count, select); +} + +static ssize_t tsm_report_outblob_read(struct config_item *cfg, void *buf, + size_t count) +{ + struct tsm_report *report = to_tsm_report(cfg); + + return tsm_report_read(report, buf, count, TSM_REPORT); +} +CONFIGFS_BIN_ATTR_RO(tsm_report_, outblob, NULL, TSM_OUTBLOB_MAX); + +static ssize_t tsm_report_auxblob_read(struct config_item *cfg, void *buf, + size_t count) +{ + struct tsm_report *report = to_tsm_report(cfg); + + return tsm_report_read(report, buf, count, TSM_CERTS); +} +CONFIGFS_BIN_ATTR_RO(tsm_report_, auxblob, NULL, TSM_OUTBLOB_MAX); + +#define TSM_DEFAULT_ATTRS() \ + &tsm_report_attr_generation, \ + &tsm_report_attr_provider + +static struct configfs_attribute *tsm_report_attrs[] = { + TSM_DEFAULT_ATTRS(), + NULL, +}; + +static struct configfs_attribute *tsm_report_extra_attrs[] = { + TSM_DEFAULT_ATTRS(), + &tsm_report_attr_privlevel, + &tsm_report_attr_privlevel_floor, + NULL, +}; + +#define TSM_DEFAULT_BIN_ATTRS() \ + &tsm_report_attr_inblob, \ + &tsm_report_attr_outblob + +static struct configfs_bin_attribute *tsm_report_bin_attrs[] = { + TSM_DEFAULT_BIN_ATTRS(), + NULL, +}; + +static struct configfs_bin_attribute *tsm_report_bin_extra_attrs[] = { + TSM_DEFAULT_BIN_ATTRS(), + &tsm_report_attr_auxblob, + NULL, +}; + +static void tsm_report_item_release(struct config_item *cfg) +{ + struct tsm_report *report = to_tsm_report(cfg); + struct tsm_report_state *state = to_state(report); + + kvfree(report->auxblob); + kvfree(report->outblob); + kfree(state); +} + +static struct configfs_item_operations tsm_report_item_ops = { + .release = tsm_report_item_release, +}; + +const struct config_item_type tsm_report_default_type = { + .ct_owner = THIS_MODULE, + .ct_bin_attrs = tsm_report_bin_attrs, + .ct_attrs = tsm_report_attrs, + .ct_item_ops = &tsm_report_item_ops, +}; +EXPORT_SYMBOL_GPL(tsm_report_default_type); + +const struct config_item_type tsm_report_extra_type = { + .ct_owner = THIS_MODULE, + .ct_bin_attrs = tsm_report_bin_extra_attrs, + .ct_attrs = tsm_report_extra_attrs, + .ct_item_ops = &tsm_report_item_ops, +}; +EXPORT_SYMBOL_GPL(tsm_report_extra_type); + +static struct config_item *tsm_report_make_item(struct config_group *group, + const char *name) +{ + struct tsm_report_state *state; + + guard(rwsem_read)(&tsm_rwsem); + if (!provider.ops) + return ERR_PTR(-ENXIO); + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&state->cfg, name, provider.type); + return &state->cfg; +} + +static struct configfs_group_operations tsm_report_group_ops = { + .make_item = tsm_report_make_item, +}; + +static const struct config_item_type tsm_reports_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &tsm_report_group_ops, +}; + +static const struct config_item_type tsm_root_group_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem tsm_configfs = { + .su_group = { + .cg_item = { + .ci_namebuf = "tsm", + .ci_type = &tsm_root_group_type, + }, + }, + .su_mutex = __MUTEX_INITIALIZER(tsm_configfs.su_mutex), +}; + +int tsm_register(const struct tsm_ops *ops, void *priv, + const struct config_item_type *type) +{ + 
const struct tsm_ops *conflict; + + if (!type) + type = &tsm_report_default_type; + if (!(type == &tsm_report_default_type || type == &tsm_report_extra_type)) + return -EINVAL; + + guard(rwsem_write)(&tsm_rwsem); + conflict = provider.ops; + if (conflict) { + pr_err("\"%s\" ops already registered\n", conflict->name); + return -EBUSY; + } + + provider.ops = ops; + provider.data = priv; + provider.type = type; + return 0; +} +EXPORT_SYMBOL_GPL(tsm_register); + +int tsm_unregister(const struct tsm_ops *ops) +{ + guard(rwsem_write)(&tsm_rwsem); + if (ops != provider.ops) + return -EBUSY; + provider.ops = NULL; + provider.data = NULL; + provider.type = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(tsm_unregister); + +static struct config_group *tsm_report_group; + +static int __init tsm_init(void) +{ + struct config_group *root = &tsm_configfs.su_group; + struct config_group *tsm; + int rc; + + config_group_init(root); + rc = configfs_register_subsystem(&tsm_configfs); + if (rc) + return rc; + + tsm = configfs_register_default_group(root, "report", + &tsm_reports_type); + if (IS_ERR(tsm)) { + configfs_unregister_subsystem(&tsm_configfs); + return PTR_ERR(tsm); + } + tsm_report_group = tsm; + + return 0; +} +module_init(tsm_init); + +static void __exit tsm_exit(void) +{ + configfs_unregister_default_group(tsm_report_group); + configfs_unregister_subsystem(&tsm_configfs); +} +module_exit(tsm_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Provide Trusted Security Module attestation reports via configfs"); diff --git a/include/linux/tsm.h b/include/linux/tsm.h new file mode 100644 index 000000000000..de8324a2223c --- /dev/null +++ b/include/linux/tsm.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TSM_H +#define __TSM_H + +#include +#include + +#define TSM_INBLOB_MAX 64 +#define TSM_OUTBLOB_MAX SZ_32K + +/* + * Privilege level is a nested permission concept to allow confidential + * guests to partition address space, 4-levels are supported. + */ +#define TSM_PRIVLEVEL_MAX 3 + +/** + * struct tsm_desc - option descriptor for generating tsm report blobs + * @privlevel: optional privilege level to associate with @outblob + * @inblob_len: sizeof @inblob + * @inblob: arbitrary input data + */ +struct tsm_desc { + unsigned int privlevel; + size_t inblob_len; + u8 inblob[TSM_INBLOB_MAX]; +}; + +/** + * struct tsm_report - track state of report generation relative to options + * @desc: input parameters to @report_new() + * @outblob_len: sizeof(@outblob) + * @outblob: generated evidence to provide to the attestation agent + * @auxblob_len: sizeof(@auxblob) + * @auxblob: (optional) auxiliary data to the report (e.g. certificate data) + */ +struct tsm_report { + struct tsm_desc desc; + size_t outblob_len; + u8 *outblob; + size_t auxblob_len; + u8 *auxblob; +}; + +/** + * struct tsm_ops - attributes and operations for tsm instances + * @name: tsm id reflected in /sys/kernel/config/tsm/report/$report/provider + * @privlevel_floor: convey base privlevel for nested scenarios + * @report_new: Populate @report with the report blob and auxblob + * (optional), return 0 on successful population, or -errno otherwise + * + * Implementation specific ops, only one is expected to be registered at + * a time, i.e. only one of "sev-guest", "tdx-guest", etc.
+ */ +struct tsm_ops { + const char *name; + const unsigned int privlevel_floor; + int (*report_new)(struct tsm_report *report, void *data); +}; + +extern const struct config_item_type tsm_report_default_type; + +/* publish @privlevel, @privlevel_floor, and @auxblob attributes */ +extern const struct config_item_type tsm_report_extra_type; + +int tsm_register(const struct tsm_ops *ops, void *priv, + const struct config_item_type *type); +int tsm_unregister(const struct tsm_ops *ops); +#endif /* __TSM_H */ -- Gitee From 04be186dee8681e14e1c9bfd34bb4a2310c08868 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jun 2025 15:46:00 +0800 Subject: [PATCH 24/84] mm/slab: Add __free() support for kvfree mainline inclusion from mainline-v6.7-rc1 commit a67d74a4b163878a3c0537033ed1b20db92ebfc5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a67d74a4b163878a3c0537033ed1b20db92ebfc5 -------------------------------- Allow for the declaration of variables that trigger kvfree() when they go out of scope. The check for NULL and call to kvfree() can be elided by the compiler in most cases, otherwise without the NULL check an unnecessary call to kvfree() may be emitted. Peter proposed a comment for this detail [1]. Link: http://lore.kernel.org/r/20230816103102.GF980931@hirez.programming.kicks-ass.net [1] Cc: Andrew Morton Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Acked-by: Pankaj Gupta Acked-by: Greg Kroah-Hartman Reviewed-by: Kuppuswamy Sathyanarayanan Tested-by: Kuppuswamy Sathyanarayanan Reviewed-by: Tom Lendacky Signed-off-by: Dan Williams Signed-off-by: Cai Xinchen --- include/linux/slab.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index a761cc3559f2..f30f3ade52cf 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -764,6 +764,8 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) __realloc_size(3); extern void kvfree(const void *addr); +DEFINE_FREE(kvfree, void *, if (_T) kvfree(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -- Gitee From 3c757c5525294271d9adcd2aebebf0e42f4fdadc Mon Sep 17 00:00:00 2001 From: Sami Mujawar Date: Tue, 24 Jun 2025 15:46:01 +0800 Subject: [PATCH 25/84] virt: arm-cca-guest: TSM_REPORT support for realms mainline inclusion from mainline-v6.13-rc1 commit 7999edc484ca376f803562edb2d43ec921642c2a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7999edc484ca376f803562edb2d43ec921642c2a -------------------------------- Introduce an arm-cca-guest driver that registers with the configfs-tsm module to provide user interfaces for retrieving an attestation token. When a new report is requested the arm-cca-guest driver invokes the appropriate RSI interfaces to query an attestation token. The steps to retrieve an attestation token are as follows: 1. Mount the configfs filesystem if not already mounted mount -t configfs none /sys/kernel/config 2. 
Generate an attestation token report=/sys/kernel/config/tsm/report/report0 mkdir $report dd if=/dev/urandom bs=64 count=1 > $report/inblob hexdump -C $report/outblob rmdir $report Signed-off-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20241017131434.40935-11-steven.price@arm.com Signed-off-by: Catalin Marinas Conflicts: drivers/virt/coco/Kconfig drivers/virt/coco/Makefile [The commit be5ee944496f8 ("driver/virt/coco: Add HYGON CSV Guest dirver.") adds csv-guest files resulting in context conflicts] Signed-off-by: Cai Xinchen --- drivers/virt/coco/Kconfig | 2 + drivers/virt/coco/Makefile | 1 + drivers/virt/coco/arm-cca-guest/Kconfig | 11 + drivers/virt/coco/arm-cca-guest/Makefile | 2 + .../virt/coco/arm-cca-guest/arm-cca-guest.c | 224 ++++++++++++++++++ 5 files changed, 240 insertions(+) create mode 100644 drivers/virt/coco/arm-cca-guest/Kconfig create mode 100644 drivers/virt/coco/arm-cca-guest/Makefile create mode 100644 drivers/virt/coco/arm-cca-guest/arm-cca-guest.c diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig index 2b2e2dd06c7f..dd1011c2e59e 100644 --- a/drivers/virt/coco/Kconfig +++ b/drivers/virt/coco/Kconfig @@ -14,3 +14,5 @@ source "drivers/virt/coco/sev-guest/Kconfig" source "drivers/virt/coco/tdx-guest/Kconfig" source "drivers/virt/coco/csv-guest/Kconfig" + +source "drivers/virt/coco/arm-cca-guest/Kconfig" diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile index 8b289aaf73b0..a0c5840a3a33 100644 --- a/drivers/virt/coco/Makefile +++ b/drivers/virt/coco/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_EFI_SECRET) += efi_secret/ obj-$(CONFIG_SEV_GUEST) += sev-guest/ obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ obj-$(CONFIG_CSV_GUEST) += csv-guest/ +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/ diff --git a/drivers/virt/coco/arm-cca-guest/Kconfig b/drivers/virt/coco/arm-cca-guest/Kconfig new file mode 100644 index 000000000000..9dd27c3ee215 --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Kconfig @@ -0,0 +1,11 @@ +config ARM_CCA_GUEST + tristate "Arm CCA Guest driver" + depends on ARM64 + default m + select TSM_REPORTS + help + The driver provides a userspace interface to request an + attestation report from the Realm Management Monitor (RMM). + + If you choose 'M' here, this module will be called + arm-cca-guest. diff --git a/drivers/virt/coco/arm-cca-guest/Makefile b/drivers/virt/coco/arm-cca-guest/Makefile new file mode 100644 index 000000000000..69eeba08e98a --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest.o diff --git a/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c new file mode 100644 index 000000000000..f937f6e1009e --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/** + * struct arm_cca_token_info - a descriptor for the token buffer.
+ * @challenge: Pointer to the challenge data + * @challenge_size: Size of the challenge data + * @granule: PA of the granule to which the token will be written + * @offset: Offset within granule to start of buffer in bytes + * @result: result of rsi_attestation_token_continue operation + */ +struct arm_cca_token_info { + void *challenge; + unsigned long challenge_size; + phys_addr_t granule; + unsigned long offset; + unsigned long result; +}; + +static void arm_cca_attestation_init(void *param) +{ + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + info->result = rsi_attestation_token_init(info->challenge, + info->challenge_size); +} + +/** + * arm_cca_attestation_continue - Retrieve the attestation token data. + * + * @param: pointer to the arm_cca_token_info + * + * Attestation token generation is a long running operation and therefore + * the token data may not be retrieved in a single call. Moreover, the + * token retrieval operation must be requested on the same CPU on which the + * attestation token generation was initialised. + * This helper function is therefore scheduled on the same CPU multiple + * times until the entire token data is retrieved. + */ +static void arm_cca_attestation_continue(void *param) +{ + unsigned long len; + unsigned long size; + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + size = RSI_GRANULE_SIZE - info->offset; + info->result = rsi_attestation_token_continue(info->granule, + info->offset, size, &len); + info->offset += len; +} + +/** + * arm_cca_report_new - Generate a new attestation token. + * + * @report: pointer to the TSM report context information. + * @data: pointer to the context specific data for this module. + * + * Initialise the attestation token generation using the challenge data + * passed in the TSM descriptor. Allocate memory for the attestation token + * and schedule calls to retrieve the attestation token on the same CPU + * on which the attestation token generation was initialised. + * + * The challenge data must be at least 32 bytes and no more than 64 bytes. If + * less than 64 bytes are provided it will be zero padded to 64 bytes. + * + * Return: + * * %0 - Attestation token generated successfully. + * * %-EINVAL - A parameter was not valid. + * * %-ENOMEM - Out of memory. + * * %-EFAULT - Failed to get IPA for memory page(s). + * * A negative status code as returned by smp_call_function_single(). + */ +static int arm_cca_report_new(struct tsm_report *report, void *data) +{ + int ret; + int cpu; + long max_size; + unsigned long token_size = 0; + struct arm_cca_token_info info; + void *buf; + u8 *token __free(kvfree) = NULL; + struct tsm_desc *desc = &report->desc; + + if (desc->inblob_len < 32 || desc->inblob_len > 64) + return -EINVAL; + + /* + * The attestation token 'init' and 'continue' calls must be + * performed on the same CPU. smp_call_function_single() is used + * instead of simply calling get_cpu() because of the need to + * allocate outblob based on the returned value from the 'init' + * call and that cannot be done in an atomic context. 
+ */ + cpu = smp_processor_id(); + + info.challenge = desc->inblob; + info.challenge_size = desc->inblob_len; + + ret = smp_call_function_single(cpu, arm_cca_attestation_init, + &info, true); + if (ret) + return ret; + max_size = info.result; + + if (max_size <= 0) + return -EINVAL; + + /* Allocate outblob */ + token = kvzalloc(max_size, GFP_KERNEL); + if (!token) + return -ENOMEM; + + /* + * Since the outblob may not be physically contiguous, use a page + * to bounce the buffer from RMM. + */ + buf = alloc_pages_exact(RSI_GRANULE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Get the PA of the memory page(s) that were allocated */ + info.granule = (unsigned long)virt_to_phys(buf); + + /* Loop until the token is ready or there is an error */ + do { + /* Retrieve one RSI_GRANULE_SIZE data per loop iteration */ + info.offset = 0; + do { + /* + * Schedule a call to retrieve a sub-granule chunk + * of data per loop iteration. + */ + ret = smp_call_function_single(cpu, + arm_cca_attestation_continue, + (void *)&info, true); + if (ret != 0) { + token_size = 0; + goto exit_free_granule_page; + } + } while (info.result == RSI_INCOMPLETE && + info.offset < RSI_GRANULE_SIZE); + + if (info.result != RSI_SUCCESS) { + ret = -ENXIO; + token_size = 0; + goto exit_free_granule_page; + } + + /* + * Copy the retrieved token data from the granule + * to the token buffer, ensuring that the RMM doesn't + * overflow the buffer. + */ + if (WARN_ON(token_size + info.offset > max_size)) + break; + memcpy(&token[token_size], buf, info.offset); + token_size += info.offset; + } while (info.result == RSI_INCOMPLETE); + + report->outblob = no_free_ptr(token); +exit_free_granule_page: + report->outblob_len = token_size; + free_pages_exact(buf, RSI_GRANULE_SIZE); + return ret; +} + +static const struct tsm_ops arm_cca_tsm_ops = { + .name = KBUILD_MODNAME, + .report_new = arm_cca_report_new, +}; + +/** + * arm_cca_guest_init - Register with the Trusted Security Module (TSM) + * interface. + * + * Return: + * * %0 - Registered successfully with the TSM interface. + * * %-ENODEV - The execution context is not an Arm Realm. + * * %-EBUSY - Already registered. + */ +static int __init arm_cca_guest_init(void) +{ + int ret; + + if (!is_realm_world()) + return -ENODEV; + + ret = tsm_register(&arm_cca_tsm_ops, NULL, NULL); + if (ret < 0) + pr_err("Error %d registering with TSM\n", ret); + + return ret; +} +module_init(arm_cca_guest_init); + +/** + * arm_cca_guest_exit - unregister with the Trusted Security Module (TSM) + * interface. + */ +static void __exit arm_cca_guest_exit(void) +{ + tsm_unregister(&arm_cca_tsm_ops); +} +module_exit(arm_cca_guest_exit); + +MODULE_AUTHOR("Sami Mujawar "); +MODULE_DESCRIPTION("Arm CCA Guest TSM Driver"); +MODULE_LICENSE("GPL"); -- Gitee From 30b120f112505569f6a4167affbe5b4edaee56f9 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 24 Jun 2025 15:46:02 +0800 Subject: [PATCH 26/84] arm64: Document Arm Confidential Compute mainline inclusion from mainline-v6.13-rc1 commit 972d755f01954bd0e36d8696f0d7dc6466072c21 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=972d755f01954bd0e36d8696f0d7dc6466072c21 -------------------------------- Add some documentation on Arm CCA and the requirements for running Linux as a Realm guest. Also update booting.rst to describe the requirement for RIPAS RAM. 
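The arm-cca-guest driver above is also the first user in this series of the scope-based kvfree() cleanup added by [PATCH 24/84]. A minimal sketch of that pattern, assuming only <linux/cleanup.h> and <linux/slab.h> (example_alloc() is an illustrative name, not a function added anywhere in this series):

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	/* Illustrative sketch only -- not code from this series */
	static int example_alloc(size_t size, u8 **out)
	{
		u8 *buf __free(kvfree) = kvzalloc(size, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;

		/* Any early return from here on kvfree()s buf automatically */
		if (size < 32)
			return -EINVAL;

		/* Success: suppress the auto-free and hand ownership out */
		*out = no_free_ptr(buf);
		return 0;
	}

arm_cca_report_new() follows exactly this shape: the token buffer is declared with __free(kvfree) so each error exit frees it, and the single success path transfers ownership to report->outblob via no_free_ptr().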
Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-12-steven.price@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- Documentation/arch/arm64/arm-cca.rst | 69 ++++++++++++++++++++++++++++ Documentation/arch/arm64/booting.rst | 3 ++ Documentation/arch/arm64/index.rst | 1 + 3 files changed, 73 insertions(+) create mode 100644 Documentation/arch/arm64/arm-cca.rst diff --git a/Documentation/arch/arm64/arm-cca.rst b/Documentation/arch/arm64/arm-cca.rst new file mode 100644 index 000000000000..c48b7d4ab6bd --- /dev/null +++ b/Documentation/arch/arm64/arm-cca.rst @@ -0,0 +1,69 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Arm Confidential Compute Architecture +===================================== + +Arm systems that support the Realm Management Extension (RME) contain +hardware to allow a VM guest to be run in a way which protects the code +and data of the guest from the hypervisor. It extends the older "two +world" model (Normal and Secure World) into four worlds: Normal, Secure, +Root and Realm. Linux can then also be run as a guest to a monitor +running in the Realm world. + +The monitor running in the Realm world is known as the Realm Management +Monitor (RMM) and implements the Realm Management Monitor +specification[1]. The monitor acts a bit like a hypervisor (e.g. it runs +in EL2 and manages the stage 2 page tables etc of the guests running in +Realm world), however much of the control is handled by a hypervisor +running in the Normal World. The Normal World hypervisor uses the Realm +Management Interface (RMI) defined by the RMM specification to request +the RMM to perform operations (e.g. mapping memory or executing a vCPU). + +The RMM defines an environment for guests where the address space (IPA) +is split into two. The lower half is protected - any memory that is +mapped in this half cannot be seen by the Normal World and the RMM +restricts what operations the Normal World can perform on this memory +(e.g. the Normal World cannot replace pages in this region without the +guest's cooperation). The upper half is shared, the Normal World is free +to make changes to the pages in this region, and is able to emulate MMIO +devices in this region too. + +A guest running in a Realm may also communicate with the RMM using the +Realm Services Interface (RSI) to request changes in its environment or +to perform attestation about its environment. In particular it may +request that areas of the protected address space are transitioned +between 'RAM' and 'EMPTY' (in either direction). This allows a Realm +guest to give up memory to be returned to the Normal World, or to +request new memory from the Normal World. Without an explicit request +from the Realm guest the RMM will otherwise prevent the Normal World +from making these changes. + +Linux as a Realm Guest +---------------------- + +To run Linux as a guest within a Realm, the following must be provided +either by the VMM or by a `boot loader` run in the Realm before Linux: + + * All protected RAM described to Linux (by DT or ACPI) must be marked + RIPAS RAM before handing control over to Linux. + + * MMIO devices must be either unprotected (e.g. emulated by the Normal + World) or marked RIPAS DEV. + + * MMIO devices emulated by the Normal World and used very early in boot + (specifically earlycon) must be specified in the upper half of IPA. 
+ For earlycon this can be done by specifying the address on the + command line, e.g. with an IPA size of 33 bits and the base address + of the emulated UART at 0x1000000: ``earlycon=uart,mmio,0x101000000`` + + * Linux will use bounce buffers for communicating with unprotected + devices. It will transition some protected memory to RIPAS EMPTY and + expect to be able to access unprotected pages at the same IPA address + but with the highest valid IPA bit set. The expectation is that the + VMM will remove the physical pages from the protected mapping and + provide those pages as unprotected pages. + +References +---------- +[1] https://developer.arm.com/documentation/den0137/ diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..30164fb24a24 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -41,6 +41,9 @@ to automatically locate and size all RAM, or it may use knowledge of the RAM in the machine, or any other method the boot loader designer sees fit.) +For Arm Confidential Compute Realms this includes ensuring that all +protected RAM has a Realm IPA state (RIPAS) of "RAM". + 2. Setup the device tree ------------------------- diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index d08e924204bf..e352f4e6eb96 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -10,6 +10,7 @@ ARM64 Architecture acpi_object_usage amu arm-acpi + arm-cca asymmetric-32bit booting cpu-feature-registers -- Gitee From 5dcdc1156cd500fc2d0641b65065bcbf517a4b0e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jun 2025 15:46:03 +0800 Subject: [PATCH 27/84] configfs-tsm-report: Fix NULL dereference of tsm_ops mainline inclusion from mainline-v6.16-rc1 commit fba4ceaa242d2bdf4c04b77bda41d32d02d3925d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fba4ceaa242d2bdf4c04b77bda41d32d02d3925d -------------------------------- Unlike sysfs, the lifetime of configfs objects is controlled by userspace. There is no mechanism for the kernel to find and delete all created config-items. Instead, the configfs-tsm-report mechanism has an expectation that tsm_unregister() can happen at any time and cause established config-item access to start failing. That expectation is not fully satisfied. While tsm_report_read(), tsm_report_{is,is_bin}_visible(), and tsm_report_make_item() safely fail if tsm_ops have been unregistered, tsm_report_privlevel_store() and tsm_report_provider_show() fail to check for ops registration. Add the missing checks for tsm_ops having been removed. Now, in supporting the ability for tsm_unregister() to always succeed, it leaves the problem of what to do with lingering config-items. The expectation is that the admin that arranges for the ->remove() (unbind) of the ${tsm_arch}-guest driver is also responsible for deletion of all open config-items. Until that deletion happens, ->probe() (reload / bind) of the ${tsm_arch}-guest driver fails. This allows for emergency shutdown / revocation of attestation interfaces, and requires coordinated restart.
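With the fix applied, every configfs attribute handler follows the shape sketched below, assuming the scope-based lock guards from <linux/cleanup.h> (tsm_example_show() is an illustrative name, not a handler added by this patch; tsm_rwsem and provider are the file-scope statics in drivers/virt/coco/tsm.c):

	#include <linux/cleanup.h>
	#include <linux/configfs.h>
	#include <linux/rwsem.h>
	#include <linux/sysfs.h>

	/* Illustrative sketch only -- not code from this patch */
	static ssize_t tsm_example_show(struct config_item *cfg, char *buf)
	{
		/* Held for the rest of the scope, dropped on every return */
		guard(rwsem_read)(&tsm_rwsem);

		/* provider.ops may have been cleared by tsm_unregister() */
		if (!provider.ops)
			return -ENXIO;

		return sysfs_emit(buf, "%s\n", provider.ops->name);
	}

Taking tsm_rwsem before the first provider.ops dereference is what closes the race: tsm_unregister() clears provider.ops while holding the same rwsem for write, so a handler either sees valid ops or fails cleanly with -ENXIO.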
Fixes: 70e6f7e2b985 ("configfs-tsm: Introduce a shared ABI for attestation reports") Cc: stable@vger.kernel.org Cc: Suzuki K Poulose Cc: Steven Price Cc: Sami Mujawar Cc: Borislav Petkov (AMD) Cc: Tom Lendacky Reviewed-by: Kuppuswamy Sathyanarayanan Reported-by: Cedric Xing Reviewed-by: Kai Huang Link: https://patch.msgid.link/20250430203331.1177062-1-dan.j.williams@intel.com Signed-off-by: Dan Williams Conflicts: drivers/virt/coco/tsm.c [Because the commit 20dfee959364 ("x86/sev: Take advantage of configfs visibility support in TSM") is not merged, result in context conflicts.] Signed-off-by: Cai Xinchen --- drivers/virt/coco/tsm.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/virt/coco/tsm.c b/drivers/virt/coco/tsm.c index d1c2db83a8ca..9c80966dd7b6 100644 --- a/drivers/virt/coco/tsm.c +++ b/drivers/virt/coco/tsm.c @@ -16,6 +16,7 @@ static struct tsm_provider { const struct tsm_ops *ops; const struct config_item_type *type; void *data; + atomic_t count; } provider; static DECLARE_RWSEM(tsm_rwsem); @@ -92,6 +93,10 @@ static ssize_t tsm_report_privlevel_store(struct config_item *cfg, if (rc) return rc; + guard(rwsem_write)(&tsm_rwsem); + if (!provider.ops) + return -ENXIO; + /* * The valid privilege levels that a TSM might accept, if it accepts a * privilege level setting at all, are a max of TSM_PRIVLEVEL_MAX (see @@ -101,7 +106,6 @@ static ssize_t tsm_report_privlevel_store(struct config_item *cfg, if (provider.ops->privlevel_floor > val || val > TSM_PRIVLEVEL_MAX) return -EINVAL; - guard(rwsem_write)(&tsm_rwsem); rc = try_advance_write_generation(report); if (rc) return rc; @@ -115,6 +119,10 @@ static ssize_t tsm_report_privlevel_floor_show(struct config_item *cfg, char *buf) { guard(rwsem_read)(&tsm_rwsem); + + if (!provider.ops) + return -ENXIO; + return sysfs_emit(buf, "%u\n", provider.ops->privlevel_floor); } CONFIGFS_ATTR_RO(tsm_report_, privlevel_floor); @@ -149,6 +157,9 @@ CONFIGFS_ATTR_RO(tsm_report_, generation); static ssize_t tsm_report_provider_show(struct config_item *cfg, char *buf) { guard(rwsem_read)(&tsm_rwsem); + if (!provider.ops) + return -ENXIO; + return sysfs_emit(buf, "%s\n", provider.ops->name); } CONFIGFS_ATTR_RO(tsm_report_, provider); @@ -213,7 +224,7 @@ static ssize_t tsm_report_read(struct tsm_report *report, void *buf, guard(rwsem_write)(&tsm_rwsem); ops = provider.ops; if (!ops) - return -ENOTTY; + return -ENXIO; if (!report->desc.inblob_len) return -EINVAL; @@ -326,12 +337,20 @@ static struct config_item *tsm_report_make_item(struct config_group *group, if (!state) return ERR_PTR(-ENOMEM); + atomic_inc(&provider.count); config_item_init_type_name(&state->cfg, name, provider.type); return &state->cfg; } +static void tsm_report_drop_item(struct config_group *group, struct config_item *item) +{ + config_item_put(item); + atomic_dec(&provider.count); +} + static struct configfs_group_operations tsm_report_group_ops = { .make_item = tsm_report_make_item, + .drop_item = tsm_report_drop_item, }; static const struct config_item_type tsm_reports_type = { @@ -370,6 +389,11 @@ int tsm_register(const struct tsm_ops *ops, void *priv, return -EBUSY; } + if (atomic_read(&provider.count)) { + pr_err("configfs/tsm/report not empty\n"); + return -EBUSY; + } + provider.ops = ops; provider.data = priv; provider.type = type; @@ -382,6 +406,9 @@ int tsm_unregister(const struct tsm_ops *ops) guard(rwsem_write)(&tsm_rwsem); if (ops != provider.ops) return -EBUSY; + if (atomic_read(&provider.count)) + pr_warn("\"%s\" 
unregistered with items present in configfs/tsm/report\n", + provider.ops->name); provider.ops = NULL; provider.data = NULL; provider.type = NULL; -- Gitee From c9f2c7c31ddbdbbd343da2555ecb407221b16218 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Jun 2025 15:46:04 +0800 Subject: [PATCH 28/84] MAINTAINERS: Add CCA and pKVM CoCO guest support to the ARM64 entry mainline inclusion from mainline-v6.13-rc2 commit 92230596252ab6155f2d7f7ff9fa61425800a13f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=92230596252ab6155f2d7f7ff9fa61425800a13f -------------------------------- Commits 7999edc484ca ("virt: arm-cca-guest: TSM_REPORT support for realm") and a06c3fad49a5 ("drivers/virt: pkvm: Add initial support for running as a protected guest") added arm64 guest-side support for running in CCA and pKVM confidential computing environments respectively. Unfortunately, these changes were not accompanied by a MAINTAINERS entry and so aren't automatically picked up by the get_maintainer.pl script. Since the initial support was merged via the arm64 tree, extend the ARM64 entry to cover the two new directories. Cc: Marc Zyngier Cc: Oliver Upton Cc: Suzuki K Poulose Signed-off-by: Will Deacon Acked-by: Suzuki K Poulose Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20241202145731.6422-3-will@kernel.org Signed-off-by: Catalin Marinas Signed-off-by: Cai Xinchen --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 15b9057e9216..1af3bc0d6371 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3015,6 +3015,8 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git F: Documentation/arch/arm64/ F: arch/arm64/ +F: drivers/virt/coco/arm-cca-guest/ +F: drivers/virt/coco/pkvm-guest/ F: tools/testing/selftests/arm64/ X: arch/arm64/boot/dts/ -- Gitee From 7157a06ad98fdc8cc8e639cdf29f8137e655b923 Mon Sep 17 00:00:00 2001 From: Cai Xinchen Date: Fri, 11 Jul 2025 17:59:29 +0800 Subject: [PATCH 29/84] config: Set TSM_REPORTS=m and ARM_CCA_GUEST=m hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IBWQ24 -------------------------------- Fix openeuler CI error "CONFIG_ARM_CCA_GUEST=m The configs listed above are introduced but the openeuler_defconfig for aarch64 is not updated" Fixes: 5ada556da5f1 ("config: Enable CONFIG_VIRT_DRIVERS") Signed-off-by: Cai Xinchen [update tencent.config instead] Signed-off-by: Xu Raoqing --- arch/arm64/configs/tencent.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index fb4542b428f6..d6dceaabbf74 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -1417,6 +1417,8 @@ CONFIG_MLX5_VFIO_PCI=m CONFIG_VFIO_PLATFORM=m CONFIG_VFIO_FSL_MC=m CONFIG_VIRT_DRIVERS=y +CONFIG_TSM_REPORTS=m +CONFIG_ARM_CCA_GUEST=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_VDPA=m CONFIG_VIRTIO_BALLOON=m -- Gitee From 676b1c89c7ecce28a2a8d8ae34b3b591bea4f406 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Thu, 18 Sep 2025 20:39:11 +0800 Subject: [PATCH 30/84] KVM: arm64: PMU: Add a helper to read a vCPU's PMCR_EL0 mainline inclusion from mainline-v6.7-rc1 commit 57fc267f1b5caa56dfb46f60e20e673cbc4cc4a8 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/I9Q7QP CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=57fc267f1b5caa56dfb46f60e20e673cbc4cc4a8 -------------------------------- Add a helper to read a vCPU's PMCR_EL0, and use it whenever KVM reads a vCPU's PMCR_EL0. Currently, the PMCR_EL0 value is tracked per vCPU. The following patches will make (only) PMCR_EL0.N track per guest. Having the new helper will be useful to combine the PMCR_EL0.N field (tracked per guest) and the other fields (tracked per vCPU) to provide the value of PMCR_EL0. No functional change intended. Reviewed-by: Sebastian Ott Signed-off-by: Reiji Watanabe Signed-off-by: Raghavendra Rao Ananta Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231020214053.2144305-4-rananta@google.com Signed-off-by: Oliver Upton Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arm.c | 3 +-- arch/arm64/kvm/pmu-emul.c | 21 +++++++++++++++------ arch/arm64/kvm/sys_regs.c | 6 +++--- include/kvm/arm_pmu.h | 6 ++++++ 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fe4314af8eec..ac51acec3c56 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -796,8 +796,7 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) - kvm_pmu_handle_pmcr(vcpu, - __vcpu_sys_reg(vcpu, PMCR_EL0)); + kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu)) kvm_vcpu_pmu_restore_guest(vcpu); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 3867d6d1f5d1..f535b18d51c2 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -72,7 +72,7 @@ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0); + u64 val = kvm_vcpu_read_pmcr(kvm_pmc_to_vcpu(pmc)); return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); @@ -250,7 +250,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { - u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; + u64 val = kvm_vcpu_read_pmcr(vcpu) >> ARMV8_PMU_PMCR_N_SHIFT; val &= ARMV8_PMU_PMCR_N_MASK; if (val == 0) @@ -272,7 +272,7 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!kvm_vcpu_has_pmu(vcpu)) return; - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) return; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { @@ -324,7 +324,7 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { u64 reg = 0; - if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { + if ((kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) { reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } @@ -425,7 +425,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, { int i; - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) return; /* Weed out disabled counters */ @@ -568,7 +568,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && + return (kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) && (__vcpu_sys_reg(vcpu, 
PMCNTENSET_EL0) & BIT(pmc->idx)); } @@ -1071,3 +1071,12 @@ u8 kvm_arm_pmu_get_pmuver_limit(void) ID_AA64DFR0_EL1_PMUVer_V3P5); return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); } + +/** + * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU + * @vcpu: The vcpu pointer + */ +u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) +{ + return __vcpu_sys_reg(vcpu, PMCR_EL0); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index bfcf36c758c2..0a53eb79627f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -828,7 +828,7 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Only update writeable bits of PMCR (continuing into * kvm_pmu_handle_pmcr() as well) */ - val = __vcpu_sys_reg(vcpu, PMCR_EL0); + val = kvm_vcpu_read_pmcr(vcpu); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) @@ -836,7 +836,7 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, kvm_pmu_handle_pmcr(vcpu, val); } else { /* PMCR.P & PMCR.C are RAZ */ - val = __vcpu_sys_reg(vcpu, PMCR_EL0) + val = kvm_vcpu_read_pmcr(vcpu) & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C); p->regval = val; } @@ -885,7 +885,7 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) { u64 pmcr, val; - pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); + pmcr = kvm_vcpu_read_pmcr(vcpu); val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { kvm_inject_undefined(vcpu); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index c4aabbf002f7..934ed4bc96d8 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -102,6 +102,7 @@ void kvm_vcpu_pmu_resync_el0(void); u8 kvm_arm_pmu_get_pmuver_limit(void); +u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); #else struct kvm_pmu { }; @@ -174,6 +175,11 @@ static inline u8 kvm_arm_pmu_get_pmuver_limit(void) } static inline void kvm_vcpu_pmu_resync_el0(void) {} +static inline u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) +{ + return 0; +} + #endif #endif -- Gitee From 50aa1a19192262d5998445184de4a6d3e372dc4f Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 19 Sep 2024 14:25:13 +0800 Subject: [PATCH 31/84] arm: perf/kvm: Use GENMASK for ARMV8_PMU_PMCR_N mainline inclusion from mainline-v6.8-rc1 commit 62e1f212e5fe7624249212813ee96202e0c31430 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/I9Q7QP CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=62e1f212e5fe7624249212813ee96202e0c31430 -------------------------------- This is so that FIELD_GET and FIELD_PREP can be used and that the fields are in a consistent format to arm64/tools/sysreg Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-3-james.clark@arm.com Signed-off-by: Will Deacon Signed-off-by: Junhao He Signed-off-by: Xu Raoqing --- arch/arm64/kvm/pmu-emul.c | 3 +-- arch/arm64/kvm/sys_regs.c | 7 +++---- drivers/perf/arm_pmuv3.c | 4 ++-- include/linux/perf/arm_pmuv3.h | 3 +-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f535b18d51c2..d02944e9b4b7 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -250,9 +250,8 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { - u64 val = kvm_vcpu_read_pmcr(vcpu) >> ARMV8_PMU_PMCR_N_SHIFT; + u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); 
- val &= ARMV8_PMU_PMCR_N_MASK; if (val == 0) return BIT(ARMV8_PMU_CYCLE_IDX); else diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0a53eb79627f..1ce2e1975d57 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -731,8 +731,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) if (!kvm_arm_support_pmu_v3()) return 0; - n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; - n &= ARMV8_PMU_PMCR_N_MASK; + n = FIELD_GET(ARMV8_PMU_PMCR_N, read_sysreg(pmcr_el0)); if (n) mask |= GENMASK(n - 1, 0); @@ -775,7 +774,7 @@ static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return 0; /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ - pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); + pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N; if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; @@ -886,7 +885,7 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) u64 pmcr, val; pmcr = kvm_vcpu_read_pmcr(vcpu); - val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; + val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { kvm_inject_undefined(vcpu); return false; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 0858e6096453..6c0efa5f2eca 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -1111,8 +1112,7 @@ static void __armv8pmu_probe_pmu(void *info) probe->present = true; /* Read the nb of CNTx counters supported from PMNC */ - cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT) - & ARMV8_PMU_PMCR_N_MASK; + cpu_pmu->num_events = FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read()); /* Add the CPU cycles counter */ cpu_pmu->num_events += 1; diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index e3899bd77f5c..28a21d13908a 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -215,8 +215,7 @@ #define ARMV8_PMU_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ -#define ARMV8_PMU_PMCR_N_SHIFT 11 /* Number of counters supported */ -#define ARMV8_PMU_PMCR_N_MASK 0x1f +#define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of counters supported */ #define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ /* -- Gitee From 574a1751c213d4961fb09bec1108f91e314828e9 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Sat, 14 Jun 2025 15:14:49 +0800 Subject: [PATCH 32/84] kvm: arm64: Include kvm_emulate.h in kvm/arm_psci.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Fix a potential build error (like below, when asm/kvm_emulate.h gets included after the kvm/arm_psci.h) by including the missing header file in kvm/arm_psci.h: ./include/kvm/arm_psci.h: In function ‘kvm_psci_version’: ./include/kvm/arm_psci.h:29:13: error: implicit declaration of function ‘vcpu_has_feature’; did you mean ‘cpu_have_feature’? 
[-Werror=implicit-function-declaration] 29 | if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) { | ^~~~~~~~~~~~~~~~ | cpu_have_feature Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- include/kvm/arm_psci.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index 6e55b9283789..bbeb68f031be 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -10,6 +10,8 @@ #include #include +#include + #define KVM_ARM_PSCI_0_1 PSCI_VERSION(0, 1) #define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2) #define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0) -- Gitee From 0c87ec0567ae6c4d4949f88385c6b1558e39f732 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:16:18 +0800 Subject: [PATCH 33/84] arm64: RME: Handle Granule Protection Faults (GPFs) community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- If the host attempts to access granules that have been delegated for use in a realm these accesses will be caught and will trigger a Granule Protection Fault (GPF). A fault during a page walk signals a bug in the kernel and is handled by oopsing the kernel. A non-page walk fault could be caused by user space having access to a page which has been delegated to the kernel and will trigger a SIGBUS to allow debugging why user space is trying to access a delegated page. Reviewed-by: Suzuki K Poulose Reviewed-by: Gavin Shan Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/mm/fault.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index fb997df204e9..b6550cedd4b7 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1728,6 +1728,25 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr, return 0; } +static int do_gpf_ptw(unsigned long far, unsigned long esr, struct pt_regs *regs) +{ + const struct fault_info *inf = esr_to_fault_info(esr); + + die_kernel_fault(inf->name, far, esr, regs); + return 0; +} + +static int do_gpf(unsigned long far, unsigned long esr, struct pt_regs *regs) +{ + const struct fault_info *inf = esr_to_fault_info(esr); + + if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) + return 0; + + arm64_notify_die(inf->name, regs, inf->sig, inf->code, far, esr); + return 0; +} + static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" }, @@ -1764,12 +1783,12 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 34" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 35" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 36" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 37" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level -1" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level 0" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level 1" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level 2" }, + { do_gpf_ptw, 
SIGKILL, SI_KERNEL, "Granule Protection Fault at level 3" }, + { do_gpf, SIGBUS, SI_KERNEL, "Granule Protection Fault not on table walk" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 41" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 43" }, -- Gitee From 0c2d49d71c47b4e5829bf6583deb611c73a18bd2 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:17:34 +0800 Subject: [PATCH 34/84] arm64: RME: Add SMC definitions for calling the RMM community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM (Realm Management Monitor) provides functionality that can be accessed by SMC calls from the host. The SMC definitions are based on DEN0137[1] version 1.0-rel0 [1] https://developer.arm.com/documentation/den0137/1-0rel0/ Reviewed-by: Gavin Shan Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/rmi_smc.h | 259 +++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 arch/arm64/include/asm/rmi_smc.h diff --git a/arch/arm64/include/asm/rmi_smc.h b/arch/arm64/include/asm/rmi_smc.h new file mode 100644 index 000000000000..7a93a3e0ac6e --- /dev/null +++ b/arch/arm64/include/asm/rmi_smc.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023-2024 ARM Ltd. + * + * The values and structures in this file are from the Realm Management Monitor + * specification (DEN0137) version 1.0-rel0: + * https://developer.arm.com/documentation/den0137/1-0rel0/ + */ + +#ifndef __ASM_RMI_SMC_H +#define __ASM_RMI_SMC_H + +#include + +#define SMC_RMI_CALL(func) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + (func)) + +#define SMC_RMI_VERSION SMC_RMI_CALL(0x0150) +#define SMC_RMI_GRANULE_DELEGATE SMC_RMI_CALL(0x0151) +#define SMC_RMI_GRANULE_UNDELEGATE SMC_RMI_CALL(0x0152) +#define SMC_RMI_DATA_CREATE SMC_RMI_CALL(0x0153) +#define SMC_RMI_DATA_CREATE_UNKNOWN SMC_RMI_CALL(0x0154) +#define SMC_RMI_DATA_DESTROY SMC_RMI_CALL(0x0155) + +#define SMC_RMI_REALM_ACTIVATE SMC_RMI_CALL(0x0157) +#define SMC_RMI_REALM_CREATE SMC_RMI_CALL(0x0158) +#define SMC_RMI_REALM_DESTROY SMC_RMI_CALL(0x0159) +#define SMC_RMI_REC_CREATE SMC_RMI_CALL(0x015a) +#define SMC_RMI_REC_DESTROY SMC_RMI_CALL(0x015b) +#define SMC_RMI_REC_ENTER SMC_RMI_CALL(0x015c) +#define SMC_RMI_RTT_CREATE SMC_RMI_CALL(0x015d) +#define SMC_RMI_RTT_DESTROY SMC_RMI_CALL(0x015e) +#define SMC_RMI_RTT_MAP_UNPROTECTED SMC_RMI_CALL(0x015f) + +#define SMC_RMI_RTT_READ_ENTRY SMC_RMI_CALL(0x0161) +#define SMC_RMI_RTT_UNMAP_UNPROTECTED SMC_RMI_CALL(0x0162) + +#define SMC_RMI_PSCI_COMPLETE SMC_RMI_CALL(0x0164) +#define SMC_RMI_FEATURES SMC_RMI_CALL(0x0165) +#define SMC_RMI_RTT_FOLD SMC_RMI_CALL(0x0166) +#define SMC_RMI_REC_AUX_COUNT SMC_RMI_CALL(0x0167) +#define SMC_RMI_RTT_INIT_RIPAS SMC_RMI_CALL(0x0168) +#define SMC_RMI_RTT_SET_RIPAS SMC_RMI_CALL(0x0169) + +#define RMI_ABI_MAJOR_VERSION 1 +#define RMI_ABI_MINOR_VERSION 0 + +#define RMI_ABI_VERSION_GET_MAJOR(version) ((version) >> 16) +#define RMI_ABI_VERSION_GET_MINOR(version) ((version) & 0xFFFF) +#define RMI_ABI_VERSION(major, minor) (((major) << 16) | (minor)) + +#define RMI_UNASSIGNED 0 +#define RMI_ASSIGNED 1 +#define RMI_TABLE 2 + +#define RMI_RETURN_STATUS(ret) ((ret) & 0xFF) +#define RMI_RETURN_INDEX(ret) (((ret) >> 8) & 0xFF) + +#define RMI_SUCCESS 0 +#define RMI_ERROR_INPUT 1 
+#define RMI_ERROR_REALM 2 +#define RMI_ERROR_REC 3 +#define RMI_ERROR_RTT 4 + +enum rmi_ripas { + RMI_EMPTY = 0, + RMI_RAM = 1, + RMI_DESTROYED = 2, +}; + +#define RMI_NO_MEASURE_CONTENT 0 +#define RMI_MEASURE_CONTENT 1 + +#define RMI_FEATURE_REGISTER_0_S2SZ GENMASK(7, 0) +#define RMI_FEATURE_REGISTER_0_LPA2 BIT(8) +#define RMI_FEATURE_REGISTER_0_SVE_EN BIT(9) +#define RMI_FEATURE_REGISTER_0_SVE_VL GENMASK(13, 10) +#define RMI_FEATURE_REGISTER_0_NUM_BPS GENMASK(19, 14) +#define RMI_FEATURE_REGISTER_0_NUM_WPS GENMASK(25, 20) +#define RMI_FEATURE_REGISTER_0_PMU_EN BIT(26) +#define RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS GENMASK(31, 27) +#define RMI_FEATURE_REGISTER_0_HASH_SHA_256 BIT(32) +#define RMI_FEATURE_REGISTER_0_HASH_SHA_512 BIT(33) +#define RMI_FEATURE_REGISTER_0_GICV3_NUM_LRS GENMASK(37, 34) +#define RMI_FEATURE_REGISTER_0_MAX_RECS_ORDER GENMASK(41, 38) +#define RMI_FEATURE_REGISTER_0_Reserved GENMASK(63, 42) + +#define RMI_REALM_PARAM_FLAG_LPA2 BIT(0) +#define RMI_REALM_PARAM_FLAG_SVE BIT(1) +#define RMI_REALM_PARAM_FLAG_PMU BIT(2) + +/* + * Note many of these fields are smaller than u64 but all fields have u64 + * alignment, so use u64 to ensure correct alignment. + */ +struct realm_params { + union { /* 0x0 */ + struct { + u64 flags; + u64 s2sz; + u64 sve_vl; + u64 num_bps; + u64 num_wps; + u64 pmu_num_ctrs; + u64 hash_algo; + }; + u8 padding0[0x400]; + }; + union { /* 0x400 */ + u8 rpv[64]; + u8 padding1[0x400]; + }; + union { /* 0x800 */ + struct { + u64 vmid; + u64 rtt_base; + s64 rtt_level_start; + u64 rtt_num_start; + }; + u8 padding2[0x800]; + }; +}; + +/* + * The number of GPRs (starting from X0) that are + * configured by the host when a REC is created. + */ +#define REC_CREATE_NR_GPRS 8 + +#define REC_PARAMS_FLAG_RUNNABLE BIT_ULL(0) + +#define REC_PARAMS_AUX_GRANULES 16 + +struct rec_params { + union { /* 0x0 */ + u64 flags; + u8 padding0[0x100]; + }; + union { /* 0x100 */ + u64 mpidr; + u8 padding1[0x100]; + }; + union { /* 0x200 */ + u64 pc; + u8 padding2[0x100]; + }; + union { /* 0x300 */ + u64 gprs[REC_CREATE_NR_GPRS]; + u8 padding3[0x500]; + }; + union { /* 0x800 */ + struct { + u64 num_rec_aux; + u64 aux[REC_PARAMS_AUX_GRANULES]; + }; + u8 padding4[0x800]; + }; +}; + +#define REC_ENTER_FLAG_EMULATED_MMIO BIT(0) +#define REC_ENTER_FLAG_INJECT_SEA BIT(1) +#define REC_ENTER_FLAG_TRAP_WFI BIT(2) +#define REC_ENTER_FLAG_TRAP_WFE BIT(3) +#define REC_ENTER_FLAG_RIPAS_RESPONSE BIT(4) + +#define REC_RUN_GPRS 31 +#define REC_MAX_GIC_NUM_LRS 16 + +struct rec_enter { + union { /* 0x000 */ + u64 flags; + u8 padding0[0x200]; + }; + union { /* 0x200 */ + u64 gprs[REC_RUN_GPRS]; + u8 padding1[0x100]; + }; + union { /* 0x300 */ + struct { + u64 gicv3_hcr; + u64 gicv3_lrs[REC_MAX_GIC_NUM_LRS]; + }; + u8 padding2[0x100]; + }; + u8 padding3[0x400]; +}; + +#define RMI_EXIT_SYNC 0x00 +#define RMI_EXIT_IRQ 0x01 +#define RMI_EXIT_FIQ 0x02 +#define RMI_EXIT_PSCI 0x03 +#define RMI_EXIT_RIPAS_CHANGE 0x04 +#define RMI_EXIT_HOST_CALL 0x05 +#define RMI_EXIT_SERROR 0x06 + +struct rec_exit { + union { /* 0x000 */ + u8 exit_reason; + u8 padding0[0x100]; + }; + union { /* 0x100 */ + struct { + u64 esr; + u64 far; + u64 hpfar; + }; + u8 padding1[0x100]; + }; + union { /* 0x200 */ + u64 gprs[REC_RUN_GPRS]; + u8 padding2[0x100]; + }; + union { /* 0x300 */ + struct { + u64 gicv3_hcr; + u64 gicv3_lrs[REC_MAX_GIC_NUM_LRS]; + u64 gicv3_misr; + u64 gicv3_vmcr; + }; + u8 padding3[0x100]; + }; + union { /* 0x400 */ + struct { + u64 cntp_ctl; + u64 cntp_cval; + u64 cntv_ctl; + u64 cntv_cval; + }; + u8 
padding4[0x100]; + }; + union { /* 0x500 */ + struct { + u64 ripas_base; + u64 ripas_top; + u64 ripas_value; + }; + u8 padding5[0x100]; + }; + union { /* 0x600 */ + u16 imm; + u8 padding6[0x100]; + }; + union { /* 0x700 */ + struct { + u8 pmu_ovf_status; + }; + u8 padding7[0x100]; + }; +}; + +struct rec_run { + struct rec_enter enter; + struct rec_exit exit; +}; + +#endif /* __ASM_RMI_SMC_H */ -- Gitee From 138037601855d67e630583afec173d1894e64f27 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:20:06 +0800 Subject: [PATCH 35/84] arm64: RME: Add wrappers for RMI calls community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The wrappers make the call sites easier to read and deal with the boiler plate of handling the error codes from the RMM. Reviewed-by: Gavin Shan Signed-off-by: Steven Price Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/rmi_cmds.h | 508 ++++++++++++++++++++++++++++++ 1 file changed, 508 insertions(+) create mode 100644 arch/arm64/include/asm/rmi_cmds.h diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h new file mode 100644 index 000000000000..27cd2751f3bf --- /dev/null +++ b/arch/arm64/include/asm/rmi_cmds.h @@ -0,0 +1,508 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_RMI_CMDS_H +#define __ASM_RMI_CMDS_H + +#include + +#include + +struct rtt_entry { + unsigned long walk_level; + unsigned long desc; + int state; + int ripas; +}; + +/** + * rmi_data_create() - Create a data granule + * @rd: PA of the RD + * @data: PA of the target granule + * @ipa: IPA at which the granule will be mapped in the guest + * @src: PA of the source granule + * @flags: RMI_MEASURE_CONTENT if the contents should be measured + * + * Create a new data granule, copying contents from a non-secure granule. + * + * Return: RMI return code + */ +static inline int rmi_data_create(unsigned long rd, unsigned long data, + unsigned long ipa, unsigned long src, + unsigned long flags) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_DATA_CREATE, rd, data, ipa, src, + flags, &res); + + return res.a0; +} + +/** + * rmi_data_create_unknown() - Create a data granule with unknown contents + * @rd: PA of the RD + * @data: PA of the target granule + * @ipa: IPA at which the granule will be mapped in the guest + * + * Return: RMI return code + */ +static inline int rmi_data_create_unknown(unsigned long rd, + unsigned long data, + unsigned long ipa) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_DATA_CREATE_UNKNOWN, rd, data, ipa, &res); + + return res.a0; +} + +/** + * rmi_data_destroy() - Destroy a data granule + * @rd: PA of the RD + * @ipa: IPA at which the granule is mapped in the guest + * @data_out: PA of the granule which was destroyed + * @top_out: Top IPA of non-live RTT entries + * + * Unmap a protected IPA from stage 2, transitioning it to DESTROYED. + * The IPA cannot be used by the guest unless it is transitioned to RAM again + * by the realm guest. 
+ * + * Return: RMI return code + */ +static inline int rmi_data_destroy(unsigned long rd, unsigned long ipa, + unsigned long *data_out, + unsigned long *top_out) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_DATA_DESTROY, rd, ipa, &res); + + if (data_out) + *data_out = res.a1; + if (top_out) + *top_out = res.a2; + + return res.a0; +} + +/** + * rmi_features() - Read feature register + * @index: Feature register index + * @out: Feature register value is written to this pointer + * + * Return: RMI return code + */ +static inline int rmi_features(unsigned long index, unsigned long *out) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_FEATURES, index, &res); + + if (out) + *out = res.a1; + return res.a0; +} + +/** + * rmi_granule_delegate() - Delegate a granule + * @phys: PA of the granule + * + * Delegate a granule for use by the realm world. + * + * Return: RMI return code + */ +static inline int rmi_granule_delegate(unsigned long phys) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_GRANULE_DELEGATE, phys, &res); + + return res.a0; +} + +/** + * rmi_granule_undelegate() - Undelegate a granule + * @phys: PA of the granule + * + * Undelegate a granule to allow use by the normal world. Will fail if the + * granule is in use. + * + * Return: RMI return code + */ +static inline int rmi_granule_undelegate(unsigned long phys) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_GRANULE_UNDELEGATE, phys, &res); + + return res.a0; +} + +/** + * rmi_psci_complete() - Complete pending PSCI command + * @calling_rec: PA of the calling REC + * @target_rec: PA of the target REC + * @status: Status of the PSCI request + * + * Completes a pending PSCI command which was called with an MPIDR argument, by + * providing the corresponding REC. + * + * Return: RMI return code + */ +static inline int rmi_psci_complete(unsigned long calling_rec, + unsigned long target_rec, + unsigned long status) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_PSCI_COMPLETE, calling_rec, target_rec, + status, &res); + + return res.a0; +} + +/** + * rmi_realm_activate() - Activate a realm + * @rd: PA of the RD + * + * Mark a realm as Active, signalling that creation is complete and allowing + * execution of the realm. + * + * Return: RMI return code + */ +static inline int rmi_realm_activate(unsigned long rd) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REALM_ACTIVATE, rd, &res); + + return res.a0; +} + +/** + * rmi_realm_create() - Create a realm + * @rd: PA of the RD + * @params: PA of realm parameters + * + * Create a new realm using the given parameters. + * + * Return: RMI return code + */ +static inline int rmi_realm_create(unsigned long rd, unsigned long params) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REALM_CREATE, rd, params, &res); + + return res.a0; +} + +/** + * rmi_realm_destroy() - Destroy a realm + * @rd: PA of the RD + * + * Destroys a realm; all objects belonging to the realm must be destroyed first. + * + * Return: RMI return code + */ +static inline int rmi_realm_destroy(unsigned long rd) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REALM_DESTROY, rd, &res); + + return res.a0; +} + +/** + * rmi_rec_aux_count() - Get number of auxiliary granules required + * @rd: PA of the RD + * @aux_count: Number of granules written to this pointer + * + * A REC may require extra auxiliary granules to be delegated for the RMM to + * store metadata (not visible to the normal world) in.
This function provides + * the number of granules that are required. + * + * Return: RMI return code + */ +static inline int rmi_rec_aux_count(unsigned long rd, unsigned long *aux_count) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_AUX_COUNT, rd, &res); + + if (aux_count) + *aux_count = res.a1; + return res.a0; +} + +/** + * rmi_rec_create() - Create a REC + * @rd: PA of the RD + * @rec: PA of the target REC + * @params: PA of REC parameters + * + * Create a REC using the parameters specified in the struct rec_params pointed + * to by @params. + * + * Return: RMI return code + */ +static inline int rmi_rec_create(unsigned long rd, unsigned long rec, + unsigned long params) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_CREATE, rd, rec, params, &res); + + return res.a0; +} + +/** + * rmi_rec_destroy() - Destroy a REC + * @rec: PA of the target REC + * + * Destroys a REC. The REC must not be running. + * + * Return: RMI return code + */ +static inline int rmi_rec_destroy(unsigned long rec) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_DESTROY, rec, &res); + + return res.a0; +} + +/** + * rmi_rec_enter() - Enter a REC + * @rec: PA of the target REC + * @run_ptr: PA of RecRun structure + * + * Starts (or continues) execution within a REC. + * + * Return: RMI return code + */ +static inline int rmi_rec_enter(unsigned long rec, unsigned long run_ptr) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_ENTER, rec, run_ptr, &res); + + return res.a0; +} + +/** + * rmi_rtt_create() - Creates an RTT + * @rd: PA of the RD + * @rtt: PA of the target RTT + * @ipa: Base of the IPA range described by the RTT + * @level: Depth of the RTT within the tree + * + * Creates an RTT (Realm Translation Table) at the specified level for the + * translation of the specified address within the realm. + * + * Return: RMI return code + */ +static inline int rmi_rtt_create(unsigned long rd, unsigned long rtt, + unsigned long ipa, long level) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_CREATE, rd, rtt, ipa, level, &res); + + return res.a0; +} + +/** + * rmi_rtt_destroy() - Destroy an RTT + * @rd: PA of the RD + * @ipa: Base of the IPA range described by the RTT + * @level: Depth of the RTT within the tree + * @out_rtt: Pointer to write the PA of the RTT which was destroyed + * @out_top: Pointer to write the top IPA of non-live RTT entries + * + * Destroys an RTT. The RTT must be non-live, i.e. none of the entries in the + * table are in ASSIGNED or TABLE state. + * + * Return: RMI return code. + */ +static inline int rmi_rtt_destroy(unsigned long rd, + unsigned long ipa, + long level, + unsigned long *out_rtt, + unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_DESTROY, rd, ipa, level, &res); + + if (out_rtt) + *out_rtt = res.a1; + if (out_top) + *out_top = res.a2; + + return res.a0; +} + +/** + * rmi_rtt_fold() - Fold an RTT + * @rd: PA of the RD + * @ipa: Base of the IPA range described by the RTT + * @level: Depth of the RTT within the tree + * @out_rtt: Pointer to write the PA of the RTT which was destroyed + * + * Folds an RTT. If all entries within the RTT are 'homogeneous' the RTT can be + * folded into the parent and the RTT destroyed.
+ * + * Return: RMI return code + */ +static inline int rmi_rtt_fold(unsigned long rd, unsigned long ipa, + long level, unsigned long *out_rtt) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_FOLD, rd, ipa, level, &res); + + if (out_rtt) + *out_rtt = res.a1; + + return res.a0; +} + +/** + * rmi_rtt_init_ripas() - Set RIPAS for new realm + * @rd: PA of the RD + * @base: Base of target IPA region + * @top: Top of target IPA region + * @out_top: Top IPA of range whose RIPAS was modified + * + * Sets the RIPAS of a target IPA range to RAM, for a realm in the NEW state. + * + * Return: RMI return code + */ +static inline int rmi_rtt_init_ripas(unsigned long rd, unsigned long base, + unsigned long top, unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_INIT_RIPAS, rd, base, top, &res); + + if (out_top) + *out_top = res.a1; + + return res.a0; +} + +/** + * rmi_rtt_map_unprotected() - Map NS granules into a realm + * @rd: PA of the RD + * @ipa: Base IPA of the mapping + * @level: Depth within the RTT tree + * @desc: RTTE descriptor + * + * Create a mapping from an Unprotected IPA to a Non-secure PA. + * + * Return: RMI return code + */ +static inline int rmi_rtt_map_unprotected(unsigned long rd, + unsigned long ipa, + long level, + unsigned long desc) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_MAP_UNPROTECTED, rd, ipa, level, + desc, &res); + + return res.a0; +} + +/** + * rmi_rtt_read_entry() - Read an RTTE + * @rd: PA of the RD + * @ipa: IPA for which to read the RTTE + * @level: RTT level at which to read the RTTE + * @rtt: Output structure describing the RTTE + * + * Reads an RTTE (Realm Translation Table Entry). + * + * Return: RMI return code + */ +static inline int rmi_rtt_read_entry(unsigned long rd, unsigned long ipa, + long level, struct rtt_entry *rtt) +{ + struct arm_smccc_1_2_regs regs = { + SMC_RMI_RTT_READ_ENTRY, + rd, ipa, level + }; + + arm_smccc_1_2_smc(&regs, &regs); + + rtt->walk_level = regs.a1; + rtt->state = regs.a2 & 0xFF; + rtt->desc = regs.a3; + rtt->ripas = regs.a4 & 0xFF; + + return regs.a0; +} + +/** + * rmi_rtt_set_ripas() - Set RIPAS for a running realm + * @rd: PA of the RD + * @rec: PA of the REC making the request + * @base: Base of target IPA region + * @top: Top of target IPA region + * @out_top: Pointer to write top IPA of range whose RIPAS was modified + * + * Completes a request made by the realm to change the RIPAS of a target IPA + * range. + * + * Return: RMI return code + */ +static inline int rmi_rtt_set_ripas(unsigned long rd, unsigned long rec, + unsigned long base, unsigned long top, + unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_SET_RIPAS, rd, rec, base, top, &res); + + if (out_top) + *out_top = res.a1; + + return res.a0; +} + +/** + * rmi_rtt_unmap_unprotected() - Remove a NS mapping + * @rd: PA of the RD + * @ipa: Base IPA of the mapping + * @level: Depth within the RTT tree + * @out_top: Pointer to write top IPA of non-live RTT entries + * + * Removes a mapping at an Unprotected IPA.
+ * + * Return: RMI return code + */ +static inline int rmi_rtt_unmap_unprotected(unsigned long rd, + unsigned long ipa, + long level, + unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_UNMAP_UNPROTECTED, rd, ipa, + level, &res); + + if (out_top) + *out_top = res.a1; + + return res.a0; +} + +#endif /* __ASM_RMI_CMDS_H */ -- Gitee From 83ea8b561dbaadc4a30fed345d054ae23b7c1edf Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:22:11 +0800 Subject: [PATCH 36/84] arm64: RME: Check for RME support at KVM init community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Query the RMI version number and check if it is a compatible version. A static key is also provided to signal that a supported RMM is available. Functions are provided to query if a VM or VCPU is a realm (or rec) which currently will always return false. Later patches make use of struct realm and the states as the ioctls interfaces are added to support realm and REC creation and destruction. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_emulate.h | 18 +++++++++ arch/arm64/include/asm/kvm_host.h | 4 ++ arch/arm64/include/asm/kvm_rme.h | 56 ++++++++++++++++++++++++++++ arch/arm64/include/asm/virt.h | 1 + arch/arm64/kvm/Makefile | 3 +- arch/arm64/kvm/arm.c | 6 +++ arch/arm64/kvm/rme.c | 56 ++++++++++++++++++++++++++++ 7 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/kvm_rme.h create mode 100644 arch/arm64/kvm/rme.c diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 3d6725ff0bf6..ae4f2f94a26a 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -615,4 +615,22 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) kvm_write_cptr_el2(val); } + +static inline bool kvm_is_realm(struct kvm *kvm) +{ + if (static_branch_unlikely(&kvm_rme_is_available) && kvm) + return kvm->arch.is_realm; + return false; +} + +static inline enum realm_state kvm_realm_state(struct kvm *kvm) +{ + return READ_ONCE(kvm->arch.realm.state); +} + +static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu) +{ + return false; +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index df73e452d2cc..339fc315789f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include #include +#include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -278,6 +279,9 @@ struct kvm_arch { * the associated pKVM instance in the hypervisor. */ struct kvm_protected_vm pkvm; + + bool is_realm; + struct realm realm; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h new file mode 100644 index 000000000000..9c8a0b23e0e4 --- /dev/null +++ b/arch/arm64/include/asm/kvm_rme.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_KVM_RME_H +#define __ASM_KVM_RME_H + +/** + * enum realm_state - State of a Realm + */ +enum realm_state { + /** + * @REALM_STATE_NONE: + * Realm has not yet been created. rmi_realm_create() may be + * called to create the realm. 
+ */ + REALM_STATE_NONE, + /** + * @REALM_STATE_NEW: + * Realm is under construction, not eligible for execution. Pages + * may be populated with rmi_data_create(). + */ + REALM_STATE_NEW, + /** + * @REALM_STATE_ACTIVE: + * Realm has been created and is eligible for execution with + * rmi_rec_enter(). Pages may no longer be populated with + * rmi_data_create(). + */ + REALM_STATE_ACTIVE, + /** + * @REALM_STATE_DYING: + * Realm is in the process of being destroyed or has already been + * destroyed. + */ + REALM_STATE_DYING, + /** + * @REALM_STATE_DEAD: + * Realm has been destroyed. + */ + REALM_STATE_DEAD +}; + +/** + * struct realm - Additional per VM data for a Realm + * + * @state: The lifetime state machine for the realm + */ +struct realm { + enum realm_state state; +}; + +void kvm_init_rme(void); + +#endif /* __ASM_KVM_RME_H */ diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 261d6e9df2e1..12cf36c38189 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -81,6 +81,7 @@ void __hyp_reset_vectors(void); bool is_kvm_arm_initialised(void); DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); +DECLARE_STATIC_KEY_FALSE(kvm_rme_is_available); /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index c0c050e53157..1c1d8cdf381f 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -20,7 +20,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ - vgic/vgic-its.o vgic/vgic-debug.o + vgic/vgic-its.o vgic/vgic-debug.o \ + rme.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ac51acec3c56..4f4dea5fe810 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,8 @@ static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; +DEFINE_STATIC_KEY_FALSE(kvm_rme_is_available); + DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); @@ -2410,6 +2413,9 @@ static __init int kvm_arm_init(void) in_hyp_mode = is_kernel_in_hyp_mode(); + if (in_hyp_mode) + kvm_init_rme(); + if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || cpus_have_final_cap(ARM64_WORKAROUND_1508412)) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c new file mode 100644 index 000000000000..67cf2d94cb2d --- /dev/null +++ b/arch/arm64/kvm/rme.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Ltd. 
+ */ + +#include + +#include +#include + +static int rmi_check_version(void) +{ + struct arm_smccc_res res; + unsigned short version_major, version_minor; + unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION, + RMI_ABI_MINOR_VERSION); + + arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res); + + if (res.a0 == SMCCC_RET_NOT_SUPPORTED) + return -ENXIO; + + version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1); + version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1); + + if (res.a0 != RMI_SUCCESS) { + unsigned short high_version_major, high_version_minor; + + high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2); + high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2); + + kvm_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n", + version_major, version_minor, + high_version_major, high_version_minor, + RMI_ABI_MAJOR_VERSION, + RMI_ABI_MINOR_VERSION); + return -ENXIO; + } + + kvm_info("RMI ABI version %d.%d\n", version_major, version_minor); + + return 0; +} + +void kvm_init_rme(void) +{ + if (PAGE_SIZE != SZ_4K) + /* Only 4k page size on the host is supported */ + return; + + if (rmi_check_version()) + /* Continue without realm support */ + return; + + /* Future patch will enable static branch kvm_rme_is_available */ +} -- Gitee From 46c99fc5b2645a109f1cbe78f6dc454fdb706883 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:24:20 +0800 Subject: [PATCH 37/84] arm64: RME: Define the user ABI community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- There is one (multiplexed) CAP which can be used to create, populate and then activate the realm. Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- Documentation/virt/kvm/api.rst | 70 +++++++++++++++++++++++++++++++ arch/arm64/include/uapi/asm/kvm.h | 49 ++++++++++++++++++++++ include/uapi/linux/kvm.h | 10 +++++ 3 files changed, 129 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d258ed3baed8..f6bed0666c91 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3520,6 +3520,11 @@ Possible features: - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can no longer be written using KVM_SET_ONE_REG. + - KVM_ARM_VCPU_REC: Allocate a REC (Realm Execution Context) for this + VCPU. This must be specified on all VCPUs created in a Realm VM. + Depends on KVM_CAP_ARM_RME. + Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_REC). + 4.83 KVM_ARM_PREFERRED_TARGET ----------------------------- @@ -5055,6 +5060,7 @@ Recognised values for feature: ===== =========================================== arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) + arm64 KVM_ARM_VCPU_REC (requires KVM_CAP_ARM_RME) ===== =========================================== Finalizes the configuration of the specified vcpu feature. @@ -6336,6 +6342,30 @@ to the byte array. 
 		__u64 flags;
 	} hypercall;
 
+4.144 KVM_ARM_VCPU_RMM_PSCI_COMPLETE
+------------------------------------
+
+:Capability: KVM_CAP_ARM_RME
+:Architectures: arm64
+:Type: vcpu ioctl
+:Parameters: struct kvm_arm_rmm_psci_complete (in)
+:Returns: 0 if successful, < 0 on error
+
+::
+
+  struct kvm_arm_rmm_psci_complete {
+	__u64 target_mpidr;
+	__u32 psci_status;
+	__u32 padding[3];
+  };
+
+Where PSCI functions are handled by user space, the RMM needs to be informed of
+the target of the operation using `target_mpidr`, along with the status
+(`psci_status`). The RMM v1.0 specification defines two functions that require
+this call: PSCI_CPU_ON and PSCI_AFFINITY_INFO.
+
+If the kernel is handling PSCI then this is done automatically and the VMM
+doesn't need to call this ioctl.
+
 It is strongly recommended that userspace use ``KVM_EXIT_IO`` (x86) or
 ``KVM_EXIT_MMIO`` (all except s390) to implement functionality that
@@ -7833,6 +7863,46 @@ This capability is aimed to mitigate the threat that malicious VMs can
 cause CPU stuck (due to event windows don't open up) and make the CPU
 unavailable to host or other VMs.
 
+7.38 KVM_CAP_ARM_RME
+--------------------
+
+:Architectures: arm64
+:Target: VM
+:Parameters: args[0] provides an action, args[1] points to a structure in
+             memory for some actions.
+:Returns: 0 on success, negative value on error
+
+Used to configure and set up the memory for a Realm. The available actions are:
+
+================================= =============================================
+ KVM_CAP_ARM_RME_CONFIG_REALM     Takes struct arm_rme_config as args[1] and
+                                  configures realm parameters prior to it being
+                                  created.
+
+                                  Options are ARM_RME_CONFIG_RPV to set the
+                                  "Realm Personalization Value" and
+                                  ARM_RME_CONFIG_HASH_ALGO to set the hash
+                                  algorithm.
+
+ KVM_CAP_ARM_RME_CREATE_REALM     Request the RMM create the realm. The realm's
+                                  configuration parameters must be set first.
+
+ KVM_CAP_ARM_RME_INIT_RIPAS_REALM Takes struct arm_rme_init_ripas as args[1]
+                                  and sets the RIPAS (Realm IPA State) of a
+                                  specified area of the realm's IPA space to
+                                  RIPAS_RAM.
+
+ KVM_CAP_ARM_RME_POPULATE_REALM   Takes struct arm_rme_populate_realm as
+                                  args[1] and populates a region of protected
+                                  address space by copying the data from the
+                                  shared alias.
+
+ KVM_CAP_ARM_RME_ACTIVATE_REALM   Request the RMM activate the realm. No
+                                  further changes can be made to the realm's
+                                  configuration, and VCPUs are not permitted to
+                                  enter the realm until it has been activated.
+================================= =============================================
+
 8. Other capabilities.
======================
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f7ddd73a8c0f..1d73809c3851 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -110,6 +110,7 @@ struct kvm_regs {
 #define KVM_ARM_VCPU_PTRAUTH_ADDRESS	5 /* VCPU uses address authentication */
 #define KVM_ARM_VCPU_PTRAUTH_GENERIC	6 /* VCPU uses generic authentication */
 #define KVM_ARM_VCPU_HAS_EL2		7 /* Support nested virtualization */
+#define KVM_ARM_VCPU_REC		9 /* VCPU REC state as part of Realm */
 
 struct kvm_vcpu_init {
 	__u32 target;
@@ -415,6 +416,54 @@ enum {
 #define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
 #define   KVM_DEV_ARM_ITS_CTRL_RESET		4
 
+/* KVM_CAP_ARM_RME on VM fd */
+#define KVM_CAP_ARM_RME_CONFIG_REALM		0
+#define KVM_CAP_ARM_RME_CREATE_REALM		1
+#define KVM_CAP_ARM_RME_INIT_RIPAS_REALM	2
+#define KVM_CAP_ARM_RME_POPULATE_REALM		3
+#define KVM_CAP_ARM_RME_ACTIVATE_REALM		4
+
+/* List of configuration items accepted for KVM_CAP_ARM_RME_CONFIG_REALM */
+#define ARM_RME_CONFIG_RPV			0
+#define ARM_RME_CONFIG_HASH_ALGO		1
+
+#define ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA256	0
+#define ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA512	1
+
+#define ARM_RME_CONFIG_RPV_SIZE 64
+
+struct arm_rme_config {
+	__u32 cfg;
+	union {
+		/* cfg == ARM_RME_CONFIG_RPV */
+		struct {
+			__u8 rpv[ARM_RME_CONFIG_RPV_SIZE];
+		};
+
+		/* cfg == ARM_RME_CONFIG_HASH_ALGO */
+		struct {
+			__u32 hash_algo;
+		};
+
+		/* Fix the size of the union */
+		__u8 reserved[256];
+	};
+};
+
+#define KVM_ARM_RME_POPULATE_FLAGS_MEASURE	(1 << 0)
+struct arm_rme_populate_realm {
+	__u64 base;
+	__u64 size;
+	__u32 flags;
+	__u32 reserved[3];
+};
+
+struct arm_rme_init_ripas {
+	__u64 base;
+	__u64 size;
+	__u64 reserved[2];
+};
+
 /* Device Control API on vcpu fd */
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
 #define KVM_ARM_VCPU_PMU_V3_IRQ	0
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 23fae6de3caa..fc5a128c7cd2 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1207,6 +1207,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
 #define KVM_CAP_USER_MEMORY2 231
+#define KVM_CAP_ARM_RME 240
 
 #define KVM_CAP_SEV_ES_GHCB 500
 #define KVM_CAP_HYGON_COCO_EXT 501
@@ -2407,4 +2408,13 @@ struct kvm_csv3_handle_memory {
 	__u32 opcode;
 };
 
+/* Available with KVM_CAP_ARM_RME, only for VMs with KVM_VM_TYPE_ARM_REALM */
+struct kvm_arm_rmm_psci_complete {
+	__u64 target_mpidr;
+	__u32 psci_status;
+	__u32 padding[3];
+};
+
+#define KVM_ARM_VCPU_RMM_PSCI_COMPLETE	_IOW(KVMIO, 0xd6, struct kvm_arm_rmm_psci_complete)
+
 #endif /* __LINUX_KVM_H */
-- 
Gitee

From 3278e50c0251377fe090691e7e191b999217c347 Mon Sep 17 00:00:00 2001
From: Steven Price
Date: Tue, 17 Jun 2025 16:46:13 +0800
Subject: [PATCH 38/84] arm64: RME: ioctls to create and configure realms

community inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N
Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/

--------------------------------

Add the KVM_CAP_ARM_RME_CREATE_REALM ioctl to create a realm. This
involves delegating pages to the RMM to hold the Realm Descriptor (RD)
and for the base level of the Realm Translation Tables (RTT). A VMID
also needs to be picked; since the RMM has a separate VMID address
space, a dedicated allocator is added for this purpose.

KVM_CAP_ARM_RME_CONFIG_REALM is provided to allow configuring the realm
before it is created.
Configuration options can be classified as:

 1. Parameters specific to the Realm stage2 (e.g. IPA size, vmid, stage2
    entry level, entry level RTTs, number of RTTs in start level, LPA2).
    Most of these are not measured by the RMM and come from KVM
    bookkeeping.

 2. Parameters controlling "Arm Architecture features for the VM" (e.g.
    SVE VL, PMU counters, number of HW BRPs/WPs), configured by the VMM
    using the "user ID register write" mechanism. These will be
    supported in later patches.

 3. Parameters that are not part of the core Arm architecture but are
    defined by the RMM spec (e.g. hash algorithm for measurement,
    personalisation value). These are programmed via
    KVM_CAP_ARM_RME_CONFIG_REALM.

For the IPA size there is the possibility that the RMM supports a
different size to the IPA size supported by KVM for normal guests. At
the moment the 'normal limit' is exposed by KVM_CAP_ARM_VM_IPA_SIZE and
the IPA size is configured by the bottom bits of vm_type in
KVM_CREATE_VM. This means that it isn't easy for the VMM to discover
what IPA sizes are supported for Realm guests.

Since the IPA is part of the measurement of the realm guest, the current
expectation is that the VMM will be required to pick the IPA size
demanded by attestation, and therefore it is fine to simply fail if that
size isn't available. An option would be to expose a new capability
ioctl to obtain the RMM's maximum IPA size if this is needed in the
future.

Co-developed-by: Suzuki K Poulose
Signed-off-by: Suzuki K Poulose
Signed-off-by: Steven Price
Reviewed-by: Gavin Shan
Signed-off-by: Xu Raoqing
---
 arch/arm64/include/asm/kvm_emulate.h |   5 +
 arch/arm64/include/asm/kvm_rme.h     |  19 ++
 arch/arm64/kvm/arm.c                 |  16 ++
 arch/arm64/kvm/mmu.c                 |  54 +++--
 arch/arm64/kvm/rme.c                 | 319 +++++++++++++++++++++++++++
 5 files changed, 400 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index ae4f2f94a26a..7a818f980975 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -628,6 +628,11 @@ static inline enum realm_state kvm_realm_state(struct kvm *kvm)
 	return READ_ONCE(kvm->arch.realm.state);
 }
 
+static inline bool kvm_realm_is_created(struct kvm *kvm)
+{
+	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
+}
+
 static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
 {
 	return false;
diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
index 9c8a0b23e0e4..5dc1915de891 100644
--- a/arch/arm64/include/asm/kvm_rme.h
+++ b/arch/arm64/include/asm/kvm_rme.h
@@ -6,6 +6,8 @@
 #ifndef __ASM_KVM_RME_H
 #define __ASM_KVM_RME_H
 
+#include
+
 /**
  * enum realm_state - State of a Realm
  */
@@ -46,11 +48,28 @@
  * struct realm - Additional per VM data for a Realm
  *
  * @state: The lifetime state machine for the realm
+ * @rd: Kernel mapping of the Realm Descriptor (RD)
+ * @params: Parameters for the RMI_REALM_CREATE command
+ * @num_aux: The number of auxiliary pages required by the RMM
+ * @vmid: VMID to be used by the RMM for the realm
+ * @ia_bits: Number of valid Input Address bits in the IPA
  */
 struct realm {
 	enum realm_state state;
+
+	void *rd;
+	struct realm_params *params;
+
+	unsigned long num_aux;
+	unsigned int vmid;
+	unsigned int ia_bits;
 };
 
 void kvm_init_rme(void);
+u32 kvm_realm_ipa_limit(void);
+
+int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap);
+int kvm_init_realm_vm(struct kvm *kvm);
+void kvm_destroy_realm(struct kvm *kvm);
 
 #endif /* __ASM_KVM_RME_H */
diff --git
a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4f4dea5fe810..ffb4528b8b2d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -118,6 +118,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->slots_lock); break; + case KVM_CAP_ARM_RME: + mutex_lock(&kvm->lock); + r = kvm_realm_enable_cap(kvm, cap); + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -178,6 +183,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + /* Initialise the realm bits after the generic bits are enabled */ + if (kvm_is_realm(kvm)) { + ret = kvm_init_realm_vm(kvm); + if (ret) + goto err_free_cpumask; + } + return 0; err_free_cpumask: @@ -212,6 +224,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_unshare_hyp(kvm, kvm + 1); kvm_arm_teardown_hypercalls(kvm); + kvm_destroy_realm(kvm); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -319,6 +332,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: r = kvm_supported_block_sizes(); break; + case KVM_CAP_ARM_RME: + r = static_key_enabled(&kvm_rme_is_available); + break; default: r = 0; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 482280fe22d7..d820083fc4c2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -853,24 +853,16 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; -/** - * kvm_init_stage2_mmu - Initialise a S2 MMU structure - * @kvm: The pointer to the KVM structure - * @mmu: The pointer to the s2 MMU structure - * @type: The machine type of the virtual machine - * - * Allocates only the stage-2 HW PGD level table(s). - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +static int kvm_init_ipa_range(struct kvm *kvm, + struct kvm_s2_mmu *mmu, unsigned long type) { u32 kvm_ipa_limit = get_kvm_ipa_limit(); - int cpu, err; - struct kvm_pgtable *pgt; u64 mmfr0, mmfr1; u32 phys_shift; + if (kvm_is_realm(kvm)) + kvm_ipa_limit = kvm_realm_ipa_limit(); + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) return -EINVAL; @@ -894,11 +886,33 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + return 0; +} + +/** + * kvm_init_stage2_mmu - Initialise a S2 MMU structure + * @kvm: The pointer to the KVM structure + * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine + * + * Allocates only the stage-2 HW PGD level table(s). + * Note we don't need locking here as this is only called when the VM is + * created, which can only be done once. 
+ */ +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +{ + int cpu, err; + struct kvm_pgtable *pgt; + if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + err = kvm_init_ipa_range(kvm, mmu, type); + if (err) + return err; + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; @@ -1012,6 +1026,20 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) struct kvm_pgtable *pgt = NULL; write_lock(&kvm->mmu_lock); + if (kvm_is_realm(kvm) && + (kvm_realm_state(kvm) != REALM_STATE_DEAD && + kvm_realm_state(kvm) != REALM_STATE_NONE)) { + /* Tearing down RTTs will be added in a later patch */ + write_unlock(&kvm->mmu_lock); + + /* + * The physical PGD pages are delegated to the RMM, so cannot + * be freed at this point. This function will be called again + * from kvm_destroy_realm() after the physical pages have been + * returned at which point the memory can be freed. + */ + return; + } pgt = mmu->pgt; if (pgt) { mmu->pgd_phys = 0; diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 67cf2d94cb2d..b79f9c1e81c6 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -5,9 +5,23 @@ #include +#include +#include #include #include +#include + +static unsigned long rmm_feat_reg0; + +#define RMM_PAGE_SHIFT 12 +#define RMM_PAGE_SIZE BIT(RMM_PAGE_SHIFT) + +static bool rme_has_feature(unsigned long feature) +{ + return !!u64_get_bits(rmm_feat_reg0, feature); +} + static int rmi_check_version(void) { struct arm_smccc_res res; @@ -42,6 +56,305 @@ static int rmi_check_version(void) return 0; } +u32 kvm_realm_ipa_limit(void) +{ + return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ); +} + +static int get_start_level(struct realm *realm) +{ + return 4 - ((realm->ia_bits - 8) / (RMM_PAGE_SHIFT - 3)); +} + +static void free_delegated_granule(phys_addr_t phys) +{ + if (WARN_ON(rmi_granule_undelegate(phys))) { + /* Undelegate failed: leak the page */ + return; + } + + kvm_account_pgtable_pages(phys_to_virt(phys), -1); + + free_page((unsigned long)phys_to_virt(phys)); +} + +/* Calculate the number of s2 root rtts needed */ +static int realm_num_root_rtts(struct realm *realm) +{ + unsigned int ipa_bits = realm->ia_bits; + unsigned int levels = 3 - get_start_level(realm); + unsigned int sl_ipa_bits = (levels + 1) * (RMM_PAGE_SHIFT - 3) + + RMM_PAGE_SHIFT; + + if (sl_ipa_bits >= ipa_bits) + return 1; + + return 1 << (ipa_bits - sl_ipa_bits); +} + +static int realm_create_rd(struct kvm *kvm) +{ + struct realm *realm = &kvm->arch.realm; + struct realm_params *params = realm->params; + void *rd = NULL; + phys_addr_t rd_phys, params_phys; + size_t pgd_size = kvm_pgtable_stage2_pgd_size(kvm->arch.vtcr); + int i, r; + int rtt_num_start; + + realm->ia_bits = VTCR_EL2_IPA(kvm->arch.vtcr); + rtt_num_start = realm_num_root_rtts(realm); + + if (WARN_ON(realm->rd || !realm->params)) + return -EEXIST; + + if (pgd_size / RMM_PAGE_SIZE < rtt_num_start) + return -EINVAL; + + rd = (void *)__get_free_page(GFP_KERNEL); + if (!rd) + return -ENOMEM; + + rd_phys = virt_to_phys(rd); + if (rmi_granule_delegate(rd_phys)) { + r = -ENXIO; + goto free_rd; + } + + for (i = 0; i < pgd_size; i += RMM_PAGE_SIZE) { + phys_addr_t pgd_phys = kvm->arch.mmu.pgd_phys + i; + + if (rmi_granule_delegate(pgd_phys)) { + r = -ENXIO; + goto out_undelegate_tables; + } + } + + params->s2sz = VTCR_EL2_IPA(kvm->arch.vtcr); + params->rtt_level_start = get_start_level(realm); + params->rtt_num_start = rtt_num_start; + params->rtt_base = 
kvm->arch.mmu.pgd_phys;
+	params->vmid = realm->vmid;
+
+	params_phys = virt_to_phys(params);
+
+	if (rmi_realm_create(rd_phys, params_phys)) {
+		r = -ENXIO;
+		goto out_undelegate_tables;
+	}
+
+	if (WARN_ON(rmi_rec_aux_count(rd_phys, &realm->num_aux))) {
+		WARN_ON(rmi_realm_destroy(rd_phys));
+		r = -ENXIO;
+		goto out_undelegate_tables;
+	}
+
+	realm->rd = rd;
+
+	return 0;
+
+out_undelegate_tables:
+	while (i > 0) {
+		i -= RMM_PAGE_SIZE;
+
+		phys_addr_t pgd_phys = kvm->arch.mmu.pgd_phys + i;
+
+		if (WARN_ON(rmi_granule_undelegate(pgd_phys))) {
+			/* Leak the pages if they cannot be returned */
+			kvm->arch.mmu.pgt = NULL;
+			break;
+		}
+	}
+	if (WARN_ON(rmi_granule_undelegate(rd_phys))) {
+		/* Leak the page if it isn't returned */
+		return r;
+	}
+free_rd:
+	free_page((unsigned long)rd);
+	return r;
+}
+
+/* Protects access to rme_vmid_bitmap */
+static DEFINE_SPINLOCK(rme_vmid_lock);
+static unsigned long *rme_vmid_bitmap;
+
+static int rme_vmid_init(void)
+{
+	unsigned int vmid_count = 1 << kvm_get_vmid_bits();
+
+	rme_vmid_bitmap = bitmap_zalloc(vmid_count, GFP_KERNEL);
+	if (!rme_vmid_bitmap) {
+		kvm_err("%s: Couldn't allocate rme vmid bitmap\n", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int rme_vmid_reserve(void)
+{
+	int ret;
+	unsigned int vmid_count = 1 << kvm_get_vmid_bits();
+
+	spin_lock(&rme_vmid_lock);
+	ret = bitmap_find_free_region(rme_vmid_bitmap, vmid_count, 0);
+	spin_unlock(&rme_vmid_lock);
+
+	return ret;
+}
+
+static void rme_vmid_release(unsigned int vmid)
+{
+	spin_lock(&rme_vmid_lock);
+	bitmap_release_region(rme_vmid_bitmap, vmid, 0);
+	spin_unlock(&rme_vmid_lock);
+}
+
+static int kvm_create_realm(struct kvm *kvm)
+{
+	struct realm *realm = &kvm->arch.realm;
+	int ret;
+
+	if (!kvm_is_realm(kvm))
+		return -EINVAL;
+	if (kvm_realm_is_created(kvm))
+		return -EEXIST;
+
+	ret = rme_vmid_reserve();
+	if (ret < 0)
+		return ret;
+	realm->vmid = ret;
+
+	ret = realm_create_rd(kvm);
+	if (ret) {
+		rme_vmid_release(realm->vmid);
+		return ret;
+	}
+
+	WRITE_ONCE(realm->state, REALM_STATE_NEW);
+
+	/* The realm is up, free the parameters.
*/ + free_page((unsigned long)realm->params); + realm->params = NULL; + + return 0; +} + +static int config_realm_hash_algo(struct realm *realm, + struct arm_rme_config *cfg) +{ + switch (cfg->hash_algo) { + case ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA256: + if (!rme_has_feature(RMI_FEATURE_REGISTER_0_HASH_SHA_256)) + return -EINVAL; + break; + case ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA512: + if (!rme_has_feature(RMI_FEATURE_REGISTER_0_HASH_SHA_512)) + return -EINVAL; + break; + default: + return -EINVAL; + } + realm->params->hash_algo = cfg->hash_algo; + return 0; +} + +static int kvm_rme_config_realm(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + struct arm_rme_config cfg; + struct realm *realm = &kvm->arch.realm; + int r = 0; + + if (kvm_realm_is_created(kvm)) + return -EBUSY; + + if (copy_from_user(&cfg, (void __user *)cap->args[1], sizeof(cfg))) + return -EFAULT; + + switch (cfg.cfg) { + case ARM_RME_CONFIG_RPV: + memcpy(&realm->params->rpv, &cfg.rpv, sizeof(cfg.rpv)); + break; + case ARM_RME_CONFIG_HASH_ALGO: + r = config_realm_hash_algo(realm, &cfg); + break; + default: + r = -EINVAL; + } + + return r; +} + +int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + int r = 0; + + if (!kvm_is_realm(kvm)) + return -EINVAL; + + switch (cap->args[0]) { + case KVM_CAP_ARM_RME_CONFIG_REALM: + r = kvm_rme_config_realm(kvm, cap); + break; + case KVM_CAP_ARM_RME_CREATE_REALM: + r = kvm_create_realm(kvm); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +void kvm_destroy_realm(struct kvm *kvm) +{ + struct realm *realm = &kvm->arch.realm; + size_t pgd_size = kvm_pgtable_stage2_pgd_size(kvm->arch.vtcr); + int i; + + if (realm->params) { + free_page((unsigned long)realm->params); + realm->params = NULL; + } + + if (!kvm_realm_is_created(kvm)) + return; + + WRITE_ONCE(realm->state, REALM_STATE_DYING); + + if (realm->rd) { + phys_addr_t rd_phys = virt_to_phys(realm->rd); + + if (WARN_ON(rmi_realm_destroy(rd_phys))) + return; + free_delegated_granule(rd_phys); + realm->rd = NULL; + } + + rme_vmid_release(realm->vmid); + + for (i = 0; i < pgd_size; i += RMM_PAGE_SIZE) { + phys_addr_t pgd_phys = kvm->arch.mmu.pgd_phys + i; + + if (WARN_ON(rmi_granule_undelegate(pgd_phys))) + return; + } + + WRITE_ONCE(realm->state, REALM_STATE_DEAD); + + /* Now that the Realm is destroyed, free the entry level RTTs */ + kvm_free_stage2_pgd(&kvm->arch.mmu); +} + +int kvm_init_realm_vm(struct kvm *kvm) +{ + kvm->arch.realm.params = (void *)get_zeroed_page(GFP_KERNEL); + + if (!kvm->arch.realm.params) + return -ENOMEM; + return 0; +} + void kvm_init_rme(void) { if (PAGE_SIZE != SZ_4K) @@ -52,5 +365,11 @@ void kvm_init_rme(void) /* Continue without realm support */ return; + if (WARN_ON(rmi_features(0, &rmm_feat_reg0))) + return; + + if (rme_vmid_init()) + return; + /* Future patch will enable static branch kvm_rme_is_available */ } -- Gitee From e34b525ca99bbf87a96643618985929e3ad76f0b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Sat, 14 Jun 2025 15:26:38 +0800 Subject: [PATCH 39/84] kvm: arm64: Don't expose debug capabilities for realm guests community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- RMM v1.0 provides no mechanism for the host to perform debug operations on the guest. So don't expose KVM_CAP_SET_GUEST_DEBUG and report 0 breakpoints and 0 watch points. 
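
Because the capability is now VM-dependent, a VMM should query it on the
VM file descriptor rather than on /dev/kvm. A minimal sketch of the
userspace-visible behaviour (illustrative only, assuming the host
supports KVM_CAP_CHECK_EXTENSION_VM):

	/* On a realm VM all three queries return 0; normal VMs are unchanged. */
	int dbg = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SET_GUEST_DEBUG);
	int bps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_BPS);
	int wps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_WPS);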
Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ffb4528b8b2d..23ac7fefe2ce 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -249,7 +249,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: - case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: @@ -257,6 +256,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_COUNTER_OFFSET: r = 1; break; + case KVM_CAP_SET_GUEST_DEBUG: + r = !kvm_is_realm(kvm); + break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; case KVM_CAP_ARM_SET_DEVICE_ADDR: @@ -302,10 +304,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); break; case KVM_CAP_GUEST_DEBUG_HW_BPS: - r = get_num_brps(); + r = kvm_is_realm(kvm) ? 0 : get_num_brps(); break; case KVM_CAP_GUEST_DEBUG_HW_WPS: - r = get_num_wrps(); + r = kvm_is_realm(kvm) ? 0 : get_num_wrps(); break; case KVM_CAP_ARM_PMU_V3: r = kvm_arm_support_pmu_v3(); -- Gitee From d222863c91013ac16d6995520b547e527eb5e759 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:27:38 +0800 Subject: [PATCH 40/84] KVM: arm64: Allow passing machine type in KVM creation community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Previously machine type was used purely for specifying the physical address size of the guest. Reserve the higher bits to specify an ARM specific machine type and declare a new type 'KVM_VM_TYPE_ARM_REALM' used to create a realm guest. Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- Documentation/virt/kvm/api.rst | 16 ++++++++++++++-- arch/arm64/kvm/arm.c | 15 +++++++++++++++ arch/arm64/kvm/mmu.c | 3 --- include/uapi/linux/kvm.h | 19 +++++++++++++++---- 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index f6bed0666c91..49f510bd3a55 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -151,8 +151,20 @@ In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). -On arm64, the physical address size for a VM (IPA Size limit) is limited -to 40bits by default. The limit can be configured if the host supports the +On arm64, the machine type identifier is used to encode a type and the +physical address size for the VM. The lower byte (bits[7-0]) encode the +address size and the upper bits[11-8] encode a machine type. The machine +types that might be available are: + + ====================== ============================================ + KVM_VM_TYPE_ARM_NORMAL A standard VM + KVM_VM_TYPE_ARM_REALM A "Realm" VM using the Arm Confidential + Compute extensions, the VM's memory is + protected from the host. + ====================== ============================================ + +The physical address size for a VM (IPA Size limit) is limited to 40bits +by default. 
The limit can be configured if the host supports the extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type identifier, where IPA_Bits is the maximum width of any physical diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 23ac7fefe2ce..79a2be4704e5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -154,6 +154,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_unlock(&kvm->lock); #endif + if (type & ~(KVM_VM_TYPE_ARM_MASK | KVM_VM_TYPE_ARM_IPA_SIZE_MASK)) + return -EINVAL; + + switch (type & KVM_VM_TYPE_ARM_MASK) { + case KVM_VM_TYPE_ARM_NORMAL: + break; + case KVM_VM_TYPE_ARM_REALM: + if (!static_branch_unlikely(&kvm_rme_is_available)) + return -EPERM; + kvm->arch.is_realm = true; + break; + default: + return -EINVAL; + } + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d820083fc4c2..62540f671493 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -863,9 +863,6 @@ static int kvm_init_ipa_range(struct kvm *kvm, if (kvm_is_realm(kvm)) kvm_ipa_limit = kvm_realm_ipa_limit(); - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; - phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (is_protected_kvm_enabled()) { phys_shift = kvm_ipa_limit; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index fc5a128c7cd2..e0f07ddd42d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -925,14 +925,25 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 /* - * On arm64, machine type can be used to request the physical - * address size for the VM. Bits[7-0] are reserved for the guest - * PA size shift (i.e, log2(PA_Size)). For backward compatibility, - * value 0 implies the default IPA size, 40bits. + * On arm64, machine type can be used to request both the machine type and + * the physical address size for the VM. + * + * Bits[11-8] are reserved for the ARM specific machine type. + * + * Bits[7-0] are reserved for the guest PA size shift (i.e, log2(PA_Size)). + * For backward compatibility, value 0 implies the default IPA size, 40bits. */ +#define KVM_VM_TYPE_ARM_SHIFT 8 +#define KVM_VM_TYPE_ARM_MASK (0xfULL << KVM_VM_TYPE_ARM_SHIFT) +#define KVM_VM_TYPE_ARM(_type) \ + (((_type) << KVM_VM_TYPE_ARM_SHIFT) & KVM_VM_TYPE_ARM_MASK) +#define KVM_VM_TYPE_ARM_NORMAL KVM_VM_TYPE_ARM(0) +#define KVM_VM_TYPE_ARM_REALM KVM_VM_TYPE_ARM(1) + #define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL #define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + /* * ioctls for /dev/kvm fds: */ -- Gitee From 37d0d036aec54e164a19e44a5949c1f18a0e23bc Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:28:58 +0800 Subject: [PATCH 41/84] arm64: RME: RTT tear down community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM owns the stage 2 page tables for a realm, and KVM must request that the RMM creates/destroys entries as necessary. The physical pages to store the page tables are delegated to the realm as required, and can be undelegated when no longer used. Creating new RTTs is the easy part, tearing down is a little more tricky. The result of realm_rtt_destroy() can be used to effectively walk the tree and destroy the entries (undelegating pages that were given to the realm). 
Signed-off-by: Steven Price Reviewed-by: Suzuki K Poulose Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 7 ++ arch/arm64/kvm/mmu.c | 6 +- arch/arm64/kvm/rme.c | 128 +++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 5dc1915de891..5f0de9a6d339 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -71,5 +71,12 @@ u32 kvm_realm_ipa_limit(void); int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); int kvm_init_realm_vm(struct kvm *kvm); void kvm_destroy_realm(struct kvm *kvm); +void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits); + +static inline bool kvm_realm_is_private_address(struct realm *realm, + unsigned long addr) +{ + return !(addr & BIT(realm->ia_bits - 1)); +} #endif /* __ASM_KVM_RME_H */ diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 62540f671493..f21c6729c5b6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1020,14 +1020,15 @@ void stage2_unmap_vm(struct kvm *kvm) void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) { struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); - struct kvm_pgtable *pgt = NULL; + struct kvm_pgtable *pgt; write_lock(&kvm->mmu_lock); + pgt = mmu->pgt; if (kvm_is_realm(kvm) && (kvm_realm_state(kvm) != REALM_STATE_DEAD && kvm_realm_state(kvm) != REALM_STATE_NONE)) { - /* Tearing down RTTs will be added in a later patch */ write_unlock(&kvm->mmu_lock); + kvm_realm_destroy_rtts(kvm, pgt->ia_bits); /* * The physical PGD pages are delegated to the RMM, so cannot @@ -1037,7 +1038,6 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) */ return; } - pgt = mmu->pgt; if (pgt) { mmu->pgd_phys = 0; mmu->pgt = NULL; diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index b79f9c1e81c6..96f47a0faab5 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -17,6 +17,22 @@ static unsigned long rmm_feat_reg0; #define RMM_PAGE_SHIFT 12 #define RMM_PAGE_SIZE BIT(RMM_PAGE_SHIFT) +#define RMM_RTT_BLOCK_LEVEL 2 +#define RMM_RTT_MAX_LEVEL 3 + +/* See ARM64_HW_PGTABLE_LEVEL_SHIFT() */ +#define RMM_RTT_LEVEL_SHIFT(l) \ + ((RMM_PAGE_SHIFT - 3) * (4 - (l)) + 3) +#define RMM_L2_BLOCK_SIZE BIT(RMM_RTT_LEVEL_SHIFT(2)) + +static inline unsigned long rme_rtt_level_mapsize(int level) +{ + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return RMM_PAGE_SIZE; + + return (1UL << RMM_RTT_LEVEL_SHIFT(level)); +} + static bool rme_has_feature(unsigned long feature) { return !!u64_get_bits(rmm_feat_reg0, feature); @@ -173,6 +189,118 @@ static int realm_create_rd(struct kvm *kvm) return r; } +static int realm_rtt_destroy(struct realm *realm, unsigned long addr, + int level, phys_addr_t *rtt_granule, + unsigned long *next_addr) +{ + unsigned long out_rtt; + int ret; + + ret = rmi_rtt_destroy(virt_to_phys(realm->rd), addr, level, + &out_rtt, next_addr); + + *rtt_granule = out_rtt; + + return ret; +} + +static int realm_tear_down_rtt_level(struct realm *realm, int level, + unsigned long start, unsigned long end) +{ + ssize_t map_size; + unsigned long addr, next_addr; + + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return -EINVAL; + + map_size = rme_rtt_level_mapsize(level - 1); + + for (addr = start; addr < end; addr = next_addr) { + phys_addr_t rtt_granule; + int ret; + unsigned long align_addr = ALIGN(addr, map_size); + + next_addr = ALIGN(addr + 1, map_size); + + if (next_addr > end || align_addr != addr) { + /* + * The target range is smaller than what this 
level
+				 * covers, recurse deeper.
+				 */
+				ret = realm_tear_down_rtt_level(realm,
+								level + 1,
+								addr,
+								min(next_addr, end));
+				if (ret)
+					return ret;
+				continue;
+			}
+
+			ret = realm_rtt_destroy(realm, addr, level,
+						&rtt_granule, &next_addr);
+
+			switch (RMI_RETURN_STATUS(ret)) {
+			case RMI_SUCCESS:
+				free_delegated_granule(rtt_granule);
+				break;
+			case RMI_ERROR_RTT:
+				if (next_addr > addr) {
+					/* Missing RTT, skip */
+					break;
+				}
+				/*
+				 * We tear down the RTT range for the full IPA
+				 * space, after everything is unmapped. Also we
+				 * descend down only if we cannot tear down a
+				 * top level RTT. Thus RMM must be able to walk
+				 * to the requested level. e.g., a block mapping
+				 * exists at L1 or L2.
+				 */
+				if (WARN_ON(RMI_RETURN_INDEX(ret) != level))
+					return -EBUSY;
+				if (WARN_ON(level == RMM_RTT_MAX_LEVEL))
+					return -EBUSY;
+
+				/*
+				 * The table has active entries in it, recurse
+				 * deeper and tear down the RTTs.
+				 */
+				next_addr = ALIGN(addr + 1, map_size);
+				ret = realm_tear_down_rtt_level(realm,
+								level + 1,
+								addr,
+								next_addr);
+				if (ret)
+					return ret;
+				/*
+				 * Now that the child RTTs are destroyed,
+				 * retry at this level.
+				 */
+				next_addr = addr;
+				break;
+			default:
+				WARN_ON(1);
+				return -ENXIO;
+			}
+	}
+
+	return 0;
+}
+
+static int realm_tear_down_rtt_range(struct realm *realm,
+				     unsigned long start, unsigned long end)
+{
+	return realm_tear_down_rtt_level(realm, get_start_level(realm) + 1,
+					 start, end);
+}
+
+void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits)
+{
+	struct realm *realm = &kvm->arch.realm;
+
+	WARN_ON(realm_tear_down_rtt_range(realm, 0, (1UL << ia_bits)));
+}
+
 /* Protects access to rme_vmid_bitmap */
 static DEFINE_SPINLOCK(rme_vmid_lock);
 static unsigned long *rme_vmid_bitmap;
-- 
Gitee

From e54df73f9bb24c177cef9f5501a9a07eaf2cc5d6 Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Sat, 14 Jun 2025 15:29:47 +0800
Subject: [PATCH 42/84] KVM: arm64: Add generic check for system-supported
 vCPU features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mainline inclusion
from mainline-v6.7-rc1
commit ef150908b6bd80a54126dbec324bd63a24a5628a
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ef150908b6bd80a54126dbec324bd63a24a5628a

--------------------------------

[ Upstream commit ef150908b6bd80a54126dbec324bd63a24a5628a ]

To date KVM has relied on kvm_reset_vcpu() failing when the vCPU
feature flags are unsupported by the system. This is a bit messy since
kvm_reset_vcpu() is called at runtime outside of the KVM_ARM_VCPU_INIT
ioctl when it is expected to succeed.

Further complicating the matter is that kvm_reset_vcpu() must be
idempotent with respect to the config_lock, as it isn't consistently
called with the lock held.

Prepare to move feature compatibility checks out of kvm_reset_vcpu()
with a 'generic' check that compares the user-provided flags with a
computed maximum feature set for the system.
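
From userspace the new check boils down to one mask comparison at
KVM_ARM_VCPU_INIT time; a hedged example of the resulting behaviour
(illustrative only, feature bit names taken from the uapi header):

	struct kvm_vcpu_init init = { .target = KVM_ARM_TARGET_GENERIC_V8 };

	init.features[0] = 1U << KVM_ARM_VCPU_EL1_32BIT;
	/* Fails with EINVAL on a system without 32-bit EL1 support */
	ret = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);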
Reviewed-by: Philippe Mathieu-Daudé
Link: https://lore.kernel.org/r/20230920195036.1169791-2-oliver.upton@linux.dev
Signed-off-by: Oliver Upton
Signed-off-by: Yiwei Zhuang
[xuraoqing: backport patch to check system vcpu support]
Signed-off-by: Xu Raoqing
---
 arch/arm64/kvm/arm.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 79a2be4704e5..17ea494f2ad7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1220,6 +1220,16 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 	return -EINVAL;
 }
 
+static unsigned long system_supported_vcpu_features(void)
+{
+	unsigned long features = KVM_VCPU_VALID_FEATURES;
+
+	if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
+		clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
+
+	return features;
+}
+
 static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
 					const struct kvm_vcpu_init *init)
 {
@@ -1234,12 +1244,12 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
 		return -ENOENT;
 	}
 
+	if (features & ~system_supported_vcpu_features())
+		return -EINVAL;
+
 	if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
 		return 0;
 
-	if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1))
-		return -EINVAL;
-
 	/* MTE is incompatible with AArch32 */
 	if (kvm_has_mte(vcpu->kvm))
 		return -EINVAL;
-- 
Gitee

From 8e143d17f16eb097a17ca9d5db0f8d3b9fc5632d Mon Sep 17 00:00:00 2001
From: Steven Price
Date: Sat, 14 Jun 2025 15:32:04 +0800
Subject: [PATCH 43/84] arm64: RME: Allocate/free RECs to match vCPUs

community inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N
Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/

--------------------------------

The RMM maintains a data structure known as the Realm Execution Context
(or REC). It is similar to struct kvm_vcpu and tracks the state of the
virtual CPUs. KVM must delegate memory and request that the structures
are created when vCPUs are created, and suitably tear them down on
destruction.

RECs must also be supplied with additional pages - auxiliary (or AUX)
granules - for storing the larger register state (e.g. for SVE). The
number of AUX granules for a REC depends on the parameters with which
the Realm was created - the RMM makes this information available via
the RMI_REC_AUX_COUNT call performed after creating the Realm
Descriptor (RD).

Note that only some of the register state for the REC can be set by
KVM; the rest is defined by the RMM (zeroed). The register state then
cannot be changed by KVM after the REC is created (except when the
guest explicitly requests this, e.g. by performing a PSCI call).

The RMM also requires that the VMM creates RECs in ascending order of
the MPIDR.
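
A sketch of the resulting VMM flow (illustrative only; the REC itself
is created by KVM_ARM_VCPU_FINALIZE, as wired up in reset.c below):

	int feature = KVM_ARM_VCPU_REC;

	/* Create and finalize vCPUs in ascending MPIDR order */
	for (i = 0; i < nr_vcpus; i++) {
		ioctl(vcpu_fd[i], KVM_ARM_VCPU_INIT, &init); /* features include REC */
		ioctl(vcpu_fd[i], KVM_ARM_VCPU_FINALIZE, &feature);
	}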
See Realm Management Monitor specification (DEN0137) for more information: https://developer.arm.com/documentation/den0137/ Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_emulate.h | 7 ++ arch/arm64/include/asm/kvm_host.h | 3 + arch/arm64/include/asm/kvm_rme.h | 27 ++++ arch/arm64/kvm/arm.c | 13 +- arch/arm64/kvm/reset.c | 11 ++ arch/arm64/kvm/rme.c | 180 +++++++++++++++++++++++++++ 6 files changed, 239 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 7a818f980975..426e8e6bbf09 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -635,7 +635,14 @@ static inline bool kvm_realm_is_created(struct kvm *kvm) static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu) { + if (static_branch_unlikely(&kvm_rme_is_available)) + return vcpu_has_feature(vcpu, KVM_ARM_VCPU_REC); return false; } +static inline bool kvm_arm_rec_finalized(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.rec.mpidr != INVALID_HWID; +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 339fc315789f..db84d0622021 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -592,6 +592,9 @@ struct kvm_vcpu_arch { /* Per-vcpu CCSIDR override or NULL */ u32 *ccsidr; + + /* Realm meta data */ + struct realm_rec rec; }; /* diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 5f0de9a6d339..f716b890e484 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -6,6 +6,7 @@ #ifndef __ASM_KVM_RME_H #define __ASM_KVM_RME_H +#include #include /** @@ -65,6 +66,30 @@ struct realm { unsigned int ia_bits; }; +/** + * struct realm_rec - Additional per VCPU data for a Realm + * + * @mpidr: MPIDR (Multiprocessor Affinity Register) value to identify this VCPU + * @rec_page: Kernel VA of the RMM's private page for this REC + * @aux_pages: Additional pages private to the RMM for this REC + * @run: Kernel VA of the RmiRecRun structure shared with the RMM + */ +struct realm_rec { + unsigned long mpidr; + void *rec_page; + /* + * REC_PARAMS_AUX_GRANULES is the maximum number of granules that the + * RMM can require. By using that to size the array we know that it + * will be big enough as the page size is always at least as large as + * the granule size. In the case of a larger page size than 4k (or an + * RMM which requires fewer auxiliary granules), the array will be + * bigger than needed however the extra memory required is small and + * this keeps the code cleaner. 
+ */ + struct page *aux_pages[REC_PARAMS_AUX_GRANULES]; + struct rec_run *run; +}; + void kvm_init_rme(void); u32 kvm_realm_ipa_limit(void); @@ -72,6 +97,8 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); int kvm_init_realm_vm(struct kvm *kvm); void kvm_destroy_realm(struct kvm *kvm); void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits); +int kvm_create_rec(struct kvm_vcpu *vcpu); +void kvm_destroy_rec(struct kvm_vcpu *vcpu); static inline bool kvm_realm_is_private_address(struct realm *realm, unsigned long addr) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 17ea494f2ad7..3434bf1223b1 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -404,6 +404,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu_clear_flag(vcpu, VCPU_INITIALIZED); bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + vcpu->arch.rec.mpidr = INVALID_HWID; + vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; /* @@ -1220,13 +1222,16 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return -EINVAL; } -static unsigned long system_supported_vcpu_features(void) +static unsigned long system_supported_vcpu_features(struct kvm *kvm) { unsigned long features = KVM_VCPU_VALID_FEATURES; if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); + if (!kvm_is_realm(kvm)) + clear_bit(KVM_ARM_VCPU_REC, &features); + return features; } @@ -1244,7 +1249,7 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, return -ENOENT; } - if (features & ~system_supported_vcpu_features()) + if (features & ~system_supported_vcpu_features(vcpu->kvm)) return -EINVAL; if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) @@ -1258,6 +1263,10 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features)) return -EINVAL; + /* RME is incompatible with AArch32 */ + if (test_bit(KVM_ARM_VCPU_REC, &features)) + return -EINVAL; + return 0; } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 43a53a403f51..c73735ad8fae 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -142,6 +142,11 @@ int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) return -EPERM; return kvm_vcpu_finalize_sve(vcpu); + case KVM_ARM_VCPU_REC: + if (!kvm_is_realm(vcpu->kvm) || !vcpu_is_rec(vcpu)) + return -EINVAL; + + return kvm_create_rec(vcpu); } return -EINVAL; @@ -152,6 +157,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu)) return false; + if (kvm_is_realm(vcpu->kvm) && + !(vcpu_is_rec(vcpu) && kvm_arm_rec_finalized(vcpu) && + READ_ONCE(vcpu->kvm->arch.realm.state) == REALM_STATE_ACTIVE)) + return false; + return true; } @@ -165,6 +175,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); kfree(vcpu->arch.ccsidr); + kvm_destroy_rec(vcpu); } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 96f47a0faab5..9cd99317003a 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -474,6 +474,186 @@ void kvm_destroy_realm(struct kvm *kvm) kvm_free_stage2_pgd(&kvm->arch.mmu); } +static void free_rec_aux(struct page **aux_pages, + unsigned int num_aux) +{ + unsigned int i, j; + unsigned int page_count = 0; + + for (i = 0; i < num_aux;) { + struct page *aux_page = aux_pages[page_count++]; + phys_addr_t aux_page_phys = page_to_phys(aux_page); + bool 
should_free = true; + + for (j = 0; j < PAGE_SIZE && i < num_aux; j += RMM_PAGE_SIZE) { + if (WARN_ON(rmi_granule_undelegate(aux_page_phys))) + should_free = false; + aux_page_phys += RMM_PAGE_SIZE; + i++; + } + /* Only free if all the undelegate calls were successful */ + if (should_free) + __free_page(aux_page); + } +} + +static int alloc_rec_aux(struct page **aux_pages, + u64 *aux_phys_pages, + unsigned int num_aux) +{ + struct page *aux_page; + int page_count = 0; + unsigned int i, j; + int ret; + + for (i = 0; i < num_aux;) { + phys_addr_t aux_page_phys; + + aux_page = alloc_page(GFP_KERNEL); + if (!aux_page) { + ret = -ENOMEM; + goto out_err; + } + + aux_page_phys = page_to_phys(aux_page); + for (j = 0; j < PAGE_SIZE && i < num_aux; j += RMM_PAGE_SIZE) { + if (rmi_granule_delegate(aux_page_phys)) { + ret = -ENXIO; + goto err_undelegate; + } + aux_phys_pages[i++] = aux_page_phys; + aux_page_phys += RMM_PAGE_SIZE; + } + aux_pages[page_count++] = aux_page; + } + + return 0; +err_undelegate: + while (j > 0) { + j -= RMM_PAGE_SIZE; + i--; + if (WARN_ON(rmi_granule_undelegate(aux_phys_pages[i]))) { + /* Leak the page if the undelegate fails */ + goto out_err; + } + } + __free_page(aux_page); +out_err: + free_rec_aux(aux_pages, i); + return ret; +} + +int kvm_create_rec(struct kvm_vcpu *vcpu) +{ + struct user_pt_regs *vcpu_regs = vcpu_gp_regs(vcpu); + unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + struct realm *realm = &vcpu->kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long rec_page_phys; + struct rec_params *params; + int r, i; + + if (kvm_realm_state(vcpu->kvm) != REALM_STATE_NEW) + return -ENOENT; + + if (rec->run) + return -EBUSY; + + /* + * The RMM will report PSCI v1.0 to Realms and the KVM_ARM_VCPU_PSCI_0_2 + * flag covers v0.2 and onwards. 
+ */ + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) + return -EINVAL; + + BUILD_BUG_ON(sizeof(*params) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(*rec->run) > PAGE_SIZE); + + params = (struct rec_params *)get_zeroed_page(GFP_KERNEL); + rec->rec_page = (void *)__get_free_page(GFP_KERNEL); + rec->run = (void *)get_zeroed_page(GFP_KERNEL); + if (!params || !rec->rec_page || !rec->run) { + r = -ENOMEM; + goto out_free_pages; + } + + for (i = 0; i < ARRAY_SIZE(params->gprs); i++) + params->gprs[i] = vcpu_regs->regs[i]; + + params->pc = vcpu_regs->pc; + + if (vcpu->vcpu_id == 0) + params->flags |= REC_PARAMS_FLAG_RUNNABLE; + + rec_page_phys = virt_to_phys(rec->rec_page); + + if (rmi_granule_delegate(rec_page_phys)) { + r = -ENXIO; + goto out_free_pages; + } + + r = alloc_rec_aux(rec->aux_pages, params->aux, realm->num_aux); + if (r) + goto out_undelegate_rmm_rec; + + params->num_rec_aux = realm->num_aux; + params->mpidr = mpidr; + + if (rmi_rec_create(virt_to_phys(realm->rd), + rec_page_phys, + virt_to_phys(params))) { + r = -ENXIO; + goto out_free_rec_aux; + } + + rec->mpidr = mpidr; + + free_page((unsigned long)params); + return 0; + +out_free_rec_aux: + free_rec_aux(rec->aux_pages, realm->num_aux); +out_undelegate_rmm_rec: + if (WARN_ON(rmi_granule_undelegate(rec_page_phys))) + rec->rec_page = NULL; +out_free_pages: + free_page((unsigned long)rec->run); + free_page((unsigned long)rec->rec_page); + free_page((unsigned long)params); + return r; +} + +void kvm_destroy_rec(struct kvm_vcpu *vcpu) +{ + struct realm *realm = &vcpu->kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long rec_page_phys; + + if (!vcpu_is_rec(vcpu)) + return; + + if (!rec->run) { + /* Nothing to do if the VCPU hasn't been finalized */ + return; + } + + free_page((unsigned long)rec->run); + + rec_page_phys = virt_to_phys(rec->rec_page); + + /* + * The REC and any AUX pages cannot be reclaimed until the REC is + * destroyed. So if the REC destroy fails then the REC page and any AUX + * pages will be leaked. + */ + if (WARN_ON(rmi_rec_destroy(rec_page_phys))) + return; + + free_rec_aux(rec->aux_pages, realm->num_aux); + + free_delegated_granule(rec_page_phys); +} + int kvm_init_realm_vm(struct kvm *kvm) { kvm->arch.realm.params = (void *)get_zeroed_page(GFP_KERNEL); -- Gitee From b53dc907f3caaab81c816185f50cccb7ca2a7123 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:32:57 +0800 Subject: [PATCH 44/84] KVM: arm64: vgic: Provide helper for number of list registers community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Currently the number of list registers available is stored in a global (kvm_vgic_global_state.nr_lr). With Arm CCA the RMM is permitted to reserve list registers for its own use and so the number of available list registers can be fewer for a realm VM. Provide a wrapper function to fetch the global in preparation for restricting nr_lr when dealing with a realm VM. 
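
The expectation is that a later patch in this series specialises the
helper for realm vCPUs; a sketch of the likely shape
(kvm_realm_vgic_nr_lr() is only introduced by a subsequent patch):

	static inline int kvm_vcpu_vgic_nr_lr(struct kvm_vcpu *vcpu)
	{
		if (unlikely(vcpu_is_rec(vcpu)))
			return kvm_realm_vgic_nr_lr();
		return kvm_vgic_global_state.nr_lr;
	}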
Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/vgic/vgic.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 8be4c1ebdec2..381623a5715d 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -21,6 +21,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { .gicv3_cpuif = STATIC_KEY_FALSE_INIT, }; +static inline int kvm_vcpu_vgic_nr_lr(struct kvm_vcpu *vcpu) +{ + return kvm_vgic_global_state.nr_lr; +} + /* * Locking order is always: * kvm->lock (mutex) @@ -808,7 +813,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) lockdep_assert_held(&vgic_cpu->ap_list_lock); count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) + if (count > kvm_vcpu_vgic_nr_lr(vcpu) || multi_sgi) vgic_sort_ap_list(vcpu); count = 0; @@ -837,7 +842,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) raw_spin_unlock(&irq->irq_lock); - if (count == kvm_vgic_global_state.nr_lr) { + if (count == kvm_vcpu_vgic_nr_lr(vcpu)) { if (!list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head)) vgic_set_underflow(vcpu); @@ -846,7 +851,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) } /* Nuke remaining LRs */ - for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + for (i = count ; i < kvm_vcpu_vgic_nr_lr(vcpu); i++) vgic_clear_lr(vcpu, i); if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) -- Gitee From da314cd8db057f18a68fd64b794cfdb3660f99fe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 14 Jun 2025 15:34:25 +0800 Subject: [PATCH 45/84] KVM: arm64: Force GICv3 trap activation when no irqchip is configured on VHE mainline inclusion from mainline-v6.12-rc1 commit 8d917e0a8651377321c06513f42e2ab9a86161f4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8d917e0a8651377321c06513f42e2ab9a86161f4 -------------------------------- [ Upstream commit 8d917e0a8651377321c06513f42e2ab9a86161f4 ] On a VHE system, no GICv3 traps get configured when no irqchip is present. This is not quite matching the "no GICv3" semantics that we want to present. Force such traps to be configured in this case. 
Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20240827152517.3909653-4-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- arch/arm64/kvm/vgic/vgic.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 381623a5715d..6430de54bb2b 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -944,10 +944,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -955,10 +958,13 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); -- Gitee From 81f99252b87135aa6859108b171d4815d4b06cd1 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:35:11 +0800 Subject: [PATCH 46/84] arm64: RME: Support for the VGIC in realms community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM provides emulation of a VGIC to the realm guest but delegates much of the handling to the host. Implement support in KVM for saving/restoring state to/from the REC structure. 
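Conceptually, the REC run structure is a page shared with the RMM: an "enter" half written by the host and an "exit" half written by the RMM, with the VGIC list registers present in both halves. A rough sketch of the shape (illustrative only; the authoritative layout is fixed by the RMM specification, and the field names below merely mirror those used in the diff):

    #include <stdint.h>

    #define SKETCH_NR_LRS 16  /* the real count is advertised by the RMM */

    struct rec_run_sketch {
        struct {
            uint64_t gicv3_lrs[SKETCH_NR_LRS];  /* host -> RMM on entry */
        } enter;
        struct {
            uint64_t gicv3_lrs[SKETCH_NR_LRS];  /* RMM -> host on exit */
            uint64_t gicv3_vmcr;
        } exit;
    };

    int main(void)
    {
        struct rec_run_sketch run = { 0 };
        return (int)run.exit.gicv3_vmcr;
    }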
Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 1 + arch/arm64/kvm/arm.c | 17 ++++++++++--- arch/arm64/kvm/rme.c | 5 ++++ arch/arm64/kvm/vgic/vgic-init.c | 2 +- arch/arm64/kvm/vgic/vgic-v3.c | 5 ++++ arch/arm64/kvm/vgic/vgic.c | 43 ++++++++++++++++++++++++++++++-- 6 files changed, 67 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index f716b890e484..9bcad6ec5dbb 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -92,6 +92,7 @@ struct realm_rec { void kvm_init_rme(void); u32 kvm_realm_ipa_limit(void); +u32 kvm_realm_vgic_nr_lr(void); int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); int kvm_init_realm_vm(struct kvm *kvm); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3434bf1223b1..9824589e7338 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -507,17 +507,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + kvm_timer_vcpu_put(vcpu); + kvm_vgic_put(vcpu); + + vcpu->cpu = -1; + + if (vcpu_is_rec(vcpu)) + return; + kvm_arch_vcpu_put_debug_state_flags(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_sysregs_vhe(vcpu); - kvm_timer_vcpu_put(vcpu); - kvm_vgic_put(vcpu); + kvm_vcpu_pmu_restore_host(vcpu); kvm_arm_vmid_clear_active(); vcpu_clear_on_unsupported_cpu(vcpu); - vcpu->cpu = -1; } static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) @@ -673,6 +679,11 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (kvm_vm_is_protected(kvm)) kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); + if (!irqchip_in_kernel(kvm) && kvm_is_realm(vcpu->kvm)) { + /* Userspace irqchip not yet supported with Realms */ + return -EOPNOTSUPP; + } + mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 9cd99317003a..2b87cc251a10 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -77,6 +77,11 @@ u32 kvm_realm_ipa_limit(void) return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ); } +u32 kvm_realm_vgic_nr_lr(void) +{ + return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_GICV3_NUM_LRS); +} + static int get_start_level(struct realm *realm) { return 4 - ((realm->ia_bits - 8) / (RMM_PAGE_SHIFT - 3)); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index b7306c588d9d..cffb3030c524 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -81,7 +81,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) * the proper checks already. 
*/ if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && - !kvm_vgic_global_state.can_emulate_gicv2) + (!kvm_vgic_global_state.can_emulate_gicv2 || kvm_is_realm(kvm))) return -ENODEV; /* Must be held to avoid race with vCPU creation */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3dfc8b84e03e..2a03c16b3bb3 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -7,9 +7,11 @@ #include #include #include +#include #include #include #include +#include #include "vgic.h" @@ -749,6 +751,9 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + if (vcpu_is_rec(vcpu)) + cpu_if->vgic_vmcr = vcpu->arch.rec.run->exit.gicv3_vmcr; + WARN_ON(vgic_v4_put(vcpu)); vgic_v3_vmcr_sync(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 6430de54bb2b..07d73faacc51 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include "vgic.h" @@ -23,6 +25,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { static inline int kvm_vcpu_vgic_nr_lr(struct kvm_vcpu *vcpu) { + if (unlikely(vcpu_is_rec(vcpu))) + return kvm_realm_vgic_nr_lr(); return kvm_vgic_global_state.nr_lr; } @@ -870,10 +874,23 @@ static inline bool can_access_vgic_from_kernel(void) return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe(); } +static inline void vgic_rmm_save_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + int i; + + for (i = 0; i < kvm_vcpu_vgic_nr_lr(vcpu); i++) { + cpu_if->vgic_lr[i] = vcpu->arch.rec.run->exit.gicv3_lrs[i]; + vcpu->arch.rec.run->enter.gicv3_lrs[i] = 0; + } +} + static inline void vgic_save_state(struct kvm_vcpu *vcpu) { if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_save_state(vcpu); + else if (vcpu_is_rec(vcpu)) + vgic_rmm_save_state(vcpu); else __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); } @@ -900,10 +917,28 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) vgic_prune_ap_list(vcpu); } +static inline void vgic_rmm_restore_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + int i; + + for (i = 0; i < kvm_vcpu_vgic_nr_lr(vcpu); i++) { + vcpu->arch.rec.run->enter.gicv3_lrs[i] = cpu_if->vgic_lr[i]; + /* + * Also populate the rec.run->exit copies so that a late + * decision to back out from entering the realm doesn't cause + * the state to be lost + */ + vcpu->arch.rec.run->exit.gicv3_lrs[i] = cpu_if->vgic_lr[i]; + } +} + static inline void vgic_restore_state(struct kvm_vcpu *vcpu) { if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_restore_state(vcpu); + else if (vcpu_is_rec(vcpu)) + vgic_rmm_restore_state(vcpu); else __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } @@ -944,7 +979,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + !vgic_initialized(vcpu->kvm) || + vcpu_is_rec(vcpu))) { if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; @@ -958,7 +995,9 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if 
(unlikely(!irqchip_in_kernel(vcpu->kvm) || + !vgic_initialized(vcpu->kvm) || + vcpu_is_rec(vcpu))) { if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; -- Gitee From 8eb582e0623c34b5fd3d21451f49cfd0dd7ba8e8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:35:48 +0800 Subject: [PATCH 47/84] KVM: arm64: Support timers in realm RECs community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM keeps track of the timer while the realm REC is running, but on exit to the normal world KVM is responsible for handling the timers. The RMM doesn't provide a mechanism to set the counter offset, so don't expose KVM_CAP_COUNTER_OFFSET for a realm VM. A later patch adds the support for propagating the timer values from the exit data structure and calling kvm_realm_timers_update(). Signed-off-by: Steven Price Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arch_timer.c | 48 +++++++++++++++++++++++++++++++++--- arch/arm64/kvm/arm.c | 2 +- include/kvm/arm_arch_timer.h | 2 ++ 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index a795dad2b99a..70c1fb6c217e 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -162,6 +162,13 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) { + struct kvm_vcpu *vcpu = ctxt->vcpu; + + if (kvm_is_realm(vcpu->kvm)) { + WARN_ON(offset); + return; + } + if (!ctxt->offset.vm_offset) { WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt)); return; @@ -460,6 +467,21 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, } } +void kvm_realm_timers_update(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *arch_timer = &vcpu->arch.timer_cpu; + int i; + + for (i = 0; i < NR_KVM_EL0_TIMERS; i++) { + struct arch_timer_context *timer = &arch_timer->timers[i]; + bool status = timer_get_ctl(timer) & ARCH_TIMER_CTRL_IT_STAT; + bool level = kvm_timer_irq_can_fire(timer) && status; + + if (level != timer->irq.level) + kvm_timer_update_irq(vcpu, level, timer); + } +} + /* Only called for a fully emulated timer */ static void timer_emulate(struct arch_timer_context *ctx) { @@ -829,6 +851,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) if (unlikely(!timer->enabled)) return; + kvm_timer_unblocking(vcpu); + get_timer_map(vcpu, &map); if (static_branch_likely(&has_gic_active_state)) { @@ -842,8 +866,6 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) kvm_timer_vcpu_load_nogic(vcpu); } - kvm_timer_unblocking(vcpu); - timer_restore_state(map.direct_vtimer); if (map.direct_ptimer) timer_restore_state(map.direct_ptimer); @@ -988,7 +1010,9 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) ctxt->vcpu = vcpu; - if (timerid == TIMER_VTIMER) + if (kvm_is_realm(vcpu->kvm)) + ctxt->offset.vm_offset = NULL; + else if (timerid == TIMER_VTIMER) ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; @@ -1011,13 +1035,19 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); + u64 cntvoff; for (int i = 0; i < 
NR_KVM_TIMERS; i++) timer_context_init(vcpu, i); + if (kvm_is_realm(vcpu->kvm)) + cntvoff = 0; + else + cntvoff = kvm_phys_timer_read(); + /* Synchronize offsets across timers of a VM if not already provided */ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { - timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); + timer_set_offset(vcpu_vtimer(vcpu), cntvoff); timer_set_offset(vcpu_ptimer(vcpu), 0); } @@ -1525,6 +1555,13 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return -EINVAL; } + /* + * We don't use mapped IRQs for Realms because the RMI doesn't allow + * us setting the LR.HW bit in the VGIC. + */ + if (vcpu_is_rec(vcpu)) + return 0; + get_timer_map(vcpu, &map); ret = kvm_vgic_map_phys_irq(vcpu, @@ -1656,6 +1693,9 @@ int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, if (offset->reserved) return -EINVAL; + if (kvm_is_realm(kvm)) + return -EINVAL; + mutex_lock(&kvm->lock); if (lock_all_vcpus(kvm)) { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9824589e7338..b68f1fcb0dc7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -268,9 +268,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: case KVM_CAP_IRQFD_RESAMPLE: - case KVM_CAP_COUNTER_OFFSET: r = 1; break; + case KVM_CAP_COUNTER_OFFSET: case KVM_CAP_SET_GUEST_DEBUG: r = !kvm_is_realm(kvm); break; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index e748bc957d83..bde9bdb8233f 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -112,6 +112,8 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); +void kvm_realm_timers_update(struct kvm_vcpu *vcpu); + u64 kvm_phys_timer_read(void); void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); -- Gitee From 0de68ea7f422f5ce4e0e74f759cfffcb2c22514b Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 17 Jun 2025 17:17:43 +0800 Subject: [PATCH 48/84] KVM: Add member to struct kvm_gfn_range to indicate private/shared mainline inclusion from mainline-v6.14-rc1 commit dca6c88532322830d5d92486467fcc91b67a9ad8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dca6c88532322830d5d92486467fcc91b67a9ad8 -------------------------------- Add new members to struct kvm_gfn_range to indicate which mapping (private-vs-shared) to operate on: enum kvm_gfn_range_filter attr_filter. Update the core zapping operations to set them appropriately. TDX utilizes two GPA aliases for the same memslots, one for private memory and one for shared memory. For private memory, KVM cannot always perform the same operations it does on memory for default VMs, such as zapping pages and having them be faulted back in, as this requires guest coordination. However, some operations such as guest-driven conversion of memory between private and shared should zap private memory. Internally to the MMU, private and shared mappings are tracked on separate roots. Mapping and zapping operations will operate on the respective GFN alias for each root (private or shared). So zapping operations will by default zap both aliases.
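To make "zap both aliases by default, but catch callers that never chose" concrete, here is a compilable sketch of the flag design adopted below (the constants mirror the hunk later in this patch; the zap_range() helper is purely illustrative):

    #include <stdio.h>

    /* Zero is deliberately not a valid filter, so a caller that forgets to
     * choose is caught, while SHARED|PRIVATE expresses "both aliases". */
    enum kvm_gfn_range_filter {
        KVM_FILTER_SHARED  = 1 << 0,
        KVM_FILTER_PRIVATE = 1 << 1,
    };

    static void zap_range(enum kvm_gfn_range_filter filter)
    {
        if (!filter) {  /* the disallowed default */
            fprintf(stderr, "bug: no target alias chosen\n");
            return;
        }
        if (filter & KVM_FILTER_SHARED)
            printf("zap shared root\n");
        if (filter & KVM_FILTER_PRIVATE)
            printf("zap private root\n");
    }

    int main(void)
    {
        zap_range(KVM_FILTER_SHARED | KVM_FILTER_PRIVATE);
        zap_range((enum kvm_gfn_range_filter)0);  /* caught at runtime */
        return 0;
    }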
Add fields in struct kvm_gfn_range to allow callers to specify which aliases to operate on, so that each caller targets only the aliases appropriate for its specific operation. There was feedback that target aliases should be specified such that the default value (0) is to operate on both aliases. Several options were considered. Several variations used separate bools defined such that the default behavior was to process both aliases; they either allowed nonsensical configurations or were confusing for the caller. A simple enum was also explored and was close, but was hard to process in the caller. Instead, use an enum with the default value (0) reserved as a disallowed value. Catch ranges that didn't have the target aliases specified by looking for that specific value. Set the target alias with the enum appropriately for these MMU operations: - For KVM's mmu notifier callbacks, zap shared pages only because private pages won't have a userspace mapping - For setting memory attributes, kvm_arch_pre_set_memory_attributes() chooses the aliases based on the attribute. - For guest_memfd invalidations, zap private only. Link: https://lore.kernel.org/kvm/ZivIF9vjKcuGie3s@google.com/ Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Message-ID: <20240718211230.1492011-3-rick.p.edgecombe@intel.com> Signed-off-by: Paolo Bonzini Conflicts: virt/kvm/kvm_main.c, arch/x86/kvm/mmu/mmu.c Signed-off-by: Xu Raoqing --- include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index de2b643ac217..cebf74319ffc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -259,11 +259,17 @@ union kvm_mmu_notifier_arg { pte_t pte; }; +enum kvm_gfn_range_filter { + KVM_FILTER_SHARED = BIT(0), + KVM_FILTER_PRIVATE = BIT(1), +}; + struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; gfn_t end; union kvm_mmu_notifier_arg arg; + enum kvm_gfn_range_filter attr_filter; bool may_block; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e4077b389c80..de586c589f05 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -625,6 +625,11 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, */ gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; + /* + * HVA-based notifications aren't relevant to private + * mappings as they don't have a userspace mapping. + */ + gfn_range.attr_filter = KVM_FILTER_SHARED; /* * {gfn(page) | page intersects with [hva_start, hva_end)} = -- Gitee From a40d812938259e468d27818f228c0b6943624c1c Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:37:16 +0800 Subject: [PATCH 49/84] arm64: RME: Allow VMM to set RIPAS community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Each page within the protected region of the realm guest can be marked as either RAM or EMPTY. Allow the VMM to control this before the guest has started and provide the equivalent functions to change this (with the guest's approval) at runtime. When transitioning from RIPAS RAM (1) to RIPAS EMPTY (0) the memory is unmapped from the guest and undelegated, allowing the memory to be reused by the host.
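From the VMM's side, this pre-boot control is driven through KVM_ENABLE_CAP. The handler added below reads the argument block from user space via cap->args[1]; the sketch additionally assumes the series' convention that the realm sub-command travels in cap->args[0] of a top-level KVM_CAP_ARM_RME capability, and the structure layout is inferred from the handler's use of base and size, so the placeholder constants and struct here are assumptions, not the real UAPI:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #ifndef KVM_CAP_ARM_RME
    #define KVM_CAP_ARM_RME                  300  /* placeholder value */
    #define KVM_CAP_ARM_RME_INIT_RIPAS_REALM 3    /* placeholder sub-command */
    #endif

    /* Assumed layout; the real struct arm_rme_init_ripas ships with this
     * series' UAPI headers. */
    struct arm_rme_init_ripas_sketch {
        uint64_t base;
        uint64_t size;
    };

    static int init_ripas_ram(int vm_fd, uint64_t base, uint64_t size)
    {
        struct arm_rme_init_ripas_sketch args = { base, size };
        struct kvm_enable_cap cap = {
            .cap  = KVM_CAP_ARM_RME,  /* assumed top-level capability */
            .args = { KVM_CAP_ARM_RME_INIT_RIPAS_REALM,
                      (uint64_t)(uintptr_t)&args },
        };
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }

    int main(void)
    {
        (void)init_ripas_ram(-1, 0x80000000ULL, 0x200000ULL);
        return 0;
    }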
When transitioning to RIPAS RAM the actual population of the leaf RTTs is done later on stage 2 fault; however, it may be necessary to allocate additional RTTs to allow the RMM to track the RIPAS for the requested range. When freeing a block mapping it is necessary to temporarily unfold the RTT, which requires delegating an extra page to the RMM; this page can then be recovered once the contents of the block mapping have been freed. Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 5 + arch/arm64/kvm/mmu.c | 16 +- arch/arm64/kvm/rme.c | 381 +++++++++++++++++++++++++++++++ 3 files changed, 396 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 9bcad6ec5dbb..b916db8565a2 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -101,6 +101,11 @@ void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits); int kvm_create_rec(struct kvm_vcpu *vcpu); void kvm_destroy_rec(struct kvm_vcpu *vcpu); +void kvm_realm_unmap_range(struct kvm *kvm, + unsigned long ipa, + unsigned long size, + bool unmap_private); + static inline bool kvm_realm_is_private_address(struct realm *realm, unsigned long addr) { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index f21c6729c5b6..ee4f379222fd 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -310,6 +310,7 @@ static void invalidate_icache_guest_page(void *va, size_t size) * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap * @may_block: Whether or not we are permitted to block + * @only_shared: If true then protected mappings should not be unmapped * * Clear a range of stage-2 mappings, lowering the various ref-counts. Must * be called while holding mmu_lock (unless for freeing the stage2 pgd before @@ -317,7 +318,7 @@ static void invalidate_icache_guest_page(void *va, size_t size) * with things behind our backs.
*/ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, - bool may_block) + bool may_block, bool only_shared) { struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); phys_addr_t end = start + size; @@ -328,9 +329,10 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 may_block)); } -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, + bool may_block) { - __unmap_stage2_range(mmu, start, size, true); + __unmap_stage2_range(mmu, start, size, may_block, false); } static void stage2_flush_memslot(struct kvm *kvm, @@ -985,7 +987,8 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start, + true); } hva = vm_end; } while (hva < reg_end); @@ -1800,7 +1803,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, - range->may_block); + range->may_block, + !(range->attr_filter & KVM_FILTER_PRIVATE)); return false; } @@ -2110,7 +2114,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + unmap_stage2_range(&kvm->arch.mmu, gpa, size, true); write_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 2b87cc251a10..460d8cf50410 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -87,6 +87,51 @@ static int get_start_level(struct realm *realm) return 4 - ((realm->ia_bits - 8) / (RMM_PAGE_SHIFT - 3)); } +static int find_map_level(struct realm *realm, + unsigned long start, + unsigned long end) +{ + int level = RMM_RTT_MAX_LEVEL; + + while (level > get_start_level(realm)) { + unsigned long map_size = rme_rtt_level_mapsize(level - 1); + + if (!IS_ALIGNED(start, map_size) || + (start + map_size) > end) + break; + + level--; + } + + return level; +} + +static phys_addr_t alloc_delegated_granule(struct kvm_mmu_memory_cache *mc) +{ + phys_addr_t phys; + void *virt; + + if (mc) + virt = kvm_mmu_memory_cache_alloc(mc); + else + virt = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + + if (!virt) + return PHYS_ADDR_MAX; + + phys = virt_to_phys(virt); + + if (rmi_granule_delegate(phys)) { + free_page((unsigned long)virt); + + return PHYS_ADDR_MAX; + } + + kvm_account_pgtable_pages(virt, 1); + + return phys; +} + static void free_delegated_granule(phys_addr_t phys) { if (WARN_ON(rmi_granule_undelegate(phys))) { @@ -99,6 +144,151 @@ static void free_delegated_granule(phys_addr_t phys) free_page((unsigned long)phys_to_virt(phys)); } +static int realm_rtt_create(struct realm *realm, + unsigned long addr, + int level, + phys_addr_t phys) +{ + addr = ALIGN_DOWN(addr, rme_rtt_level_mapsize(level - 1)); + return rmi_rtt_create(virt_to_phys(realm->rd), phys, addr, level); +} + +static int realm_rtt_fold(struct realm *realm, + unsigned long addr, + int level, + phys_addr_t *rtt_granule) +{ + unsigned long out_rtt; + int ret; + + ret = rmi_rtt_fold(virt_to_phys(realm->rd), addr, level, &out_rtt); + + if (RMI_RETURN_STATUS(ret) == RMI_SUCCESS && rtt_granule) + *rtt_granule = out_rtt; + + return ret; +} + +static int realm_destroy_private_granule(struct realm 
*realm, + unsigned long ipa, + unsigned long *next_addr, + phys_addr_t *out_rtt) +{ + unsigned long rd = virt_to_phys(realm->rd); + unsigned long rtt_addr; + phys_addr_t rtt; + int ret; + +retry: + ret = rmi_data_destroy(rd, ipa, &rtt_addr, next_addr); + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + if (*next_addr > ipa) + return 0; /* UNASSIGNED */ + rtt = alloc_delegated_granule(NULL); + if (WARN_ON(rtt == PHYS_ADDR_MAX)) + return -ENOMEM; + /* + * ASSIGNED - ipa is mapped as a block, so split. The index + * from the return code should be 2 otherwise it appears + * there's a huge page bigger than KVM currently supports + */ + WARN_ON(RMI_RETURN_INDEX(ret) != 2); + ret = realm_rtt_create(realm, ipa, 3, rtt); + if (WARN_ON(ret)) { + free_delegated_granule(rtt); + return -ENXIO; + } + goto retry; + } else if (WARN_ON(ret)) { + return -ENXIO; + } + + ret = rmi_granule_undelegate(rtt_addr); + if (WARN_ON(ret)) + return -ENXIO; + + *out_rtt = rtt_addr; + + return 0; +} + +static int realm_unmap_private_page(struct realm *realm, + unsigned long ipa, + unsigned long *next_addr) +{ + unsigned long end = ALIGN(ipa + 1, PAGE_SIZE); + unsigned long addr; + phys_addr_t out_rtt = PHYS_ADDR_MAX; + int ret; + + for (addr = ipa; addr < end; addr = *next_addr) { + ret = realm_destroy_private_granule(realm, addr, next_addr, + &out_rtt); + if (ret) + return ret; + } + + return 0; +} + +static void realm_unmap_shared_range(struct kvm *kvm, + int level, + unsigned long start, + unsigned long end) +{ + struct realm *realm = &kvm->arch.realm; + unsigned long rd = virt_to_phys(realm->rd); + ssize_t map_size = rme_rtt_level_mapsize(level); + unsigned long next_addr, addr; + unsigned long shared_bit = BIT(realm->ia_bits - 1); + + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return; + + start |= shared_bit; + end |= shared_bit; + + for (addr = start; addr < end; addr = next_addr) { + unsigned long align_addr = ALIGN(addr, map_size); + int ret; + + next_addr = ALIGN(addr + 1, map_size); + + if (align_addr != addr || next_addr > end) { + /* Need to recurse deeper */ + if (addr < align_addr) + next_addr = align_addr; + realm_unmap_shared_range(kvm, level + 1, addr, + min(next_addr, end)); + continue; + } + + ret = rmi_rtt_unmap_unprotected(rd, addr, level, &next_addr); + switch (RMI_RETURN_STATUS(ret)) { + case RMI_SUCCESS: + break; + case RMI_ERROR_RTT: + if (next_addr == addr) { + /* + * There's a mapping here, but it's not a block + * mapping, so reset next_addr to the next block + * boundary and recurse to clear out the pages + * one level deeper. 
+ */ + next_addr = ALIGN(addr + 1, map_size); + realm_unmap_shared_range(kvm, level + 1, addr, + next_addr); + } + break; + default: + WARN_ON(1); + return; + } + + cond_resched_rwlock_write(&kvm->mmu_lock); + } +} + /* Calculate the number of s2 root rtts needed */ static int realm_num_root_rtts(struct realm *realm) { @@ -209,6 +399,40 @@ static int realm_rtt_destroy(struct realm *realm, unsigned long addr, return ret; } +static int realm_create_rtt_levels(struct realm *realm, + unsigned long ipa, + int level, + int max_level, + struct kvm_mmu_memory_cache *mc) +{ + if (level == max_level) + return 0; + + while (level++ < max_level) { + phys_addr_t rtt = alloc_delegated_granule(mc); + int ret; + + if (rtt == PHYS_ADDR_MAX) + return -ENOMEM; + + ret = realm_rtt_create(realm, ipa, level, rtt); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT && + RMI_RETURN_INDEX(ret) == level - 1) { + /* The RTT already exists, continue */ + continue; + } + if (ret) { + WARN(1, "Failed to create RTT at level %d: %d\n", + level, ret); + free_delegated_granule(rtt); + return -ENXIO; + } + } + + return 0; +} + static int realm_tear_down_rtt_level(struct realm *realm, int level, unsigned long start, unsigned long end) { @@ -299,6 +523,61 @@ static int realm_tear_down_rtt_range(struct realm *realm, start, end); } +/* + * Returns 0 on successful fold, a negative value on error, a positive value if + * we were not able to fold all tables at this level. + */ +static int realm_fold_rtt_level(struct realm *realm, int level, + unsigned long start, unsigned long end) +{ + int not_folded = 0; + ssize_t map_size; + unsigned long addr, next_addr; + + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return -EINVAL; + + map_size = rme_rtt_level_mapsize(level - 1); + + for (addr = start; addr < end; addr = next_addr) { + phys_addr_t rtt_granule; + int ret; + unsigned long align_addr = ALIGN(addr, map_size); + + next_addr = ALIGN(addr + 1, map_size); + + ret = realm_rtt_fold(realm, align_addr, level, &rtt_granule); + + switch (RMI_RETURN_STATUS(ret)) { + case RMI_SUCCESS: + free_delegated_granule(rtt_granule); + break; + case RMI_ERROR_RTT: + if (level == RMM_RTT_MAX_LEVEL || + RMI_RETURN_INDEX(ret) < level) { + not_folded++; + break; + } + /* Recurse a level deeper */ + ret = realm_fold_rtt_level(realm, + level + 1, + addr, + next_addr); + if (ret < 0) + return ret; + else if (ret == 0) + /* Try again at this level */ + next_addr = addr; + break; + default: + WARN_ON(1); + return -ENXIO; + } + } + + return not_folded; +} + void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits) { struct realm *realm = &kvm->arch.realm; @@ -306,6 +585,96 @@ void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits) WARN_ON(realm_tear_down_rtt_range(realm, 0, (1UL << ia_bits))); } +static void realm_unmap_private_range(struct kvm *kvm, + unsigned long start, + unsigned long end) +{ + struct realm *realm = &kvm->arch.realm; + unsigned long next_addr, addr; + int ret; + + for (addr = start; addr < end; addr = next_addr) { + ret = realm_unmap_private_page(realm, addr, &next_addr); + + if (ret) + break; + } + + realm_fold_rtt_level(realm, get_start_level(realm) + 1, + start, end); +} + +void kvm_realm_unmap_range(struct kvm *kvm, unsigned long start, + unsigned long size, bool unmap_private) +{ + unsigned long end = start + size; + struct realm *realm = &kvm->arch.realm; + + end = min(BIT(realm->ia_bits - 1), end); + + if (realm->state == REALM_STATE_NONE) + return; + + realm_unmap_shared_range(kvm, find_map_level(realm, start, end), + start, end); + if 
(unmap_private) + realm_unmap_private_range(kvm, start, end); +} + +static int realm_init_ipa_state(struct realm *realm, + unsigned long ipa, + unsigned long end) +{ + phys_addr_t rd_phys = virt_to_phys(realm->rd); + int ret; + + while (ipa < end) { + unsigned long next; + + ret = rmi_rtt_init_ripas(rd_phys, ipa, end, &next); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + int err_level = RMI_RETURN_INDEX(ret); + int level = find_map_level(realm, ipa, end); + + if (WARN_ON(err_level >= level)) + return -ENXIO; + + ret = realm_create_rtt_levels(realm, ipa, + err_level, + level, NULL); + if (ret) + return ret; + /* Retry with the RTT levels in place */ + continue; + } else if (WARN_ON(ret)) { + return -ENXIO; + } + + ipa = next; + } + + return 0; +} + +static int kvm_init_ipa_range_realm(struct kvm *kvm, + struct arm_rme_init_ripas *args) +{ + gpa_t addr, end; + struct realm *realm = &kvm->arch.realm; + + addr = args->base; + end = addr + args->size; + + if (end < addr) + return -EINVAL; + + if (kvm_realm_state(kvm) != REALM_STATE_NEW) + return -EPERM; + + return realm_init_ipa_state(realm, addr, end); +} + /* Protects access to rme_vmid_bitmap */ static DEFINE_SPINLOCK(rme_vmid_lock); static unsigned long *rme_vmid_bitmap; @@ -431,6 +800,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) case KVM_CAP_ARM_RME_CREATE_REALM: r = kvm_create_realm(kvm); break; + case KVM_CAP_ARM_RME_INIT_RIPAS_REALM: { + struct arm_rme_init_ripas args; + void __user *argp = u64_to_user_ptr(cap->args[1]); + + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + + r = kvm_init_ipa_range_realm(kvm, &args); + break; + } default: r = -EINVAL; break; -- Gitee From c657e3dddcc6f32680b07c1ee08c39ef034651ed Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:38:20 +0800 Subject: [PATCH 50/84] arm64: RME: Handle realm enter/exit community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Entering a realm is done using an SMC call to the RMM. On exit the exit codes need to be handled slightly differently from the normal KVM path, so define our own functions for realm enter/exit and hook them in if the guest is a realm guest.
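One construct in the new rme-exit.c below that trips some readers is the designated range initializer used for the exit-handler table. A tiny standalone example of the same dispatch pattern (the EC values are illustrative; 0x18 happens to match ESR_ELx_EC_SYS64 on arm64, but nothing here depends on that):

    #include <stdio.h>

    typedef int (*exit_handler_fn)(unsigned int esr);

    static int handle_unknown(unsigned int esr)
    {
        printf("unhandled exception class (ESR %#x)\n", esr);
        return -1;
    }

    static int handle_sysreg(unsigned int esr)
    {
        printf("system register trap (ESR %#x)\n", esr);
        return 1;
    }

    #define EC_MAX   0x3f
    #define EC_SYS64 0x18

    /* GCC/Clang extension: every slot defaults to handle_unknown, then
     * specific exception classes override their slot. */
    static exit_handler_fn handlers[EC_MAX + 1] = {
        [0 ... EC_MAX] = handle_unknown,
        [EC_SYS64]     = handle_sysreg,
    };

    int main(void)
    {
        return handlers[EC_SYS64](0) > 0 ? 0 : 1;
    }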
Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 3 + arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/arm.c | 19 +++- arch/arm64/kvm/rme-exit.c | 162 +++++++++++++++++++++++++++++++ arch/arm64/kvm/rme.c | 19 ++++ 5 files changed, 199 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/kvm/rme-exit.c diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index b916db8565a2..d86051ef0c5c 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -101,6 +101,9 @@ void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits); int kvm_create_rec(struct kvm_vcpu *vcpu); void kvm_destroy_rec(struct kvm_vcpu *vcpu); +int kvm_rec_enter(struct kvm_vcpu *vcpu); +int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_status); + void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, unsigned long size, diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 1c1d8cdf381f..2eec980cbe5c 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -21,7 +21,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ vgic/vgic-its.o vgic/vgic-debug.o \ - rme.o + rme.o rme-exit.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b68f1fcb0dc7..8c4f859e97a8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1032,7 +1032,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) trace_kvm_entry(*vcpu_pc(vcpu)); guest_timing_enter_irqoff(); - ret = kvm_arm_vcpu_enter_exit(vcpu); + if (vcpu_is_rec(vcpu)) + ret = kvm_rec_enter(vcpu); + else + ret = kvm_arm_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -1086,10 +1089,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) local_irq_enable(); - trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); - /* Exit types that need handling before we can be preempted */ - handle_exit_early(vcpu, ret); + if (!vcpu_is_rec(vcpu)) { + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), + *vcpu_pc(vcpu)); + + handle_exit_early(vcpu, ret); + } preempt_enable(); @@ -1112,7 +1118,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) ret = ARM_EXCEPTION_IL; } - ret = handle_exit(vcpu, ret); + if (vcpu_is_rec(vcpu)) + ret = handle_rec_exit(vcpu, ret); + else + ret = handle_exit(vcpu, ret); } /* Tell userspace about in-kernel device output levels */ diff --git a/arch/arm64/kvm/rme-exit.c b/arch/arm64/kvm/rme-exit.c new file mode 100644 index 000000000000..d433ab72ee90 --- /dev/null +++ b/arch/arm64/kvm/rme-exit.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +typedef int (*exit_handler_fn)(struct kvm_vcpu *vcpu); + +static int rec_exit_reason_notimpl(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + vcpu_err(vcpu, "Unhandled exit reason from realm (ESR: %#llx)\n", + rec->run->exit.esr); + return -ENXIO; +} + +static int rec_exit_sync_dabt(struct kvm_vcpu *vcpu) +{ + return kvm_handle_guest_abort(vcpu); +} + +static int rec_exit_sync_iabt(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + vcpu_err(vcpu, "Unhandled instruction abort (ESR: %#llx).\n", + rec->run->exit.esr); + return -ENXIO; +} + +static int rec_exit_sys_reg(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long esr = kvm_vcpu_get_esr(vcpu); + int rt = kvm_vcpu_sys_get_rt(vcpu); + bool is_write = !(esr & 1); + int ret; + + if (is_write) + vcpu_set_reg(vcpu, rt, rec->run->exit.gprs[0]); + + ret = kvm_handle_sys_reg(vcpu); + if (ret > 0 && !is_write) + rec->run->enter.gprs[0] = vcpu_get_reg(vcpu, rt); + + return ret; +} + +static exit_handler_fn rec_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = rec_exit_reason_notimpl, + [ESR_ELx_EC_SYS64] = rec_exit_sys_reg, + [ESR_ELx_EC_DABT_LOW] = rec_exit_sync_dabt, + [ESR_ELx_EC_IABT_LOW] = rec_exit_sync_iabt +}; + +static int rec_exit_psci(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + int i; + + for (i = 0; i < REC_RUN_GPRS; i++) + vcpu_set_reg(vcpu, i, rec->run->exit.gprs[i]); + + return kvm_smccc_call_handler(vcpu); +} + +static int rec_exit_ripas_change(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct realm *realm = &kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long base = rec->run->exit.ripas_base; + unsigned long top = rec->run->exit.ripas_top; + unsigned long ripas = rec->run->exit.ripas_value; + + if (!kvm_realm_is_private_address(realm, base) || + !kvm_realm_is_private_address(realm, top - 1)) { + vcpu_err(vcpu, "Invalid RIPAS_CHANGE for %#lx - %#lx, ripas: %#lx\n", + base, top, ripas); + return -EINVAL; + } + + return 1; +} + +static void update_arch_timer_irq_lines(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + __vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = rec->run->exit.cntv_ctl; + __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = rec->run->exit.cntv_cval; + __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = rec->run->exit.cntp_ctl; + __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = rec->run->exit.cntp_cval; + + kvm_realm_timers_update(vcpu); +} + +/* + * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on + * proper exit to userspace. + */ +int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_ret) +{ + struct realm_rec *rec = &vcpu->arch.rec; + u8 esr_ec = ESR_ELx_EC(rec->run->exit.esr); + unsigned long status, index; + + status = RMI_RETURN_STATUS(rec_run_ret); + index = RMI_RETURN_INDEX(rec_run_ret); + + /* + * If a PSCI_SYSTEM_OFF request raced with a vcpu executing, we might + * see the following status code and index indicating an attempt to run + * a REC when the RD state is SYSTEM_OFF. In this case, we just need to + * return to user space which can deal with the system event or will try + * to run the KVM VCPU again, at which point we will no longer attempt + * to enter the Realm because we will have a sleep request pending on + * the VCPU as a result of KVM's PSCI handling. 
+ */ + if (status == RMI_ERROR_REALM && index == 1) { + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; + } + + if (rec_run_ret) + return -ENXIO; + + vcpu->arch.fault.esr_el2 = rec->run->exit.esr; + vcpu->arch.fault.far_el2 = rec->run->exit.far; + vcpu->arch.fault.hpfar_el2 = rec->run->exit.hpfar; + + update_arch_timer_irq_lines(vcpu); + + /* Reset the emulation flags for the next run of the REC */ + rec->run->enter.flags = 0; + + switch (rec->run->exit.exit_reason) { + case RMI_EXIT_SYNC: + return rec_exit_handlers[esr_ec](vcpu); + case RMI_EXIT_IRQ: + case RMI_EXIT_FIQ: + return 1; + case RMI_EXIT_PSCI: + return rec_exit_psci(vcpu); + case RMI_EXIT_RIPAS_CHANGE: + return rec_exit_ripas_change(vcpu); + } + + kvm_pr_unimpl("Unsupported exit reason: %u\n", + rec->run->exit.exit_reason); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return 0; +} diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 460d8cf50410..1ccf5b924b4a 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -860,6 +860,25 @@ void kvm_destroy_realm(struct kvm *kvm) kvm_free_stage2_pgd(&kvm->arch.mmu); } +int kvm_rec_enter(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + switch (rec->run->exit.exit_reason) { + case RMI_EXIT_HOST_CALL: + case RMI_EXIT_PSCI: + for (int i = 0; i < REC_RUN_GPRS; i++) + rec->run->enter.gprs[i] = vcpu_get_reg(vcpu, i); + break; + } + + if (kvm_realm_state(vcpu->kvm) != REALM_STATE_ACTIVE) + return -EINVAL; + + return rmi_rec_enter(virt_to_phys(rec->rec_page), + virt_to_phys(rec->run)); +} + static void free_rec_aux(struct page **aux_pages, unsigned int num_aux) { -- Gitee From 56065fc600a94823acff037ebf4f3e5528d070d8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:39:46 +0800 Subject: [PATCH 51/84] arm64: RME: Handle RMI_EXIT_RIPAS_CHANGE community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The guest can request that a region of its protected address space is switched between RIPAS_RAM and RIPAS_EMPTY (and back) using RSI_IPA_STATE_SET. This causes a guest exit with the RMI_EXIT_RIPAS_CHANGE code. We treat this as a request to convert a protected region to unprotected (or back), exiting to the VMM to make the necessary changes to the guest_memfd and memslot mappings. On the next entry the RIPAS changes are committed by making RMI_RTT_SET_RIPAS calls. The VMM may wish to reject the RIPAS change requested by the guest. For now it can only do this by no longer scheduling the VCPU, as we don't currently have a use case for returning that rejection to the guest, but by postponing the RMI_RTT_SET_RIPAS changes to entry we leave the door open for adding a new ioctl in the future for this purpose.
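The commit step on entry is naturally a resumable loop: each attempt may stop early (for example when intermediate RTTs still need allocating), so the loop restarts from wherever the previous attempt reached. A compilable toy model of that shape (the real work happens in realm_set_ipa_state() below; the chunk size here is arbitrary):

    #include <stdio.h>

    /* Pretend "set RIPAS" call that only makes partial progress. */
    static unsigned long set_ipa_state(unsigned long base, unsigned long top)
    {
        unsigned long step = 0x1000;
        return (top - base > step) ? base + step : top;
    }

    int main(void)
    {
        unsigned long base = 0x80000000UL;
        unsigned long top  = 0x80004000UL;

        do {
            unsigned long top_ipa = set_ipa_state(base, top);

            printf("committed [%#lx, %#lx)\n", base, top_ipa);
            base = top_ipa;  /* resume from the stopping point */
        } while (base < top);

        return 0;
    }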
Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/kvm/rme.c | 88 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 1ccf5b924b4a..e7b776a382b3 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -621,6 +621,65 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long start, realm_unmap_private_range(kvm, start, end); } +static int realm_set_ipa_state(struct kvm_vcpu *vcpu, + unsigned long start, + unsigned long end, + unsigned long ripas, + unsigned long *top_ipa) +{ + struct kvm *kvm = vcpu->kvm; + struct realm *realm = &kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + phys_addr_t rd_phys = virt_to_phys(realm->rd); + phys_addr_t rec_phys = virt_to_phys(rec->rec_page); + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + unsigned long ipa = start; + int ret = 0; + + while (ipa < end) { + unsigned long next; + + ret = rmi_rtt_set_ripas(rd_phys, rec_phys, ipa, end, &next); + + if (RMI_RETURN_STATUS(ret) == RMI_SUCCESS) { + ipa = next; + } else if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + int walk_level = RMI_RETURN_INDEX(ret); + int level = find_map_level(realm, ipa, end); + + /* + * If the RMM walk ended early then more tables are + * needed to reach the required depth to set the RIPAS. + */ + if (walk_level < level) { + ret = realm_create_rtt_levels(realm, ipa, + walk_level, + level, + memcache); + /* Retry with RTTs created */ + if (!ret) + continue; + } else { + ret = -EINVAL; + } + + break; + } else { + WARN(1, "Unexpected error in %s: %#x\n", __func__, + ret); + ret = -ENXIO; + break; + } + } + + *top_ipa = ipa; + + if (ripas == RMI_EMPTY && ipa != start) + realm_unmap_private_range(kvm, start, ipa); + + return ret; +} + static int realm_init_ipa_state(struct realm *realm, unsigned long ipa, unsigned long end) @@ -860,6 +919,32 @@ void kvm_destroy_realm(struct kvm *kvm) kvm_free_stage2_pgd(&kvm->arch.mmu); } +static void kvm_complete_ripas_change(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long base = rec->run->exit.ripas_base; + unsigned long top = rec->run->exit.ripas_top; + unsigned long ripas = rec->run->exit.ripas_value; + unsigned long top_ipa; + int ret; + + do { + kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, + kvm_mmu_cache_min_pages(kvm)); + write_lock(&kvm->mmu_lock); + ret = realm_set_ipa_state(vcpu, base, top, ripas, &top_ipa); + write_unlock(&kvm->mmu_lock); + + if (WARN_RATELIMIT(ret && ret != -ENOMEM, + "Unable to satisfy RIPAS_CHANGE for %#lx - %#lx, ripas: %#lx\n", + base, top, ripas)) + break; + + base = top_ipa; + } while (top_ipa < top); +} + int kvm_rec_enter(struct kvm_vcpu *vcpu) { struct realm_rec *rec = &vcpu->arch.rec; @@ -870,6 +955,9 @@ int kvm_rec_enter(struct kvm_vcpu *vcpu) for (int i = 0; i < REC_RUN_GPRS; i++) rec->run->enter.gprs[i] = vcpu_get_reg(vcpu, i); break; + case RMI_EXIT_RIPAS_CHANGE: + kvm_complete_ripas_change(vcpu); + break; } if (kvm_realm_state(vcpu->kvm) != REALM_STATE_ACTIVE) -- Gitee From c90b7b84b51745a7da662378d2ae72ab404242f9 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:40:44 +0800 Subject: [PATCH 52/84] KVM: arm64: Handle realm MMIO emulation community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ 
-------------------------------- MMIO emulation for a realm cannot be done directly with the VM's registers as they are protected from the host. However, for emulatable data aborts, the RMM uses GPRS[0] to provide the read/written value. We can transfer this from/to the equivalent VCPU's register entry and then depend on the generic MMIO handling code in KVM. For an MMIO read, the value is placed in the shared RecExit structure during kvm_handle_mmio_return() rather than in the VCPU's register entry. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/inject_fault.c | 4 +++- arch/arm64/kvm/mmio.c | 17 +++++++++++++++-- arch/arm64/kvm/rme-exit.c | 14 ++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 0bd93a5f21ce..2027d8e2fda2 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -165,7 +165,9 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) */ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) { - if (vcpu_el1_is_32bit(vcpu)) + if (unlikely(vcpu_is_rec(vcpu))) + vcpu->arch.rec.run->enter.flags |= REC_ENTER_FLAG_INJECT_SEA; + else if (vcpu_el1_is_32bit(vcpu)) inject_abt32(vcpu, false, addr); else inject_abt64(vcpu, false, addr); diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 2aa503ff742e..ac214bd9dbc2 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -6,6 +6,7 @@ #include #include +#include #include #include "trace.h" @@ -136,14 +137,21 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); - vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); + + if (vcpu_is_rec(vcpu)) + vcpu->arch.rec.run->enter.gprs[0] = data; + else + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); } /* * The MMIO instruction is emulated and should not be re-executed * in the guest. */ - kvm_incr_pc(vcpu); + if (vcpu_is_rec(vcpu)) + vcpu->arch.rec.run->enter.flags |= REC_ENTER_FLAG_EMULATED_MMIO; + else + kvm_incr_pc(vcpu); return 1; } @@ -163,6 +171,11 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) * volunteered to do so, and bail out otherwise. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { + if (vcpu_is_rec(vcpu)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; diff --git a/arch/arm64/kvm/rme-exit.c b/arch/arm64/kvm/rme-exit.c index d433ab72ee90..730e2aa621ae 100644 --- a/arch/arm64/kvm/rme-exit.c +++ b/arch/arm64/kvm/rme-exit.c @@ -25,6 +25,20 @@ static int rec_exit_reason_notimpl(struct kvm_vcpu *vcpu) static int rec_exit_sync_dabt(struct kvm_vcpu *vcpu) { + struct realm_rec *rec = &vcpu->arch.rec; + + /* + * In the case of a write, copy over gprs[0] to the target GPR, + * preparing to handle MMIO write fault. The content to be written has + * been saved to gprs[0] by the RMM (even if another register was used + * by the guest). In the case of normal memory access this is redundant + * (the guest will replay the instruction), but the overhead is + * minimal.
+ */ + if (kvm_vcpu_dabt_iswrite(vcpu) && kvm_vcpu_dabt_isvalid(vcpu)) + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), + rec->run->exit.gprs[0]); + return kvm_handle_guest_abort(vcpu); } -- Gitee From 3c1755e1ef3cc30b5495fe1d4ff1327ee6f0b7dd Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:41:29 +0800 Subject: [PATCH 53/84] arm64: RME: Allow populating initial contents community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The VMM needs to populate the realm with some data before starting (e.g. a kernel and initrd). This is measured by the RMM and used as part of the attestation later on. Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/kvm/rme.c | 227 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index e7b776a382b3..d964ccd7966c 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -621,6 +621,221 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long start, realm_unmap_private_range(kvm, start, end); } +static int realm_create_protected_data_granule(struct realm *realm, + unsigned long ipa, + phys_addr_t dst_phys, + phys_addr_t src_phys, + unsigned long flags) +{ + phys_addr_t rd = virt_to_phys(realm->rd); + int ret; + + if (rmi_granule_delegate(dst_phys)) + return -ENXIO; + + ret = rmi_data_create(rd, dst_phys, ipa, src_phys, flags); + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + /* Create missing RTTs and retry */ + int level = RMI_RETURN_INDEX(ret); + + WARN_ON(level == RMM_RTT_MAX_LEVEL); + + ret = realm_create_rtt_levels(realm, ipa, level, + RMM_RTT_MAX_LEVEL, NULL); + if (ret) + return -EIO; + + ret = rmi_data_create(rd, dst_phys, ipa, src_phys, flags); + } + if (ret) + return -EIO; + + return 0; +} + +static int realm_create_protected_data_page(struct realm *realm, + unsigned long ipa, + kvm_pfn_t dst_pfn, + kvm_pfn_t src_pfn, + unsigned long flags) +{ + unsigned long rd = virt_to_phys(realm->rd); + phys_addr_t dst_phys, src_phys; + bool undelegate_failed = false; + int ret, offset; + + dst_phys = __pfn_to_phys(dst_pfn); + src_phys = __pfn_to_phys(src_pfn); + + for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) { + ret = realm_create_protected_data_granule(realm, + ipa, + dst_phys, + src_phys, + flags); + if (ret) + goto err; + + ipa += RMM_PAGE_SIZE; + dst_phys += RMM_PAGE_SIZE; + src_phys += RMM_PAGE_SIZE; + } + + return 0; + +err: + if (ret == -EIO) { + /* current offset needs undelegating */ + if (WARN_ON(rmi_granule_undelegate(dst_phys))) + undelegate_failed = true; + } + while (offset > 0) { + ipa -= RMM_PAGE_SIZE; + offset -= RMM_PAGE_SIZE; + dst_phys -= RMM_PAGE_SIZE; + + rmi_data_destroy(rd, ipa, NULL, NULL); + + if (WARN_ON(rmi_granule_undelegate(dst_phys))) + undelegate_failed = true; + } + + if (undelegate_failed) { + /* + * A granule could not be undelegated, + * so the page has to be leaked + */ + get_page(pfn_to_page(dst_pfn)); + } + + return -ENXIO; +} + +static int populate_region(struct kvm *kvm, + phys_addr_t ipa_base, + phys_addr_t ipa_end, + unsigned long data_flags) +{ + struct realm *realm = &kvm->arch.realm; + struct kvm_memory_slot *memslot; + gfn_t base_gfn, end_gfn; + int idx; + phys_addr_t ipa = ipa_base; + int ret = 0; + + base_gfn = 
gpa_to_gfn(ipa_base); + end_gfn = gpa_to_gfn(ipa_end); + + idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, base_gfn); + if (!memslot) { + ret = -EFAULT; + goto out; + } + + /* We require the region to be contained within a single memslot */ + if (memslot->base_gfn + memslot->npages < end_gfn) { + ret = -EINVAL; + goto out; + } + + if (!kvm_slot_can_be_private(memslot)) { + ret = -EPERM; + goto out; + } + + while (ipa < ipa_end) { + struct vm_area_struct *vma; + unsigned long hva; + struct page *page; + bool writeable; + kvm_pfn_t pfn; + kvm_pfn_t priv_pfn; + struct page *gmem_page; + + hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa)); + vma = vma_lookup(current->mm, hva); + if (!vma) { + ret = -EFAULT; + break; + } + + pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE, + &writeable, &page); + + if (is_error_pfn(pfn)) { + ret = -EFAULT; + break; + } + + ret = kvm_gmem_get_pfn(kvm, memslot, + ipa >> PAGE_SHIFT, + &priv_pfn, &gmem_page, NULL); + if (ret) + break; + + ret = realm_create_protected_data_page(realm, ipa, + priv_pfn, + pfn, + data_flags); + + kvm_release_page_clean(page); + + if (ret) + break; + + ipa += PAGE_SIZE; + } + +out: + srcu_read_unlock(&kvm->srcu, idx); + return ret; +} + +static int kvm_populate_realm(struct kvm *kvm, + struct arm_rme_populate_realm *args) +{ + phys_addr_t ipa_base, ipa_end; + unsigned long data_flags = 0; + + if (kvm_realm_state(kvm) != REALM_STATE_NEW) + return -EPERM; + + if (!IS_ALIGNED(args->base, PAGE_SIZE) || + !IS_ALIGNED(args->size, PAGE_SIZE) || + (args->flags & ~RMI_MEASURE_CONTENT)) + return -EINVAL; + + ipa_base = args->base; + ipa_end = ipa_base + args->size; + + if (ipa_end < ipa_base) + return -EINVAL; + + if (args->flags & RMI_MEASURE_CONTENT) + data_flags |= RMI_MEASURE_CONTENT; + + /* + * Perform the population in parts to ensure locks are not held for too + * long + */ + while (ipa_base < ipa_end) { + phys_addr_t end = min(ipa_end, ipa_base + SZ_2M); + + int ret = populate_region(kvm, ipa_base, end, + args->flags); + + if (ret) + return ret; + + ipa_base = end; + + cond_resched(); + } + + return 0; +} + static int realm_set_ipa_state(struct kvm_vcpu *vcpu, unsigned long start, unsigned long end, @@ -871,6 +1086,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = kvm_init_ipa_range_realm(kvm, &args); break; } + case KVM_CAP_ARM_RME_POPULATE_REALM: { + struct arm_rme_populate_realm args; + void __user *argp = u64_to_user_ptr(cap->args[1]); + + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + + r = kvm_populate_realm(kvm, &args); + break; + } default: r = -EINVAL; break; -- Gitee From 6855c109b51852fb22ff06d906b1b5b79f99dec6 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:42:16 +0800 Subject: [PATCH 54/84] rme: populate guest memory region without guest_memfd community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://patchew.org/linux/20241004152804.72508-1-steven.price@arm.com/20241004152804.72508-21-steven.price@arm.com/ -------------------------------- The VMM needs to populate the realm with some data before starting (e.g. a kernel and initrd). This is measured by the RMM and used as part of the attestation later on. For now only 4k mappings are supported; future work may add support for larger mappings.
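The ordering in the change below is the point of the rework: once a granule is delegated the host must not write it, so the current contents are first bounced into a temporary page (tmp_page in the diff) that the RMM can read from during the measured copy. A stubbed, compilable sketch of just that ordering (the rmi_* functions are stand-ins modelling the real RMI interface, nothing more):

    #include <stdint.h>
    #include <string.h>

    #define GRANULE 4096

    /* Stubs standing in for the RMI calls; they only model the ordering. */
    static int rmi_granule_delegate(void *granule)
    {
        (void)granule;
        return 0;
    }

    static int rmi_data_create(void *rd, void *dst, uint64_t ipa,
                               const void *src, unsigned long flags)
    {
        (void)rd; (void)ipa; (void)flags;
        memcpy(dst, src, GRANULE);  /* the RMM copies (and measures) */
        return 0;
    }

    static char rd[GRANULE], dst[GRANULE], tmp[GRANULE];

    int main(void)
    {
        /* 1. preserve the payload before losing write access to dst */
        memcpy(tmp, dst, GRANULE);
        /* 2. hand the destination granule over to the RMM */
        if (rmi_granule_delegate(dst))
            return 1;
        /* 3. the RMM fills the realm granule from the bounce page */
        return rmi_data_create(rd, dst, 0x80000000ULL, tmp, 0);
    }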
Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/kvm/rme.c | 47 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index d964ccd7966c..c9f8d53dd4d2 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -655,8 +655,8 @@ static int realm_create_protected_data_granule(struct realm *realm, static int realm_create_protected_data_page(struct realm *realm, unsigned long ipa, - kvm_pfn_t dst_pfn, - kvm_pfn_t src_pfn, + struct page *dst_page, + struct page *src_page, unsigned long flags) { unsigned long rd = virt_to_phys(realm->rd); @@ -664,10 +664,12 @@ static int realm_create_protected_data_page(struct realm *realm, bool undelegate_failed = false; int ret, offset; - dst_phys = __pfn_to_phys(dst_pfn); - src_phys = __pfn_to_phys(src_pfn); + dst_phys = page_to_phys(dst_page); + src_phys = page_to_phys(src_page); + copy_page(page_address(src_page), page_address(dst_page)); for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) { + ret = realm_create_protected_data_granule(realm, ipa, dst_phys, @@ -705,7 +707,7 @@ static int realm_create_protected_data_page(struct realm *realm, * A granule could not be undelegated, * so the page has to be leaked */ - get_page(pfn_to_page(dst_pfn)); + get_page(dst_page); } return -ENXIO; @@ -721,6 +723,7 @@ static int populate_region(struct kvm *kvm, gfn_t base_gfn, end_gfn; int idx; phys_addr_t ipa = ipa_base; + struct page *tmp_page; int ret = 0; base_gfn = gpa_to_gfn(ipa_base); @@ -739,19 +742,19 @@ static int populate_region(struct kvm *kvm, goto out; } - if (!kvm_slot_can_be_private(memslot)) { - ret = -EPERM; + tmp_page = alloc_page(GFP_KERNEL); + if (!tmp_page) { + ret = -ENOMEM; goto out; } + mmap_read_lock(current->mm); + while (ipa < ipa_end) { struct vm_area_struct *vma; unsigned long hva; struct page *page; - bool writeable; kvm_pfn_t pfn; - kvm_pfn_t priv_pfn; - struct page *gmem_page; hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa)); vma = vma_lookup(current->mm, hva); @@ -760,34 +763,30 @@ static int populate_region(struct kvm *kvm, break; } - pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE, - &writeable, &page); + pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa)); if (is_error_pfn(pfn)) { ret = -EFAULT; break; } - ret = kvm_gmem_get_pfn(kvm, memslot, - ipa >> PAGE_SHIFT, - &priv_pfn, &gmem_page, NULL); - if (ret) - break; + page = pfn_to_page(pfn); ret = realm_create_protected_data_page(realm, ipa, - priv_pfn, - pfn, + page, + tmp_page, data_flags); - - kvm_release_page_clean(page); - - if (ret) + if (ret) { + kvm_release_page_clean(page); break; + } ipa += PAGE_SIZE; + kvm_release_pfn_dirty(pfn); } - out: + mmap_read_unlock(current->mm); + __free_page(tmp_page); srcu_read_unlock(&kvm->srcu, idx); return ret; } -- Gitee From 6a44641f8748de427b944d2bc1843047940ed020 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Tue, 17 Jun 2025 19:51:46 +0800 Subject: [PATCH 55/84] KVM: arm64: Move pagetable definitions to common header mainline inclusion from mainline-v6.12-rc1 commit 29caeda359da15d16963096043cda39530f81cc4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=29caeda359da15d16963096043cda39530f81cc4 -------------------------------- In preparation for using the stage-2 definitions in ptdump, move some of these macros in 
the common header. Signed-off-by: Sebastian Ene Link: https://lore.kernel.org/r/20240909124721.1672199-2-sebastianene@google.com Signed-off-by: Marc Zyngier [xuraoqing: only need macro KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R and KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W] Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_pgtable.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index d3e354bb8351..4153b77d8aae 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -44,6 +44,15 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) + +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. + */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; -- Gitee From 0bce492f5244da24a41176838f1a705ff6640c33 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:43:07 +0800 Subject: [PATCH 56/84] arm64: RME: Runtime faulting of memory community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- At runtime if the realm guest accesses memory which hasn't yet been mapped then KVM needs to either populate the region or fault the guest. For memory in the lower (protected) region of IPA a fresh page is provided to the RMM which will zero the contents. For memory in the upper (shared) region of IPA, the memory from the memslot is mapped into the realm VM non secure. Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_emulate.h | 10 ++ arch/arm64/include/asm/kvm_rme.h | 10 ++ arch/arm64/kvm/mmu.c | 64 +++++++++- arch/arm64/kvm/rme.c | 179 +++++++++++++++++++++++++++ 4 files changed, 257 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 426e8e6bbf09..4cab0024e260 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -633,6 +633,16 @@ static inline bool kvm_realm_is_created(struct kvm *kvm) return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE; } +static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t ipa) +{ + if (kvm_is_realm(kvm)) { + struct realm *realm = &kvm->arch.realm; + + return ipa & ~BIT(realm->ia_bits - 1); + } + return ipa; +} + static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_rme_is_available)) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index d86051ef0c5c..47aa6362c6c9 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -108,6 +108,16 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, unsigned long size, bool unmap_private); +int realm_map_protected(struct realm *realm, + unsigned long base_ipa, + kvm_pfn_t pfn, + unsigned long size, + struct kvm_mmu_memory_cache *memcache); +int realm_map_non_secure(struct realm *realm, + unsigned long ipa, + kvm_pfn_t pfn, + unsigned long size, + struct kvm_mmu_memory_cache *memcache); static inline bool kvm_realm_is_private_address(struct realm *realm, unsigned long addr) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ee4f379222fd..d60e47cc3196 100644 --- 
a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -325,8 +325,13 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap, - may_block)); + + if (kvm_is_realm(kvm)) + kvm_realm_unmap_range(kvm, start, size, !only_shared); + else + WARN_ON(stage2_apply_range(mmu, start, end, + kvm_pgtable_stage2_unmap, + may_block)); } static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, @@ -341,7 +346,11 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush); + if (kvm_is_realm(kvm)) + kvm_realm_unmap_range(kvm, addr, end - addr, false); + else + stage2_apply_range_resched(&kvm->arch.mmu, addr, end, + kvm_pgtable_stage2_flush); } /** @@ -1007,6 +1016,10 @@ void stage2_unmap_vm(struct kvm *kvm) struct kvm_memory_slot *memslot; int idx, bkt; + /* For realms this is handled by the RMM so nothing to do here */ + if (kvm_is_realm(kvm)) + return; + idx = srcu_read_lock(&kvm->srcu); mmap_read_lock(current->mm); write_lock(&kvm->mmu_lock); @@ -1030,6 +1043,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) if (kvm_is_realm(kvm) && (kvm_realm_state(kvm) != REALM_STATE_DEAD && kvm_realm_state(kvm) != REALM_STATE_NONE)) { + unmap_stage2_range(mmu, 0, (~0ULL) & PAGE_MASK, false); write_unlock(&kvm->mmu_lock); kvm_realm_destroy_rtts(kvm, pgt->ia_bits); @@ -1419,6 +1433,25 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa, + kvm_pfn_t pfn, unsigned long map_size, + enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *memcache) +{ + struct realm *realm = &kvm->arch.realm; + + if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W))) + return -EFAULT; + + ipa = ALIGN_DOWN(ipa, PAGE_SIZE); + + if (!kvm_realm_is_private_address(realm, ipa)) + return realm_map_non_secure(realm, ipa, pfn, map_size, + memcache); + + return realm_map_protected(realm, ipa, pfn, map_size, memcache); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1442,6 +1475,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level); write_fault = kvm_is_write_fault(vcpu); + + /* + * Realms cannot map protected pages read-only + * FIXME: It should be possible to map unprotected pages read-only + */ + if (vcpu_is_rec(vcpu)) + write_fault = true; + exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); @@ -1515,7 +1556,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) fault_ipa &= ~(vma_pagesize - 1); - gfn = fault_ipa >> PAGE_SHIFT; + gfn = kvm_gpa_from_fault(kvm, fault_ipa) >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); /* Don't use the VMA after the unlock -- it may have vanished */ @@ -1616,6 +1657,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); + else if (kvm_is_realm(kvm)) + ret = 
realm_map_ipa(kvm, fault_ipa, pfn, vma_pagesize, + prot, memcache); else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, @@ -1725,7 +1769,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) idx = srcu_read_lock(&vcpu->kvm->srcu); - gfn = fault_ipa >> PAGE_SHIFT; + gfn = kvm_gpa_from_fault(vcpu->kvm, fault_ipa) >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); @@ -1770,7 +1814,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * of the page size. */ fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, fault_ipa); + ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, fault_ipa)); goto out_unlock; } @@ -1850,6 +1894,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; + /* We don't support aging for Realms */ + if (kvm_is_realm(kvm)) + return true; + return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); @@ -1862,6 +1910,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; + /* We don't support aging for Realms */ + if (kvm_is_realm(kvm)) + return true; + return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, false); diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index c9f8d53dd4d2..770493f111a9 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -713,6 +713,185 @@ static int realm_create_protected_data_page(struct realm *realm, return -ENXIO; } +static int fold_rtt(struct realm *realm, unsigned long addr, int level) +{ + phys_addr_t rtt_addr; + int ret; + + ret = realm_rtt_fold(realm, addr, level, &rtt_addr); + if (ret) + return ret; + + free_delegated_granule(rtt_addr); + + return 0; +} + +int realm_map_protected(struct realm *realm, + unsigned long ipa, + kvm_pfn_t pfn, + unsigned long map_size, + struct kvm_mmu_memory_cache *memcache) +{ + phys_addr_t phys = __pfn_to_phys(pfn); + phys_addr_t rd = virt_to_phys(realm->rd); + unsigned long base_ipa = ipa; + unsigned long size; + int map_level; + int ret = 0; + + if (WARN_ON(!IS_ALIGNED(map_size, RMM_PAGE_SIZE))) + return -EINVAL; + + if (WARN_ON(!IS_ALIGNED(ipa, map_size))) + return -EINVAL; + + if (IS_ALIGNED(map_size, RMM_L2_BLOCK_SIZE)) + map_level = 2; + else + map_level = 3; + + if (map_level < RMM_RTT_MAX_LEVEL) { + /* + * A temporary RTT is needed during the map, precreate it, + * however if there is an error (e.g. missing parent tables) + * this will be handled below. + */ + realm_create_rtt_levels(realm, ipa, map_level, + RMM_RTT_MAX_LEVEL, memcache); + } + + for (size = 0; size < map_size; size += RMM_PAGE_SIZE) { + if (rmi_granule_delegate(phys)) { + /* + * It's likely we raced with another VCPU on the same + * fault. Assume the other VCPU has handled the fault + * and return to the guest. 
+ */ + return 0; + } + + ret = rmi_data_create_unknown(rd, phys, ipa); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + /* Create missing RTTs and retry */ + int level = RMI_RETURN_INDEX(ret); + + WARN_ON(level == RMM_RTT_MAX_LEVEL); + + ret = realm_create_rtt_levels(realm, ipa, level, + RMM_RTT_MAX_LEVEL, + memcache); + if (ret) + goto err_undelegate; + + ret = rmi_data_create_unknown(rd, phys, ipa); + } + + if (WARN_ON(ret)) + goto err_undelegate; + + phys += RMM_PAGE_SIZE; + ipa += RMM_PAGE_SIZE; + } + + if (map_size == RMM_L2_BLOCK_SIZE) { + ret = fold_rtt(realm, base_ipa, map_level + 1); + if (WARN_ON(ret)) + goto err; + } + + return 0; + +err_undelegate: + if (WARN_ON(rmi_granule_undelegate(phys))) { + /* Page can't be returned to NS world so is lost */ + get_page(phys_to_page(phys)); + } +err: + while (size > 0) { + unsigned long data, top; + + phys -= RMM_PAGE_SIZE; + size -= RMM_PAGE_SIZE; + ipa -= RMM_PAGE_SIZE; + + WARN_ON(rmi_data_destroy(rd, ipa, &data, &top)); + + if (WARN_ON(rmi_granule_undelegate(phys))) { + /* Page can't be returned to NS world so is lost */ + get_page(phys_to_page(phys)); + } + } + return -ENXIO; +} + +int realm_map_non_secure(struct realm *realm, + unsigned long ipa, + kvm_pfn_t pfn, + unsigned long size, + struct kvm_mmu_memory_cache *memcache) +{ + phys_addr_t rd = virt_to_phys(realm->rd); + phys_addr_t phys = __pfn_to_phys(pfn); + unsigned long offset; + int map_size, map_level; + int ret = 0; + + if (WARN_ON(!IS_ALIGNED(size, RMM_PAGE_SIZE))) + return -EINVAL; + + if (WARN_ON(!IS_ALIGNED(ipa, size))) + return -EINVAL; + + if (IS_ALIGNED(size, RMM_L2_BLOCK_SIZE)) { + map_level = 2; + map_size = RMM_L2_BLOCK_SIZE; + } else { + map_level = 3; + map_size = RMM_PAGE_SIZE; + } + + for (offset = 0; offset < size; offset += map_size) { + /* + * realm_map_ipa() enforces that the memory is writable, + * so for now we permit both read and write. + */ + unsigned long desc = phys | + PTE_S2_MEMATTR(MT_S2_FWB_NORMAL) | + (3 << 6); + ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + /* Create missing RTTs and retry */ + int level = RMI_RETURN_INDEX(ret); + + ret = realm_create_rtt_levels(realm, ipa, level, + map_level, memcache); + if (ret) + return -ENXIO; + + ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc); + } + /* + * RMI_ERROR_RTT can be reported for two reasons: either the + * RTT tables are not there, or there is an RTTE already + * present for the address. The call to + * realm_create_rtt_levels() above handles the first case, and + * in the second case this indicates that another thread has + * already populated the RTTE for us, so we can ignore the + * error and continue. + */ + if (ret && RMI_RETURN_STATUS(ret) != RMI_ERROR_RTT) + return -ENXIO; + + ipa += map_size; + phys += map_size; + } + + return 0; +} + static int populate_region(struct kvm *kvm, phys_addr_t ipa_base, phys_addr_t ipa_end, -- Gitee From 4f7cae6d9c14d1b9c9c1a83e5c70cfe802e5c985 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:44:56 +0800 Subject: [PATCH 57/84] KVM: arm64: Handle realm VCPU load community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- When loading a realm VCPU much of the work is handled by the RMM so only some of the actions are required. 
Rearrange kvm_arch_vcpu_load() slightly so we can bail out early for a realm guest. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Conflicts:arch/arm64/kvm/arm.c Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8c4f859e97a8..cbdf41282057 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -485,10 +485,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vgic_load(vcpu); kvm_timer_vcpu_load(vcpu); - if (has_vhe()) - kvm_vcpu_load_sysregs_vhe(vcpu); - kvm_arch_vcpu_load_fp(vcpu); - kvm_vcpu_pmu_restore_guest(vcpu); if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); @@ -501,6 +497,15 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu_ptrauth_disable(vcpu); kvm_arch_vcpu_load_debug_state_flags(vcpu); + /* No additional state needs to be loaded on Realmed VMs */ + if (vcpu_is_rec(vcpu)) + return; + + if (has_vhe()) + kvm_vcpu_load_sysregs_vhe(vcpu); + kvm_arch_vcpu_load_fp(vcpu); + kvm_vcpu_pmu_restore_guest(vcpu); + if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); } -- Gitee From 339d4d1e27c709678adb62a149546c5c4eca40a8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:45:49 +0800 Subject: [PATCH 58/84] KVM: arm64: Validate register access for a Realm VM community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM only allows setting the GPRS (x0-x30) and PC for a realm guest. Check this in kvm_arm_set_reg() so that the VMM can receive a suitable error return if other registers are written to. The RMM makes similar restrictions for reading of the guest's registers (this is *confidential* compute after all), however we don't impose the restriction here. This allows the VMM to read (stale) values from the registers which might be useful to read back the initial values even if the RMM doesn't provide the latest version. For migration of a realm VM, a new interface will be needed so that the VMM can receive an (encrypted) blob of the VM's state. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/guest.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index efe82cc86bd1..55631ebdec45 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -73,6 +73,24 @@ static u64 core_reg_offset_from_id(u64 id) return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } +static bool kvm_realm_validate_core_reg(u64 off) +{ + /* + * Note that GPRs can only sometimes be controlled by the VMM. + * For PSCI only X0-X6 are used, higher registers are ignored (restored + * from the REC). + * For HOST_CALL all of X0-X30 are copied to the RsiHostCall structure. + * For emulated MMIO X0 is always used. + */ + switch (off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... 
+ KVM_REG_ARM_CORE_REG(regs.regs[30]): + case KVM_REG_ARM_CORE_REG(regs.pc): + return true; + } + return false; +} + static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off) { int size; @@ -780,12 +798,34 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) return kvm_arm_sys_reg_get_reg(vcpu, reg); } +/* + * The RMI ABI only enables setting some GPRs and PC. The selection of GPRs + * that are available depends on the Realm state and the reason for the last + * exit. All other registers are reset to architectural or otherwise defined + * reset values by the RMM, except for a few configuration fields that + * correspond to Realm parameters. + */ +static bool validate_realm_set_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) { + u64 off = core_reg_offset_from_id(reg->id); + + return kvm_realm_validate_core_reg(off); + } + + return false; +} + int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; + if (kvm_is_realm(vcpu->kvm) && !validate_realm_set_reg(vcpu, reg)) + return -EINVAL; + switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); case KVM_REG_ARM_FW: -- Gitee From f7a9447929846b4c3f220e6e9fe7ded8c8361c3a Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:46:40 +0800 Subject: [PATCH 59/84] KVM: arm64: Handle Realm PSCI requests community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM needs to be informed of the target REC when a PSCI call is made with an MPIDR argument. Expose an ioctl to the userspace in case the PSCI is handled by it. Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 3 +++ arch/arm64/kvm/arm.c | 25 +++++++++++++++++++++++++ arch/arm64/kvm/psci.c | 30 ++++++++++++++++++++++++++++++ arch/arm64/kvm/rme.c | 14 ++++++++++++++ 4 files changed, 72 insertions(+) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 47aa6362c6c9..7000fa1227d0 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -118,6 +118,9 @@ int realm_map_non_secure(struct realm *realm, kvm_pfn_t pfn, unsigned long size, struct kvm_mmu_memory_cache *memcache); +int realm_psci_complete(struct kvm_vcpu *source, + struct kvm_vcpu *target, + unsigned long status); static inline bool kvm_realm_is_private_address(struct realm *realm, unsigned long addr) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index cbdf41282057..e949cadc2d17 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1476,6 +1476,22 @@ static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, return __kvm_arm_vcpu_set_events(vcpu, events); } +static int kvm_arm_vcpu_rmm_psci_complete(struct kvm_vcpu *vcpu, + struct kvm_arm_rmm_psci_complete *arg) +{ + struct kvm_vcpu *target = kvm_mpidr_to_vcpu(vcpu->kvm, arg->target_mpidr); + + if (!target) + return -EINVAL; + + /* + * RMM v1.0 only supports PSCI_RET_SUCCESS or PSCI_RET_DENIED + * for the status. 
But let us leave it to the RMM to filter, + * making this future-proof. + */ + return realm_psci_complete(vcpu, target, arg->psci_status); +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1598,6 +1614,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return kvm_arm_vcpu_finalize(vcpu, what); } + case KVM_ARM_VCPU_RMM_PSCI_COMPLETE: { + struct kvm_arm_rmm_psci_complete req; + + if (!vcpu_is_rec(vcpu)) + return -EPERM; + if (copy_from_user(&req, argp, sizeof(req))) + return -EFAULT; + return kvm_arm_vcpu_rmm_psci_complete(vcpu, &req); + } default: r = -EINVAL; } diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 1f69b667332b..750b3899d462 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -103,6 +103,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) reset_state->reset = true; kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); + /* + * Make sure we issue PSCI_COMPLETE before the VCPU can be + * scheduled. + */ + if (vcpu_is_rec(vcpu)) + realm_psci_complete(source_vcpu, vcpu, PSCI_RET_SUCCESS); /* * Make sure the reset request is observed if the RUNNABLE mp_state is @@ -115,6 +121,11 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) out_unlock: spin_unlock(&vcpu->arch.mp_state_lock); + if (vcpu_is_rec(vcpu) && ret != PSCI_RET_SUCCESS) { + realm_psci_complete(source_vcpu, vcpu, + ret == PSCI_RET_ALREADY_ON ? + PSCI_RET_SUCCESS : PSCI_RET_DENIED); + } return ret; } @@ -142,6 +153,25 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) /* Ignore other bits of target affinity */ target_affinity &= target_affinity_mask; + if (vcpu_is_rec(vcpu)) { + struct kvm_vcpu *target_vcpu; + + /* RMM supports only zero affinity level */ + if (lowest_affinity_level != 0) + return PSCI_RET_INVALID_PARAMS; + + target_vcpu = kvm_mpidr_to_vcpu(kvm, target_affinity); + if (!target_vcpu) + return PSCI_RET_INVALID_PARAMS; + + /* + * Provide the references of the source and target RECs to the + * RMM so that the RMM can complete the PSCI request. + */ + realm_psci_complete(vcpu, target_vcpu, PSCI_RET_SUCCESS); + return PSCI_RET_SUCCESS; + } + /* * If one or more VCPU matching target affinity are running * then ON else OFF diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 770493f111a9..0eb24131b322 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -144,6 +144,20 @@ static void free_delegated_granule(phys_addr_t phys) free_page((unsigned long)phys_to_virt(phys)); } +int realm_psci_complete(struct kvm_vcpu *source, struct kvm_vcpu *target, + unsigned long status) +{ + int ret; + + ret = rmi_psci_complete(virt_to_phys(source->arch.rec.rec_page), + virt_to_phys(target->arch.rec.rec_page), + status); + if (ret) + return -EINVAL; + + return 0; +} + static int realm_rtt_create(struct realm *realm, unsigned long addr, int level, -- Gitee From 0ab0dc1e0bb623f3732c214af6d7a1c17ec2831f Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:47:19 +0800 Subject: [PATCH 60/84] KVM: arm64: WARN on injected undef exceptions community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM doesn't allow injection of an undefined exception into a realm guest. Add a WARN to catch if this ever happens.
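Returning to the KVM_ARM_VCPU_RMM_PSCI_COMPLETE ioctl added by the previous patch: when userspace handles PSCI itself, it completes the request on the fd of the vCPU that made the PSCI call. A hedged sketch, assuming a kvm_arm_rmm_psci_complete layout matching the fields the handler reads (the UAPI header is not part of this hunk, and the padding is a guess):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <linux/psci.h>

/* Assumed layout; only target_mpidr and psci_status are consumed by KVM. */
struct kvm_arm_rmm_psci_complete {
	uint64_t target_mpidr;	/* MPIDR the guest passed to PSCI_CPU_ON */
	uint32_t psci_status;	/* PSCI_RET_SUCCESS or PSCI_RET_DENIED for RMM v1.0 */
	uint32_t padding[3];
};

/* Issued on the *source* vCPU (the caller of PSCI), not the target. */
static int rmm_psci_complete(int source_vcpu_fd, uint64_t target_mpidr,
			     int32_t status)
{
	struct kvm_arm_rmm_psci_complete req = {
		.target_mpidr = target_mpidr,
		.psci_status  = (uint32_t)status,
	};

	return ioctl(source_vcpu_fd, KVM_ARM_VCPU_RMM_PSCI_COMPLETE, &req);
}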
Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/kvm/inject_fault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 2027d8e2fda2..50f434af0838 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -226,6 +226,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu) */ void kvm_inject_undefined(struct kvm_vcpu *vcpu) { + WARN(vcpu_is_rec(vcpu), "Unexpected undefined exception injection to REC"); if (vcpu_el1_is_32bit(vcpu)) inject_undef32(vcpu); else -- Gitee From 1cd989ebf37efb6086615882545145e9d99ac423 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:48:21 +0800 Subject: [PATCH 61/84] arm64: Don't expose stolen time for realm guests community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Exposing stolen time doesn't make much sense, as a realm guest wouldn't want to trust the host. It will also need some extra work to ensure that KVM will only attempt to write into a shared memory region. So for now just disable it. Reviewed-by: Suzuki K Poulose Reviewed-by: Gavin Shan Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- Documentation/virt/kvm/api.rst | 3 +++ arch/arm64/kvm/arm.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 49f510bd3a55..df694e75d2d0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8237,6 +8237,9 @@ is supported, than the other should as well and vice versa. For arm64 see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL". For x86 see Documentation/virt/kvm/x86/msr.rst "MSR_KVM_STEAL_TIME". +Note that steal time accounting is not available when a guest is running +within an Arm CCA realm (machine type KVM_VM_TYPE_ARM_REALM). + 8.25 KVM_CAP_S390_DIAG318 ------------------------- diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e949cadc2d17..64df6b6cc4b4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -313,7 +313,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = system_supports_mte(); break; case KVM_CAP_STEAL_TIME: - r = kvm_arm_pvtime_supported(); + if (kvm_is_realm(kvm)) + r = 0; + else + r = kvm_arm_pvtime_supported(); break; case KVM_CAP_ARM_EL1_32BIT: r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); -- Gitee From 2bcc721d4f3323ad951da6b353027994125b8395 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Sat, 14 Jun 2025 15:49:06 +0800 Subject: [PATCH 62/84] arm64: RME: allow userspace to inject aborts community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Extend KVM_SET_VCPU_EVENTS to support realms, where KVM cannot set the system registers, and the RMM must perform the injection on the next REC entry.
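As a usage sketch for the events path below: after an MMIO exit on an unprotected IPA that userspace cannot emulate, the VMM requests a synchronous external abort (SEA) for the next REC entry through the existing KVM_SET_VCPU_EVENTS API. Only the standard arm64 kvm_vcpu_events UAPI is assumed here:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Ask KVM (and, via the REC entry flags, the RMM) to inject an SEA into
 * the realm vCPU. Only valid when the last exit from this vCPU was an
 * emulatable data abort on an unprotected IPA; otherwise the ioctl fails
 * with -EINVAL, as implemented in the hunk below.
 */
static int inject_external_abort(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	memset(&events, 0, sizeof(events));
	events.exception.ext_dabt_pending = 1;

	return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}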
Signed-off-by: Joey Gouly Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- Documentation/virt/kvm/api.rst | 2 ++ arch/arm64/kvm/guest.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index df694e75d2d0..22c664fedbd6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1283,6 +1283,8 @@ User space may need to inject several types of events to the guest. Set the pending SError exception state for this VCPU. It is not possible to 'cancel' an Serror that has been made pending. +User space cannot inject SErrors into Realms. + If the guest performed an access to I/O memory which could not be handled by userspace, for example because of missing instruction syndrome decode information or because there is no device mapped at the accessed IPA, then diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 55631ebdec45..e0203b5cbe14 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -877,6 +877,30 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, bool has_esr = events->exception.serror_has_esr; bool ext_dabt_pending = events->exception.ext_dabt_pending; + if (vcpu_is_rec(vcpu)) { + /* Cannot inject SError into a Realm. */ + if (serror_pending) + return -EINVAL; + + /* + * If a data abort is pending, set the flag and let the RMM + * inject an SEA when the REC is scheduled to be run. + */ + if (ext_dabt_pending) { + /* + * Can only inject SEA into a Realm if the previous exit + * was due to a data abort of an Unprotected IPA. + */ + if (!(vcpu->arch.rec.run->enter.flags & REC_ENTER_FLAG_EMULATED_MMIO)) + return -EINVAL; + + vcpu->arch.rec.run->enter.flags &= ~REC_ENTER_FLAG_EMULATED_MMIO; + vcpu->arch.rec.run->enter.flags |= REC_ENTER_FLAG_INJECT_SEA; + } + + return 0; + } + if (serror_pending && has_esr) { if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) return -EINVAL; -- Gitee From 752d34677fef9770e886e13b3c3fa05469ac56bc Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Sat, 14 Jun 2025 15:50:00 +0800 Subject: [PATCH 63/84] arm64: RME: support RSI_HOST_CALL community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- From: Joey Gouly Forward RSI_HOST_CALLS to KVM's HVC handler. 
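For orientation, a host call arrives from the guest as an RSI SMC whose argument points at a structure of GPRs; the RMM copies those registers into the REC exit record, and the handler added below mirrors them into the vCPU so the normal SMCCC path runs. An illustrative guest-side view follows (all names and the layout are placeholders for the RSI specification's definitions, which are not part of this series):

#include <stdint.h>

/* Placeholder for the RsiHostCall structure defined by the RSI spec. */
struct rsi_host_call {
	uint16_t imm;		/* immediate value chosen by the guest */
	uint64_t gprs[31];	/* surfaced to the host via rec->run->exit.gprs[] */
};

/*
 * Guest: SMC(RSI_HOST_CALL, ipa_of(&host_call))
 * RMM:   exits to the host with RMI_EXIT_HOST_CALL
 * Host:  rec_exit_host_call() copies exit.gprs[] into the vCPU registers
 *        and calls kvm_smccc_call_handler(), i.e. it is treated like an
 *        ordinary guest hypercall.
 */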
Signed-off-by: Joey Gouly Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/kvm/rme-exit.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/kvm/rme-exit.c b/arch/arm64/kvm/rme-exit.c index 730e2aa621ae..68533ad7e057 100644 --- a/arch/arm64/kvm/rme-exit.c +++ b/arch/arm64/kvm/rme-exit.c @@ -106,6 +106,19 @@ static int rec_exit_ripas_change(struct kvm_vcpu *vcpu) return 1; } +static int rec_exit_host_call(struct kvm_vcpu *vcpu) +{ + int i; + struct realm_rec *rec = &vcpu->arch.rec; + + vcpu->stat.hvc_exit_stat++; + + for (i = 0; i < REC_RUN_GPRS; i++) + vcpu_set_reg(vcpu, i, rec->run->exit.gprs[i]); + + return kvm_smccc_call_handler(vcpu); +} + static void update_arch_timer_irq_lines(struct kvm_vcpu *vcpu) { struct realm_rec *rec = &vcpu->arch.rec; @@ -167,6 +180,8 @@ int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_ret) return rec_exit_psci(vcpu); case RMI_EXIT_RIPAS_CHANGE: return rec_exit_ripas_change(vcpu); + case RMI_EXIT_HOST_CALL: + return rec_exit_host_call(vcpu); } kvm_pr_unimpl("Unsupported exit reason: %u\n", -- Gitee From 6a736e216da235e12a7a387dc9697b8fa47a741c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Sat, 14 Jun 2025 15:50:30 +0800 Subject: [PATCH 64/84] arm64: RME: Allow checking SVE on VM instance community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Given we have different types of VMs supported, check the support for SVE for the given instance of the VM to accurately report the status. Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 2 ++ arch/arm64/kvm/arm.c | 5 ++++- arch/arm64/kvm/rme.c | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 7000fa1227d0..f786fd978cf6 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -94,6 +94,8 @@ void kvm_init_rme(void); u32 kvm_realm_ipa_limit(void); u32 kvm_realm_vgic_nr_lr(void); +bool kvm_rme_supports_sve(void); + int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); int kvm_init_realm_vm(struct kvm *kvm); void kvm_destroy_realm(struct kvm *kvm); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 64df6b6cc4b4..909e51fbd946 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -337,7 +337,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = get_kvm_ipa_limit(); break; case KVM_CAP_ARM_SVE: - r = system_supports_sve(); + if (kvm_is_realm(kvm)) + r = kvm_rme_supports_sve(); + else + r = system_supports_sve(); break; case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 0eb24131b322..03436288d9d5 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -38,6 +38,11 @@ static bool rme_has_feature(unsigned long feature) return !!u64_get_bits(rmm_feat_reg0, feature); } +bool kvm_rme_supports_sve(void) +{ + return rme_has_feature(RMI_FEATURE_REGISTER_0_SVE_EN); +} + static int rmi_check_version(void) { struct arm_smccc_res res; -- Gitee From 8ec7f2e2b86273a65ad7e52a6e67c77b2e593657 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:51:25 +0800 Subject: [PATCH 65/84] arm64: RME: 
Always use 4k pages for realms community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Guest_memfd doesn't yet natively support huge pages, and there are currently difficulties for a VMM to manage huge pages efficiently, so for now always split up mappings to PTE (4k). The two issues that need progressing before supporting huge pages for realms are: 1. guest_memfd needs to be able to allocate from an appropriate allocator which can provide huge pages. 2. The VMM needs to be able to repurpose private memory for a shared mapping when the guest VM requests memory is transitioned. Because this can happen at a 4k granularity it isn't possible to free/reallocate while huge pages are in use. Allowing the VMM to mmap() the shared portion of a huge page would allow the huge page to be recreated when the memory is unshared and made protected again. These two issues are not specific to realms and don't affect the realm API, so for now just break everything down to 4k pages in the RMM controlled stage 2. Future work can add huge page support without changing the uAPI. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d60e47cc3196..64d8fd83fd6c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1524,6 +1524,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; + } else if (vcpu_is_rec(vcpu)) { + /* Force PTE level mappings for realms */ + force_pte = true; + vma_shift = PAGE_SHIFT; } else { vma_shift = get_vma_page_shift(vma, hva); } -- Gitee From c4020be6a233834e0b62a399d51ce3bb9de140de Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:52:09 +0800 Subject: [PATCH 66/84] arm64: RME: Prevent Device mappings for Realms community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Physical device assignment is not yet supported by the RMM, so it doesn't make much sense to allow device mappings within the realm. Prevent them when the guest is a realm. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/mmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 64d8fd83fd6c..512a6b743d04 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1117,6 +1117,10 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (is_protected_kvm_enabled()) return -EPERM; + /* We don't support mapping special pages into a Realm */ + if (kvm_is_realm(kvm)) + return -EPERM; + size += offset_in_page(guest_ipa); guest_ipa &= PAGE_MASK; @@ -1609,6 +1613,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; + /* + * Adapted from cca-v8. + * OLK-6.6 does not implement private_memslot_fault(), which depends + * on the guest_memfd feature from newer kernels, so faults on + * protected addresses are handled here, except for protected devices.
+ */ + if (device && vcpu_is_rec(vcpu) && + kvm_gpa_from_fault(kvm, fault_ipa) == fault_ipa) + return -EINVAL; + read_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) -- Gitee From 64b602e497a0300f137e962256e39794c66a0a65 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:53:06 +0800 Subject: [PATCH 67/84] arm_pmu: Provide a mechanism for disabling the physical IRQ community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Arm CCA assigns the physical PMU device to the guest running in realm world, however the IRQs are routed via the host. To enter a realm guest while a PMU IRQ is pending it is necessary to block the physical IRQ to prevent an immediate exit. Provide a mechanism in the PMU driver for KVM to control the physical IRQ. Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- drivers/perf/arm_pmu.c | 15 +++++++++++++++ include/linux/perf/arm_pmu.h | 7 ++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index b3bb4d62f13d..aa79e76ef21a 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -734,6 +734,21 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) return 0; } +void arm_pmu_set_phys_irq(bool enable) +{ + int cpu = get_cpu(); + struct arm_pmu *pmu = per_cpu(cpu_armpmu, cpu); + int irq; + + irq = armpmu_get_cpu_irq(pmu, cpu); + if (irq && !enable) + per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); + else if (irq && enable) + per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); + + put_cpu(); +} + #ifdef CONFIG_CPU_PM static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) { diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 143fbc10ecfe..266c7c82affd 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -174,6 +174,7 @@ void kvm_host_pmu_init(struct arm_pmu *pmu); #endif bool arm_pmu_irq_is_nmi(void); +void arm_pmu_set_phys_irq(bool enable); /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); @@ -184,7 +185,11 @@ void armpmu_free_irq(int irq, int cpu); #define ARMV8_PMU_PDEV_NAME "armv8-pmu" -#endif /* CONFIG_ARM_PMU */ +#else /* CONFIG_ARM_PMU */ + +static inline void arm_pmu_set_phys_irq(bool enable) {} + +#endif #define ARMV8_SPE_PDEV_NAME "arm,spe-v1" #define ARMV8_TRBE_PDEV_NAME "arm,trbe" -- Gitee From 6095b9100dbd3456163ef60960be9284de96a393 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Mon, 23 Jun 2025 15:45:29 +0800 Subject: [PATCH 68/84] KVM: arm64: PMU: Set PMCR_EL0.N for vCPU based on the associated PMU mainline inclusion from mainline-v6.7-rc1 commit 4d20debf9ca160720a0b01ba4f2dc3d62296c4d1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4d20debf9ca160720a0b01ba4f2dc3d62296c4d1 -------------------------------- The number of PMU event counters is indicated in PMCR_EL0.N. For a vCPU with PMUv3 configured, the value is set to the same value as the current PE on every vCPU reset. Unless the vCPU is pinned to PEs that has the PMU associated to the guest from the initial vCPU reset, the value might be different from the PMU's PMCR_EL0.N on heterogeneous PMU systems. 
Fix this by setting the vCPU's PMCR_EL0.N to the PMU's PMCR_EL0.N value. Track the PMCR_EL0.N per guest, as only one PMU can be set for the guest (PMCR_EL0.N must be the same for all vCPUs of the guest), and it is convenient for updating the value. To achieve this, the patch introduces a helper, kvm_arm_pmu_get_max_counters(), that reads the maximum number of counters from the arm_pmu associated to the VM. Make the function global as upcoming patches will be interested to know the value while setting the PMCR.N of the guest from userspace. KVM does not yet support userspace modifying PMCR_EL0.N. The following patch will add support for that. Reviewed-by: Sebastian Ott Co-developed-by: Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Reiji Watanabe Signed-off-by: Raghavendra Rao Ananta Link: https://lore.kernel.org/r/20231020214053.2144305-5-rananta@google.com Signed-off-by: Oliver Upton Conflicts:arch/arm64/kvm/pmu-emul.c Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/pmu-emul.c | 27 ++++++++++++++++++++++++++- arch/arm64/kvm/sys_regs.c | 27 ++++++++++++++------------- include/kvm/arm_pmu.h | 6 ++++++ 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index db84d0622021..1a395abedfc9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -257,6 +257,9 @@ struct kvm_arch { cpumask_var_t supported_cpus; + /* PMCR_EL0.N value for the guest */ + u8 pmcr_n; + /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index d02944e9b4b7..b54a86f01c3f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -872,6 +872,29 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) return true; } +/** + * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. + * @kvm: The kvm pointer + */ +u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + + /* + * The arm_pmu->num_events considers the cycle counter as well. + * Ignore that and return only the general-purpose counters. + */ + return arm_pmu->num_events - 1; +} + +static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) +{ + lockdep_assert_held(&kvm->arch.config_lock); + + kvm->arch.arm_pmu = arm_pmu; + kvm->arch.pmcr_n = kvm_arm_pmu_get_max_counters(kvm); +} + static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) { struct kvm *kvm = vcpu->kvm; @@ -1077,5 +1100,7 @@ u8 kvm_arm_pmu_get_pmuver_limit(void) */ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { - return __vcpu_sys_reg(vcpu, PMCR_EL0); + u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); + + return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1ce2e1975d57..12eecceb47db 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -727,11 +727,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); - /* No PMU available, any PMU reg may UNDEF... 
*/ - if (!kvm_arm_support_pmu_v3()) - return 0; - - n = FIELD_GET(ARMV8_PMU_PMCR_N, read_sysreg(pmcr_el0)); + n = vcpu->kvm->arch.pmcr_n; if (n) mask |= GENMASK(n - 1, 0); @@ -767,17 +763,15 @@ static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 pmcr; + u64 pmcr = 0; - /* No PMU available, PMCR_EL0 may UNDEF... */ - if (!kvm_arm_support_pmu_v3()) - return 0; - - /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ - pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N; if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; + /* + * The value of PMCR.N field is included when the + * vCPU register is read via kvm_vcpu_read_pmcr(). + */ __vcpu_sys_reg(vcpu, r->reg) = pmcr; return __vcpu_sys_reg(vcpu, r->reg); @@ -1108,6 +1102,13 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = kvm_vcpu_read_pmcr(vcpu); + return 0; +} + /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ @@ -2224,7 +2225,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SVCR), undef_access }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, - .reset = reset_pmcr, .reg = PMCR_EL0 }, + .reset = reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr }, { PMU_SYS_REG(PMCNTENSET_EL0), .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, { PMU_SYS_REG(PMCNTENCLR_EL0), diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 934ed4bc96d8..2a84425148c9 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -101,6 +101,7 @@ void kvm_vcpu_pmu_resync_el0(void); }) u8 kvm_arm_pmu_get_pmuver_limit(void); +u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); #else @@ -175,6 +176,11 @@ static inline u8 kvm_arm_pmu_get_pmuver_limit(void) } static inline void kvm_vcpu_pmu_resync_el0(void) {} +static inline u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) +{ + return 0; +} + static inline u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { return 0; -- Gitee From 29a5e36ec3bc0df2ae9c423def531987ab69defd Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Mon, 23 Jun 2025 15:47:11 +0800 Subject: [PATCH 69/84] KVM: arm64: PMU: Allow userspace to limit PMCR_EL0.N for the guest mainline inclusion from mainline-v6.7-rc1 commit ea9ca904d24ff15ded92fd76c16462c47bcae2f8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ea9ca904d24ff15ded92fd76c16462c47bcae2f8 -------------------------------- KVM does not yet support userspace modifying PMCR_EL0.N (with the previous patch, KVM ignores what is written by userspace). Add support for userspace limiting PMCR_EL0.N. Disallow userspace from setting PMCR_EL0.N to a value that is greater than the host value as KVM doesn't support more event counters than what the host HW implements. Also, make this register immutable after the VM has started running. To maintain the existing expectations, instead of returning an error, KVM returns a success for these two cases. Finally, ignore writes to read-only bits that are cleared on vCPU reset, and RES{0,1} bits (including writable bits that KVM doesn't support yet), as those bits shouldn't be modified (at least with the current KVM).
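A userspace sketch of the resulting flow, assuming only the architectural PMCR_EL0 encoding (op0=3, op1=3, CRn=9, CRm=12, op2=0, the same encoding a later patch in this series names KVM_REG_ARM_PMCR_EL0) and the architectural position of PMCR_EL0.N in bits [15:11]:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define REG_PMCR_EL0		ARM64_SYS_REG(3, 3, 9, 12, 0)
#define PMCR_EL0_N_SHIFT	11
#define PMCR_EL0_N_MASK		(0x1fULL << PMCR_EL0_N_SHIFT)

/* Limit the guest to 'n' event counters; must happen before the VM runs. */
static int limit_pmu_counters(int vcpu_fd, uint64_t n)
{
	uint64_t pmcr;
	struct kvm_one_reg reg = {
		.id   = REG_PMCR_EL0,
		.addr = (uint64_t)(uintptr_t)&pmcr,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))	/* N defaults to the host PMU's value */
		return -1;

	pmcr = (pmcr & ~PMCR_EL0_N_MASK) | ((n << PMCR_EL0_N_SHIFT) & PMCR_EL0_N_MASK);

	/* Values above the host maximum are silently ignored, per the patch above. */
	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}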
Co-developed-by: Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Reiji Watanabe Signed-off-by: Raghavendra Rao Ananta Signed-off-by: Oliver Upton Conflicts:arch/arm64/kvm/sys_regs.c Signed-off-by: Xu Raoqing --- arch/arm64/kvm/sys_regs.c | 46 +++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 12eecceb47db..cc1a77b44bbc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -725,9 +725,9 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); + u64 mask = BIT(ARMV8_PMU_CYCLE_IDX); + u8 n = vcpu->kvm->arch.pmcr_n; - n = vcpu->kvm->arch.pmcr_n; if (n) mask |= GENMASK(n - 1, 0); @@ -1109,6 +1109,44 @@ static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, return 0; } +static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val); + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + + /* + * The vCPU can't have more counters than the PMU hardware + * implements. Ignore this error to maintain compatibility + * with the existing KVM behavior. + */ + if (!kvm_vm_has_ran_once(kvm) && + new_n <= kvm_arm_pmu_get_max_counters(kvm)) + kvm->arch.pmcr_n = new_n; + + mutex_unlock(&kvm->arch.config_lock); + + /* + * Ignore writes to RES0 bits, read only bits that are cleared on + * vCPU reset, and writable bits that KVM doesn't support yet. + * (i.e. only PMCR.N and bits [7:0] are mutable from userspace) + * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the vCPU. + * But, we leave the bit as it is here, as the vCPU's PMUver might + * be changed later (NOTE: the bit will be cleared on first vCPU run + * if necessary). + */ + val &= ARMV8_PMU_PMCR_MASK; + + /* The LC bit is RES1 when AArch32 is not supported */ + if (!kvm_supports_32bit_el0()) + val |= ARMV8_PMU_PMCR_LC; + + __vcpu_sys_reg(vcpu, r->reg) = val; + return 0; +} + /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ @@ -2224,8 +2262,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CTR_EL0), access_ctr }, { SYS_DESC(SYS_SVCR), undef_access }, - { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, - .reset = reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr }, + { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, + .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, { PMU_SYS_REG(PMCNTENSET_EL0), .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, { PMU_SYS_REG(PMCNTENCLR_EL0), -- Gitee From 4e217e27c74b7a33a8e90700937db82172135491 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 20 Oct 2023 21:40:41 +0000 Subject: [PATCH 70/84] KVM: arm64: PMU: Introduce helpers to set the guest's PMU mainline inclusion from mainline-v6.7-rc1 commit 1616ca6f3c10723c1b60ae44724212fae88f502d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1616ca6f3c10723c1b60ae44724212fae88f502d -------------------------------- Introduce new helper functions to set the guest's PMU (kvm->arch.arm_pmu) either to a default probed instance or to a caller requested one, and use it when the guest's PMU needs to be set. 
These helpers will make it easier for the following patches to modify the relevant code. No functional change intended. Reviewed-by: Sebastian Ott Signed-off-by: Reiji Watanabe Signed-off-by: Raghavendra Rao Ananta Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231020214053.2144305-2-rananta@google.com Signed-off-by: Oliver Upton [arch/arm64/kvm/pmu-emul.c: remove duplicate kvm_arm_set_pmu] Signed-off-by: Xu Raoqing --- arch/arm64/kvm/pmu-emul.c | 59 ++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index b54a86f01c3f..12a48c023f75 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -872,6 +872,37 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) return true; } +static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) +{ + lockdep_assert_held(&kvm->arch.config_lock); + + kvm->arch.arm_pmu = arm_pmu; + kvm->arch.pmcr_n = kvm_arm_pmu_get_max_counters(kvm); +} + +/** + * kvm_arm_set_default_pmu - No PMU set, get the default one. + * @kvm: The kvm pointer + * + * The observant among you will notice that the supported_cpus + * mask does not get updated for the default PMU even though it + * is quite possible the selected instance supports only a + * subset of cores in the system. This is intentional, and + * upholds the preexisting behavior on heterogeneous systems + * where vCPUs can be scheduled on any core but the guest + * counters could stop working. + */ +static int kvm_arm_set_default_pmu(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); + + if (!arm_pmu) + return -ENODEV; + + kvm_arm_set_pmu(kvm, arm_pmu); + return 0; +} + /** * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. * @kvm: The kvm pointer @@ -887,14 +918,6 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) return arm_pmu->num_events - 1; } -static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) -{ - lockdep_assert_held(&kvm->arch.config_lock); - - kvm->arch.arm_pmu = arm_pmu; - kvm->arch.pmcr_n = kvm_arm_pmu_get_max_counters(kvm); -} - static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) { struct kvm *kvm = vcpu->kvm; @@ -914,7 +937,7 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) break; } - kvm->arch.arm_pmu = arm_pmu; + kvm_arm_set_pmu(kvm, arm_pmu); cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); ret = 0; break; @@ -938,20 +961,10 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -EBUSY; if (!kvm->arch.arm_pmu) { - /* - * No PMU set, get the default one. - * - * The observant among you will notice that the supported_cpus - * mask does not get updated for the default PMU even though it - * is quite possible the selected instance supports only a - * subset of cores in the system. This is intentional, and - * upholds the preexisting behavior on heterogeneous systems - * where vCPUs can be scheduled on any core but the guest - * counters could stop working. 
- */ - kvm->arch.arm_pmu = kvm_pmu_probe_armpmu(); - if (!kvm->arch.arm_pmu) - return -ENODEV; + int ret = kvm_arm_set_default_pmu(kvm); + + if (ret) + return ret; } switch (attr->attr) { -- Gitee From b3da8e4e75501ecb6776826880588c41bb1bc996 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 17 Jun 2025 20:29:12 +0800 Subject: [PATCH 71/84] arm64: rme: Enable PMU support with a realm guest community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Use the PMU registers from the RmiRecExit structure to identify when an overflow interrupt is due and inject it into the guest. Also hook up the configuration option for enabling the PMU within the guest. When entering a realm guest with a PMU interrupt pending, it is necessary to disable the physical interrupt. Otherwise when the RMM restores the PMU state the physical interrupt will trigger causing an immediate exit back to the host. The guest is expected to acknowledge the interrupt causing a host exit (to update the GIC state) which gives the opportunity to re-enable the physical interrupt before the next PMU event. Number of PMU counters is configured by the VMM by writing to PMCR.N. Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arm.c | 11 +++++++++++ arch/arm64/kvm/guest.c | 7 +++++++ arch/arm64/kvm/pmu-emul.c | 3 +++ arch/arm64/kvm/rme.c | 8 ++++++++ arch/arm64/kvm/sys_regs.c | 2 +- include/kvm/arm_pmu.h | 4 ++++ 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 909e51fbd946..d9344cf152fc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -980,6 +981,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_UNKNOWN; run->flags = 0; while (ret > 0) { + bool pmu_stopped = false; + /* * Check conditions before entering the guest */ @@ -1008,6 +1011,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_pmu_flush_hwstate(vcpu); + if (vcpu_is_rec(vcpu) && kvm_pmu_get_irq_level(vcpu)) { + pmu_stopped = true; + arm_pmu_set_phys_irq(false); + } + local_irq_disable(); kvm_vgic_flush_hwstate(vcpu); @@ -1110,6 +1118,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) preempt_enable(); + if (pmu_stopped) + arm_pmu_set_phys_irq(true); + /* * The ARMv8 architecture doesn't give the hypervisor * a mechanism to prevent a guest from dropping to AArch32 EL0 diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index e0203b5cbe14..29505f5b2fb5 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -798,6 +798,8 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) return kvm_arm_sys_reg_get_reg(vcpu, reg); } +#define KVM_REG_ARM_PMCR_EL0 ARM64_SYS_REG(3, 3, 9, 12, 0) + /* * The RMI ABI only enables setting some GPRs and PC. 
The selection of GPRs * that are available depends on the Realm state and the reason for the last @@ -812,6 +814,11 @@ static bool validate_realm_set_reg(struct kvm_vcpu *vcpu, u64 off = core_reg_offset_from_id(reg->id); return kvm_realm_validate_core_reg(off); + } else { + switch (reg->id) { + case KVM_REG_ARM_PMCR_EL0: + return true; + } } return false; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 12a48c023f75..062ab8e273f2 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -323,6 +323,9 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { u64 reg = 0; + if (vcpu_is_rec(vcpu)) + return vcpu->arch.rec.run->exit.pmu_ovf_status; + if ((kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) { reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 03436288d9d5..d6cf4a79b1e7 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -366,6 +366,11 @@ static int realm_create_rd(struct kvm *kvm) params->rtt_base = kvm->arch.mmu.pgd_phys; params->vmid = realm->vmid; + if (kvm->arch.arm_pmu) { + params->pmu_num_ctrs = kvm->arch.pmcr_n; + params->flags |= RMI_REALM_PARAM_FLAG_PMU; + } + params_phys = virt_to_phys(params); if (rmi_realm_create(rd_phys, params_phys)) { @@ -1483,6 +1488,9 @@ int kvm_create_rec(struct kvm_vcpu *vcpu) if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) return -EINVAL; + if (vcpu->kvm->arch.arm_pmu && !kvm_vcpu_has_pmu(vcpu)) + return -EINVAL; + BUILD_BUG_ON(sizeof(*params) > PAGE_SIZE); BUILD_BUG_ON(sizeof(*rec->run) > PAGE_SIZE); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index cc1a77b44bbc..c21e2024d237 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1122,7 +1122,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, * implements. Ignore this error to maintain compatibility * with the existing KVM behavior. */ - if (!kvm_vm_has_ran_once(kvm) && + if (!kvm_vm_has_ran_once(kvm) && !kvm_realm_is_created(kvm) && new_n <= kvm_arm_pmu_get_max_counters(kvm)) kvm->arch.pmcr_n = new_n; diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 2a84425148c9..b675cc217cf7 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -76,6 +76,8 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_resync_el0(void); +#define kvm_pmu_get_irq_level(vcpu) ((vcpu)->arch.pmu.irq_level) + #define kvm_vcpu_has_pmu(vcpu) \ (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) @@ -165,6 +167,8 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return 0; } +#define kvm_pmu_get_irq_level(vcpu) (false) + #define kvm_vcpu_has_pmu(vcpu) ({ false; }) #define kvm_pmu_is_3p5(vcpu) ({ false; }) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} -- Gitee From 775ee103d49fd2143a111822a1541eb3dba5cc67 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 15:58:13 +0800 Subject: [PATCH 72/84] arm64: RME: Hide KVM_CAP_READONLY_MEM for realm guests community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- For protected memory read only isn't supported by the RMM. While it may be possible to support read only for unprotected memory, this isn't supported at the present time. 
Note that this does mean that ROM (or flash) data cannot be emulated correctly by the VMM as the stage 2 mappings are either always read/write or are trapped as MMIO (so don't support operations where the syndrome information doesn't allow emulation, e.g. load/store pair). This restriction can be lifted in the future by allowing the stage 2 mappings to be made read only. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d9344cf152fc..d5f44277ca30 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -258,7 +258,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ARM_PSCI: case KVM_CAP_ARM_PSCI_0_2: - case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_VCPU_EVENTS: @@ -272,6 +271,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_COUNTER_OFFSET: + case KVM_CAP_READONLY_MEM: case KVM_CAP_SET_GUEST_DEBUG: r = !kvm_is_realm(kvm); break; -- Gitee From a2c7ca76383cd62af5a24aa171d2b1cd6146b90e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Sat, 14 Jun 2025 15:58:49 +0800 Subject: [PATCH 73/84] arm64: RME: Propagate number of breakpoints and watchpoints to userspace community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- The RMM describes the maximum number of BPs/WPs available to the guest in the Feature Register 0. Propagate those numbers into ID_AA64DFR0_EL1, which is visible to userspace. A VMM needs this information in order to set up realm parameters. 
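For illustration, a hypothetical userspace sketch of reading those numbers back (a sketch, not the series' code; it assumes an arm64 host's uapi headers, and the register encoding matches the KVM_REG_ARM_ID_AA64DFR0_EL1 definition introduced later in this series):

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int print_dbg_resources(int vcpu_fd)
{
        uint64_t val;
        struct kvm_one_reg reg = {
                .id   = ARM64_SYS_REG(3, 0, 0, 5, 0),  /* ID_AA64DFR0_EL1 */
                .addr = (uint64_t)&val,
        };

        if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))
                return -1;

        /* BRPs is bits [15:12], WRPs is bits [23:20], both "count - 1" */
        printf("breakpoints: %u, watchpoints: %u\n",
               (unsigned int)((val >> 12) & 0xf) + 1,
               (unsigned int)((val >> 20) & 0xf) + 1);
        return 0;
}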
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 2 ++ arch/arm64/kvm/rme.c | 22 ++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 2 +- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index f786fd978cf6..09cbb61816f3 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -94,6 +94,8 @@ void kvm_init_rme(void); u32 kvm_realm_ipa_limit(void); u32 kvm_realm_vgic_nr_lr(void); +u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + bool kvm_rme_supports_sve(void); int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index d6cf4a79b1e7..a2f051848d8d 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -87,6 +87,28 @@ u32 kvm_realm_vgic_nr_lr(void) return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_GICV3_NUM_LRS); } +u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + u32 bps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_BPS); + u32 wps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_WPS); + u32 ctx_cmps; + + if (!kvm_is_realm(vcpu->kvm)) + return val; + + /* Ensure CTX_CMPs is still valid */ + ctx_cmps = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs, val); + ctx_cmps = min(bps, ctx_cmps); + + val &= ~(ID_AA64DFR0_EL1_BRPs_MASK | ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_CTX_CMPs); + val |= FIELD_PREP(ID_AA64DFR0_EL1_BRPs_MASK, bps) | + FIELD_PREP(ID_AA64DFR0_EL1_WRPs_MASK, wps) | + FIELD_PREP(ID_AA64DFR0_EL1_CTX_CMPs, ctx_cmps); + + return val; +} + static int get_start_level(struct realm *realm) { return 4 - ((realm->ia_bits - 8) / (RMM_PAGE_SHIFT - 3)); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c21e2024d237..00d5a95774ef 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1545,7 +1545,7 @@ static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, /* Hide SPE from guests */ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; - return val; + return kvm_realm_reset_id_aa64dfr0_el1(vcpu, val); } static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, -- Gitee From 600b6d73cf3498f50d7ff06ce5a4fd7c6253dea1 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Sat, 21 Jun 2025 18:07:41 +0800 Subject: [PATCH 74/84] arm64: RME: Set breakpoint parameters through SET_ONE_REG community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://patchwork.kernel.org/project/kvm/patch/20250416134208.383984-36-steven.price@arm.com/ -------------------------------- Allow userspace to configure the number of breakpoints and watchpoints of a Realm VM through KVM_SET_ONE_REG ID_AA64DFR0_EL1. The KVM sys_reg handler checks the user value against the maximum value given by RMM (arm64_check_features() gets it from the read_sanitised_id_aa64dfr0_el1() reset handler). Userspace discovers that it can write these fields by issuing a KVM_ARM_GET_REG_WRITABLE_MASKS ioctl. 
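A minimal sketch of that flow from the VMM side (hypothetical; the requested counts are illustrative, and the write has to happen before the Realm descriptor is created or it is rejected):

static int shrink_dbg_resources(int vcpu_fd)
{
        uint64_t val;
        struct kvm_one_reg reg = {
                .id   = ARM64_SYS_REG(3, 0, 0, 5, 0),  /* ID_AA64DFR0_EL1 */
                .addr = (uint64_t)&val,
        };

        if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))     /* RMM maximum */
                return -1;

        /* Request 2 breakpoints, 2 watchpoints, 2 context comparators:
         * BRPs [15:12], WRPs [23:20], CTX_CMPs [31:28], all "count - 1".
         * CTX_CMPs must not exceed BRPs, and the write fails if any
         * field is above the RMM-provided maximum. */
        val &= ~((0xfULL << 12) | (0xfULL << 20) | (0xfULL << 28));
        val |= (1ULL << 12) | (1ULL << 20) | (1ULL << 28);

        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}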
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Steven Price Reviewed-by: Gavin Shan Conflicts:arch/arm64/kvm/rme.c Signed-off-by: Xu Raoqing --- arch/arm64/kvm/guest.c | 2 ++ arch/arm64/kvm/rme.c | 3 +++ arch/arm64/kvm/sys_regs.c | 19 ++++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 29505f5b2fb5..693879bb38b2 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -799,6 +799,7 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } #define KVM_REG_ARM_PMCR_EL0 ARM64_SYS_REG(3, 3, 9, 12, 0) +#define KVM_REG_ARM_ID_AA64DFR0_EL1 ARM64_SYS_REG(3, 0, 0, 5, 0) /* * The RMI ABI only enables setting some GPRs and PC. The selection of GPRs @@ -817,6 +818,7 @@ static bool validate_realm_set_reg(struct kvm_vcpu *vcpu, } else { switch (reg->id) { case KVM_REG_ARM_PMCR_EL0: + case KVM_REG_ARM_ID_AA64DFR0_EL1: return true; } } diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index a2f051848d8d..9001ee357207 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -351,6 +351,7 @@ static int realm_create_rd(struct kvm *kvm) void *rd = NULL; phys_addr_t rd_phys, params_phys; size_t pgd_size = kvm_pgtable_stage2_pgd_size(kvm->arch.vtcr); + u64 dfr0 = IDREG(kvm, SYS_ID_AA64DFR0_EL1); int i, r; int rtt_num_start; @@ -387,6 +388,8 @@ static int realm_create_rd(struct kvm *kvm) params->rtt_num_start = rtt_num_start; params->rtt_base = kvm->arch.mmu.pgd_phys; params->vmid = realm->vmid; + params->num_bps = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); + params->num_wps = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); if (kvm->arch.arm_pmu) { params->pmu_num_ctrs = kvm->arch.pmcr_n; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 00d5a95774ef..f644b01a4414 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1552,7 +1552,11 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { + u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); + u8 bps = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, val); + u8 wps = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, val); + u8 ctx_cmps = SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, val); /* * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the @@ -1571,6 +1575,14 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + /* + * ID_AA64DFR0_EL1.DebugVer, BRPs and WRPs all have to be greater than + * zero. CTX_CMPs is never greater than BRPs. + */ + if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP || !bps || !wps || + ctx_cmps > bps) + return -EINVAL; + return set_id_reg(vcpu, rd, val); } @@ -1681,10 +1693,11 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, mutex_lock(&vcpu->kvm->arch.config_lock); /* - * Once the VM has started the ID registers are immutable. Reject any - * write that does not match the final register value. + * Once the VM has started or the Realm descriptor is created, the ID + * registers are immutable. Reject any write that does not match the + * final register value. 
*/ - if (kvm_vm_has_ran_once(vcpu->kvm)) { + if (kvm_vm_has_ran_once(vcpu->kvm) || kvm_realm_is_created(vcpu->kvm)) { if (val != read_id_reg(vcpu, rd)) ret = -EBUSY; else -- Gitee From d3ca2150ecb5af570aa59646d246721efb4d963f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 23 Jun 2025 16:06:24 +0800 Subject: [PATCH 75/84] arm64: RME: Initialize PMCR.N with number counter supported by RMM community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://patchwork.kernel.org/project/kvm/patch/20250416134208.383984-37-steven.price@arm.com/ -------------------------------- Provide an accurate number of available PMU counters to userspace when setting up a Realm. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 1 + arch/arm64/kvm/pmu-emul.c | 3 +++ arch/arm64/kvm/rme.c | 5 +++++ 3 files changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 09cbb61816f3..773e730da364 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -93,6 +93,7 @@ struct realm_rec { void kvm_init_rme(void); u32 kvm_realm_ipa_limit(void); u32 kvm_realm_vgic_nr_lr(void); +u8 kvm_realm_max_pmu_counters(void); u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 062ab8e273f2..419cd108bf91 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -914,6 +914,9 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + if (kvm_is_realm(kvm)) + return kvm_realm_max_pmu_counters(); + /* * The arm_pmu->num_events considers the cycle counter as well. * Ignore that and return only the general-purpose counters. diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 9001ee357207..9ba880abcee9 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -87,6 +87,11 @@ u32 kvm_realm_vgic_nr_lr(void) return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_GICV3_NUM_LRS); } +u8 kvm_realm_max_pmu_counters(void) +{ + return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS); +} + u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { u32 bps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_BPS); -- Gitee From 599d621c53fb032b4cc49c3f2715ed9c7e686181 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Sat, 14 Jun 2025 16:00:15 +0800 Subject: [PATCH 76/84] arm64: RME: Propagate max SVE vector length from RMM community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- RMM provides the maximum vector length it supports for a guest in its feature register. Make it visible to the rest of KVM and to userspace via KVM_REG_ARM64_SVE_VLS. 
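The encoding used in the hunk below is compact: the RMI feature field stores the vector length as a vector-quadword (VQ) count minus one, where one VQ is 128 bits. A small sketch of the arithmetic (an illustration of what sve_vl_from_vq() computes, not new kernel code):

/* Byte length from the RMI SVE_VL field: (field + 1) VQs of 16 bytes
 * each, i.e. sve_vl_from_vq(field + 1). A field value of 0 means a
 * 16-byte VL; 15 means the architectural maximum of 256 bytes
 * (2048 bits). */
static inline unsigned int rmi_sve_field_to_vl(unsigned int field)
{
        return (field + 1) * 16;
}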
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_rme.h | 1 + arch/arm64/kvm/guest.c | 2 +- arch/arm64/kvm/reset.c | 12 ++++++++++-- arch/arm64/kvm/rme.c | 6 ++++++ 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1a395abedfc9..698f551808b5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -73,9 +73,9 @@ enum kvm_mode kvm_get_mode(void); static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; #endif -extern unsigned int __ro_after_init kvm_sve_max_vl; extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); +unsigned int kvm_sve_get_max_vl(struct kvm *kvm); u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 773e730da364..98a83b1ed4f4 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -94,6 +94,7 @@ void kvm_init_rme(void); u32 kvm_realm_ipa_limit(void); u32 kvm_realm_vgic_nr_lr(void); u8 kvm_realm_max_pmu_counters(void); +unsigned int kvm_realm_sve_max_vl(void); u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 693879bb38b2..3bdfc58dd0b5 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -374,7 +374,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (vq_present(vqs, vq)) max_vq = vq; - if (max_vq > sve_vq_from_vl(kvm_sve_max_vl)) + if (max_vq > sve_vq_from_vl(kvm_sve_get_max_vl(vcpu->kvm))) return -EINVAL; /* diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index c73735ad8fae..38282c3b5236 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -45,7 +45,7 @@ static u32 __ro_after_init kvm_ipa_limit; #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) -unsigned int __ro_after_init kvm_sve_max_vl; +static unsigned int __ro_after_init kvm_sve_max_vl; unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void) @@ -76,12 +76,20 @@ int __init kvm_arm_init_sve(void) return 0; } +unsigned int kvm_sve_get_max_vl(struct kvm *kvm) +{ + if (kvm_is_realm(kvm)) + return kvm_realm_sve_max_vl(); + else + return kvm_sve_max_vl; +} + static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) { if (!system_supports_sve()) return -EINVAL; - vcpu->arch.sve_max_vl = kvm_sve_max_vl; + vcpu->arch.sve_max_vl = kvm_sve_get_max_vl(vcpu->kvm); /* * Userspace can still customize the vector lengths by writing diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 9ba880abcee9..5c84c52283ca 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -92,6 +92,12 @@ u8 kvm_realm_max_pmu_counters(void) return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS); } +unsigned int kvm_realm_sve_max_vl(void) +{ + return sve_vl_from_vq(u64_get_bits(rmm_feat_reg0, + RMI_FEATURE_REGISTER_0_SVE_VL) + 1); +} + u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { u32 bps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_BPS); -- Gitee From c4b3d076d2495f831ddd61dbdd9f61e7370c6dd1 Mon Sep 17 00:00:00 2001 From: 
Jean-Philippe Brucker Date: Sat, 14 Jun 2025 16:00:51 +0800 Subject: [PATCH 77/84] arm64: RME: Configure max SVE vector length for a Realm community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Obtain the max vector length configured by userspace on the vCPUs, and write it into the Realm parameters. By default the vCPU is configured with the max vector length reported by RMM, and userspace can reduce it with a write to KVM_REG_ARM64_SVE_VLS. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/guest.c | 3 ++- arch/arm64/kvm/rme.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3bdfc58dd0b5..fe608e7b9333 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -360,7 +360,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (!vcpu_has_sve(vcpu)) return -ENOENT; - if (kvm_arm_vcpu_sve_finalized(vcpu)) + if (kvm_arm_vcpu_sve_finalized(vcpu) || kvm_realm_is_created(vcpu->kvm)) return -EPERM; /* too late! */ if (WARN_ON(vcpu->arch.sve_state)) @@ -819,6 +819,7 @@ static bool validate_realm_set_reg(struct kvm_vcpu *vcpu, switch (reg->id) { case KVM_REG_ARM_PMCR_EL0: case KVM_REG_ARM_ID_AA64DFR0_EL1: + case KVM_REG_ARM64_SVE_VLS: return true; } } diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 5c84c52283ca..ff2e57de6394 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -341,6 +341,44 @@ static void realm_unmap_shared_range(struct kvm *kvm, } } +static int realm_init_sve_param(struct kvm *kvm, struct realm_params *params) +{ + int ret = 0; + unsigned long i; + struct kvm_vcpu *vcpu; + int vl, last_vl = -1; + + /* + * Get the preferred SVE configuration, set by userspace with the + * KVM_ARM_VCPU_SVE feature and KVM_REG_ARM64_SVE_VLS pseudo-register. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (vcpu_has_sve(vcpu)) { + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + ret = -EINVAL; + vl = vcpu->arch.sve_max_vl; + } else { + vl = 0; + } + mutex_unlock(&vcpu->mutex); + if (ret) + return ret; + + /* We need all vCPUs to have the same SVE config */ + if (last_vl >= 0 && last_vl != vl) + return -EINVAL; + + last_vl = vl; + } + + if (last_vl > 0) { + params->sve_vl = sve_vq_from_vl(last_vl) - 1; + params->flags |= RMI_REALM_PARAM_FLAG_SVE; + } + return 0; +} + /* Calculate the number of s2 root rtts needed */ static int realm_num_root_rtts(struct realm *realm) { @@ -407,6 +445,10 @@ static int realm_create_rd(struct kvm *kvm) params->flags |= RMI_REALM_PARAM_FLAG_PMU; } + r = realm_init_sve_param(kvm, params); + if (r) + goto out_undelegate_tables; + params_phys = virt_to_phys(params); if (rmi_realm_create(rd_phys, params_phys)) { -- Gitee From b6964feac21502752aa46660c38af2c71536766d Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Sat, 14 Jun 2025 16:01:24 +0800 Subject: [PATCH 78/84] arm64: RME: Provide register list for unfinalized RME RECs community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- KVM_GET_REG_LIST should not be called before SVE is finalized.
The ioctl handler currently returns -EPERM in this case. But because it uses kvm_arm_vcpu_is_finalized(), it now also rejects the call for unfinalized REC even though finalizing the REC can only be done late, after Realm descriptor creation. Move the check to copy_sve_reg_indices(). One adverse side effect of this change is that a KVM_GET_REG_LIST call that only probes for the array size will now succeed even if SVE is not finalized, but that seems harmless since the following KVM_GET_REG_LIST with the full array will fail. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arm.c | 4 ---- arch/arm64/kvm/guest.c | 9 +++------ 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d5f44277ca30..bb8929924af2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1563,10 +1563,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (unlikely(!kvm_vcpu_initialized(vcpu))) break; - r = -EPERM; - if (!kvm_arm_vcpu_is_finalized(vcpu)) - break; - r = -EFAULT; if (copy_from_user(®_list, user_list, sizeof(reg_list))) break; diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index fe608e7b9333..66b503a2be23 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -671,12 +671,9 @@ static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { const unsigned int slices = vcpu_sve_slices(vcpu); - if (!vcpu_has_sve(vcpu)) + if (!vcpu_has_sve(vcpu) || !kvm_arm_vcpu_sve_finalized(vcpu)) return 0; - /* Policed by KVM_GET_REG_LIST: */ - WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); - return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + 1; /* KVM_REG_ARM64_SVE_VLS */ } @@ -692,8 +689,8 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, if (!vcpu_has_sve(vcpu)) return 0; - /* Policed by KVM_GET_REG_LIST: */ - WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; /* * Enumerate this first, so that userspace can save/restore in -- Gitee From a919f4c0ff3cd85ba0e8f2ec903e81dc7975e38a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Sat, 14 Jun 2025 16:02:43 +0800 Subject: [PATCH 79/84] arm64: RME: Provide accurate register list community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Userspace can set a few registers with KVM_SET_ONE_REG (9 GP registers at runtime, and 3 system registers during initialization). Update the register list returned by KVM_GET_REG_LIST. 
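For reference, a hypothetical VMM-side sketch of the usual two-call KVM_GET_REG_LIST pattern; with this patch a realm vCPU gets back only the reduced set that the RMI ABI actually lets userspace access:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_reg_list *vcpu_get_reg_list(int vcpu_fd)
{
        struct kvm_reg_list probe = { .n = 0 }, *list;

        /* First call fails with E2BIG but fills in the required count */
        ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe);

        list = calloc(1, sizeof(*list) + probe.n * sizeof(__u64));
        if (!list)
                return NULL;

        list->n = probe.n;
        if (ioctl(vcpu_fd, KVM_GET_REG_LIST, list)) {
                free(list);
                return NULL;
        }
        return list;
}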
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Xu Raoqing --- arch/arm64/kvm/guest.c | 19 ++++++++---- arch/arm64/kvm/hypercalls.c | 4 +-- arch/arm64/kvm/sys_regs.c | 59 ++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 66b503a2be23..0566e3429366 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -618,8 +618,6 @@ static const u64 timer_reg_list[] = { KVM_REG_ARM_PTIMER_CVAL, }; -#define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) - static bool is_timer_reg(u64 index) { switch (index) { @@ -634,9 +632,14 @@ static bool is_timer_reg(u64 index) return false; } +static unsigned long num_timer_regs(struct kvm_vcpu *vcpu) +{ + return kvm_is_realm(vcpu->kvm) ? 0 : ARRAY_SIZE(timer_reg_list); +} + static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - for (int i = 0; i < NUM_TIMER_REGS; i++) { + for (int i = 0; i < num_timer_regs(vcpu); i++) { if (put_user(timer_reg_list[i], uindices)) return -EFAULT; uindices++; @@ -674,6 +677,9 @@ static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) if (!vcpu_has_sve(vcpu) || !kvm_arm_vcpu_sve_finalized(vcpu)) return 0; + if (kvm_is_realm(vcpu->kvm)) + return 1; /* KVM_REG_ARM64_SVE_VLS */ + return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + 1; /* KVM_REG_ARM64_SVE_VLS */ } @@ -701,6 +707,9 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, return -EFAULT; ++num_regs; + if (kvm_is_realm(vcpu->kvm)) + return num_regs; + for (i = 0; i < slices; i++) { for (n = 0; n < SVE_NUM_ZREGS; n++) { reg = KVM_REG_ARM64_SVE_ZREG(n, i); @@ -738,7 +747,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); - res += NUM_TIMER_REGS; + res += num_timer_regs(vcpu); return res; } @@ -770,7 +779,7 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) ret = copy_timer_indices(vcpu, uindices); if (ret < 0) return ret; - uindices += NUM_TIMER_REGS; + uindices += num_timer_regs(vcpu); return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 7fb4df0456de..c06e9352a80a 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -397,14 +397,14 @@ void kvm_arm_teardown_hypercalls(struct kvm *kvm) int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(kvm_arm_fw_reg_ids); + return kvm_is_realm(vcpu->kvm) ? 0 : ARRAY_SIZE(kvm_arm_fw_reg_ids); } int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { int i; - for (i = 0; i < ARRAY_SIZE(kvm_arm_fw_reg_ids); i++) { + for (i = 0; i < kvm_arm_get_fw_num_regs(vcpu); i++) { if (put_user(kvm_arm_fw_reg_ids[i], uindices++)) return -EFAULT; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f644b01a4414..2c4f4d4a1f55 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3572,18 +3572,18 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } -static unsigned int num_demux_regs(void) +static unsigned int num_demux_regs(struct kvm_vcpu *vcpu) { - return CSSELR_MAX; + return kvm_is_realm(vcpu->kvm) ? 
0 : CSSELR_MAX; } -static int write_demux_regids(u64 __user *uindices) +static int write_demux_regids(struct kvm_vcpu *vcpu, u64 __user *uindices) { u64 val = KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; unsigned int i; val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; - for (i = 0; i < CSSELR_MAX; i++) { + for (i = 0; i < num_demux_regs(vcpu); i++) { if (put_user(val | i, uindices)) return -EFAULT; uindices++; @@ -3591,6 +3591,24 @@ static int write_demux_regids(u64 __user *uindices) return 0; } +static unsigned int num_invariant_regs(struct kvm_vcpu *vcpu) +{ + return kvm_is_realm(vcpu->kvm) ? 0 : ARRAY_SIZE(invariant_sys_regs); +} + +static int write_invariant_regids(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + unsigned int i; + + for (i = 0; i < num_invariant_regs(vcpu); i++) { + if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) + return -EFAULT; + uindices++; + } + return 0; +} + + static u64 sys_reg_to_index(const struct sys_reg_desc *reg) { return (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | @@ -3614,11 +3632,27 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) return true; } +static bool kvm_realm_sys_reg_hidden_user(const struct kvm_vcpu *vcpu, u64 reg) +{ + if (!kvm_is_realm(vcpu->kvm)) + return false; + + switch (reg) { + case SYS_ID_AA64DFR0_EL1: + case SYS_PMCR_EL0: + return false; + } + return true; +} + static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 __user **uind, unsigned int *total) { + if (kvm_realm_sys_reg_hidden_user(vcpu, reg_to_encoding(rd))) + return 0; + /* * Ignore registers we trap but don't save, * and for which no custom user accessor is provided. @@ -3656,29 +3690,26 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(invariant_sys_regs) - + num_demux_regs() + return num_invariant_regs(vcpu) + + num_demux_regs(vcpu) + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; int err; - /* Then give them all the invariant registers' indices. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { - if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) - return -EFAULT; - uindices++; - } + err = write_invariant_regids(vcpu, uindices); + if (err) + return err; + uindices += num_invariant_regs(vcpu); err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; uindices += err; - return write_demux_regids(uindices); + return write_demux_regids(vcpu, uindices); } int __init kvm_sys_reg_table_init(void) -- Gitee From 72b19ba68a94003ff576ebab561e9f2916e8ec40 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 16:03:48 +0800 Subject: [PATCH 80/84] KVM: arm64: Expose KVM_ARM_VCPU_REC to user space community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Increment KVM_VCPU_MAX_FEATURES to expose the new capability to user space. 
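A hedged sketch of how a VMM consumes the new feature bit (KVM_ARM_VCPU_REC is the uapi feature added by this series; its numeric value and the finalize step come from the kernel headers, and vm_fd/vcpu_fd are assumed to already exist):

struct kvm_vcpu_init init;
int feature = KVM_ARM_VCPU_REC;

ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init);
init.features[0] |= 1U << KVM_ARM_VCPU_REC;
ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);

/* ... configure registers and realm parameters, then, once the realm
 * is otherwise set up, create the REC: */
ioctl(vcpu_fd, KVM_ARM_VCPU_FINALIZE, &feature);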
Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 698f551808b5..9afcb17e1d19 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -39,7 +39,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS -#define KVM_VCPU_MAX_FEATURES 7 +#define KVM_VCPU_MAX_FEATURES 10 #define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1) #define KVM_REQ_SLEEP \ -- Gitee From 78e3d2693211a282cd71cc844535067c242b830f Mon Sep 17 00:00:00 2001 From: Steven Price Date: Sat, 14 Jun 2025 16:05:06 +0800 Subject: [PATCH 81/84] KVM: arm64: Allow activating realms community inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBY08N Reference: https://lore.kernel.org/kvm/20250416134208.383984-1-steven.price@arm.com/T/ -------------------------------- Add the ioctl to activate a realm and set the static branch to enable access to the realm functionality if the RMM is detected. Signed-off-by: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Yiwei Zhuang Signed-off-by: Xu Raoqing --- arch/arm64/kvm/rme.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index ff2e57de6394..70687bb2e20f 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -1229,6 +1229,20 @@ static int kvm_init_ipa_range_realm(struct kvm *kvm, return realm_init_ipa_state(realm, addr, end); } +static int kvm_activate_realm(struct kvm *kvm) +{ + struct realm *realm = &kvm->arch.realm; + + if (kvm_realm_state(kvm) != REALM_STATE_NEW) + return -EINVAL; + + if (rmi_realm_activate(virt_to_phys(realm->rd))) + return -ENXIO; + + WRITE_ONCE(realm->state, REALM_STATE_ACTIVE); + return 0; +} + /* Protects access to rme_vmid_bitmap */ static DEFINE_SPINLOCK(rme_vmid_lock); static unsigned long *rme_vmid_bitmap; @@ -1378,6 +1392,9 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = kvm_populate_realm(kvm, &args); break; } + case KVM_CAP_ARM_RME_ACTIVATE_REALM: + r = kvm_activate_realm(kvm); + break; default: r = -EINVAL; break; @@ -1682,5 +1699,5 @@ void kvm_init_rme(void) if (rme_vmid_init()) return; - /* Future patch will enable static branch kvm_rme_is_available */ + static_branch_enable(&kvm_rme_is_available); } -- Gitee From ebbc49fd9f15aebc8dd883927355da2d17b5e015 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 20 Oct 2023 21:40:42 +0000 Subject: [PATCH 82/84] KVM: arm64: Select default PMU in KVM_ARM_VCPU_INIT handler mainline inclusion from mainline-v6.7-rc1 commit 427733579744ef22ee6d0da9907560d79d937458 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICWPIF Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=427733579744ef22ee6d0da9907560d79d937458 -------------------------------- Future changes to KVM's sysreg emulation will rely on having a valid PMU instance to determine the number of implemented counters (PMCR_EL0.N). This is earlier than when userspace is expected to modify the vPMU device attributes, where the default is selected today. Select the default PMU when handling KVM_ARM_VCPU_INIT such that it is available in time for sysreg emulation. 
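One observable consequence, sketched from userspace (hypothetical snippet; the encoding matches the KVM_REG_ARM_PMCR_EL0 definition earlier in this series): PMCR_EL0.N is already meaningful right after KVM_ARM_VCPU_INIT, before any PMU device attribute has been set.

uint64_t pmcr;
struct kvm_one_reg reg = {
        .id   = ARM64_SYS_REG(3, 3, 9, 12, 0),  /* PMCR_EL0 */
        .addr = (uint64_t)&pmcr,
};

if (!ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))
        printf("PMCR_EL0.N = %u\n", (unsigned int)(pmcr >> 11) & 0x1f);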
Reviewed-by: Sebastian Ott Co-developed-by: Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Reiji Watanabe Signed-off-by: Raghavendra Rao Ananta Link: https://lore.kernel.org/r/20231020214053.2144305-3-rananta@google.com [Oliver: rewrite changelog] Signed-off-by: Oliver Upton Signed-off-by: Xu Raoqing --- arch/arm64/kvm/arm.c | 19 +++++++++++++++++++ arch/arm64/kvm/pmu-emul.c | 16 ++++------------ include/kvm/arm_pmu.h | 5 +++++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index bb8929924af2..80e5be61a67c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1320,6 +1320,21 @@ static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); } +static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret = 0; + + /* + * When the vCPU has a PMU, but no PMU is set for the guest + * yet, set the default one. + */ + if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) + ret = kvm_arm_set_default_pmu(kvm); + + return ret; +} + static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { @@ -1335,6 +1350,10 @@ static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); + ret = kvm_setup_vcpu(vcpu); + if (ret) + goto out_unlock; + /* Now we know what it is, we can reset it. */ ret = kvm_reset_vcpu(vcpu); if (ret) { diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 419cd108bf91..90b7755c7b66 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -718,10 +718,9 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) * It is still necessary to get a valid cpu, though, to probe for the * default PMU instance as userspace is not required to specify a PMU * type. In order to uphold the preexisting behavior KVM selects the - * PMU instance for the core where the first call to the - * KVM_ARM_VCPU_PMU_V3_CTRL attribute group occurs. A dependent use case - * would be a user with disdain of all things big.LITTLE that affines - * the VMM to a particular cluster of cores. + * PMU instance for the core during vcpu init. A dependent use + * case would be a user with disdain of all things big.LITTLE that + * affines the VMM to a particular cluster of cores. * * In any case, userspace should just do the sane thing and use the UAPI * to select a PMU type directly. But, be wary of the baggage being @@ -895,7 +894,7 @@ static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) * where vCPUs can be scheduled on any core but the guest * counters could stop working. 
*/ -static int kvm_arm_set_default_pmu(struct kvm *kvm) +int kvm_arm_set_default_pmu(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); @@ -966,13 +965,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (vcpu->arch.pmu.created) return -EBUSY; - if (!kvm->arch.arm_pmu) { - int ret = kvm_arm_set_default_pmu(kvm); - - if (ret) - return ret; - } - switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index b675cc217cf7..34204f0a8701 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -103,6 +103,7 @@ void kvm_vcpu_pmu_resync_el0(void); }) u8 kvm_arm_pmu_get_pmuver_limit(void); +int kvm_arm_set_default_pmu(struct kvm *kvm); u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); @@ -179,6 +180,10 @@ static inline u8 kvm_arm_pmu_get_pmuver_limit(void) return 0; } static inline void kvm_vcpu_pmu_resync_el0(void) {} +static inline int kvm_arm_set_default_pmu(struct kvm *kvm) +{ + return -ENODEV; +} static inline u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { -- Gitee From 1fbdc359f6c6e246c9ab7029e1d23320cb49f221 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 9 Sep 2025 20:40:23 +0800 Subject: [PATCH 83/84] arm64: RME: Introduce kvm_rec_pre_enter() called before entering an atomic section community inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICX7FX?from=project-issue Reference: https://patchew.org/linux/20250820145606.180644-1-steven.price@arm.com/20250820145606.180644-17-steven.price@arm.com -------------------------------- Entering a realm is done using a SMC call to the RMM. On exit the exit-codes need to be handled slightly differently to the normal KVM path so define our own functions for realm enter/exit and hook them in if the guest is a realm guest. Fixes: a9b2e8a67446 ("[v8-16-43]arm64: RME: Handle realm enter/exit") Signed-off-by: Steven Price Reviewed-by: Gavin Shan Signed-off-by: Xu Raoqing --- arch/arm64/include/asm/kvm_rme.h | 1 + arch/arm64/kvm/arm.c | 3 +++ arch/arm64/kvm/rme-exit.c | 10 ++++++++-- arch/arm64/kvm/rme.c | 24 +++++++++++++++++++++--- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h index 98a83b1ed4f4..2761e7eb5f3d 100644 --- a/arch/arm64/include/asm/kvm_rme.h +++ b/arch/arm64/include/asm/kvm_rme.h @@ -108,6 +108,7 @@ int kvm_create_rec(struct kvm_vcpu *vcpu); void kvm_destroy_rec(struct kvm_vcpu *vcpu); int kvm_rec_enter(struct kvm_vcpu *vcpu); +int kvm_rec_pre_enter(struct kvm_vcpu *vcpu); int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_status); void kvm_realm_unmap_range(struct kvm *kvm, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 80e5be61a67c..20607591763f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -993,6 +993,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (ret > 0) ret = check_vcpu_requests(vcpu); + if (ret > 0 && vcpu_is_rec(vcpu)) + ret = kvm_rec_pre_enter(vcpu); + /* * Preparing the interrupts to be injected also * involves poking the GIC, which must be done in a diff --git a/arch/arm64/kvm/rme-exit.c b/arch/arm64/kvm/rme-exit.c index 68533ad7e057..76ae7ecc7bee 100644 --- a/arch/arm64/kvm/rme-exit.c +++ b/arch/arm64/kvm/rme-exit.c @@ -154,7 +154,7 @@ int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_ret) * the VCPU as a result of KVM's PSCI handling. 
*/ if (status == RMI_ERROR_REALM && index == 1) { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } @@ -163,7 +163,8 @@ int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_ret) vcpu->arch.fault.esr_el2 = rec->run->exit.esr; vcpu->arch.fault.far_el2 = rec->run->exit.far; - vcpu->arch.fault.hpfar_el2 = rec->run->exit.hpfar; + /* HPFAR_EL2 is only valid for RMI_EXIT_SYNC */ + vcpu->arch.fault.hpfar_el2 = 0; update_arch_timer_irq_lines(vcpu); @@ -172,6 +173,11 @@ int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_ret) switch (rec->run->exit.exit_reason) { case RMI_EXIT_SYNC: + /* + * HPFAR_EL2_NS is hijacked to indicate a valid HPFAR value, + * see __get_fault_info() + */ + vcpu->arch.fault.hpfar_el2 = rec->run->exit.hpfar; return rec_exit_handlers[esr_ec](vcpu); case RMI_EXIT_IRQ: case RMI_EXIT_FIQ: diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 70687bb2e20f..2d1fbd145626 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -1469,10 +1469,24 @@ static void kvm_complete_ripas_change(struct kvm_vcpu *vcpu) } while (top_ipa < top); } -int kvm_rec_enter(struct kvm_vcpu *vcpu) +/* + * kvm_rec_pre_enter - Complete operations before entering a REC + * + * Some operations require work to be completed before entering a realm. That + * work may require memory allocation so cannot be done in the kvm_rec_enter() + * call. + * + * Return: 1 if we should enter the guest + * 0 if we should exit to userspace + * < 0 if we should exit to userspace, where the return value indicates + * an error + */ +int kvm_rec_pre_enter(struct kvm_vcpu *vcpu) { struct realm_rec *rec = &vcpu->arch.rec; + if (kvm_realm_state(vcpu->kvm) != REALM_STATE_ACTIVE) + return -EINVAL; switch (rec->run->exit.exit_reason) { case RMI_EXIT_HOST_CALL: case RMI_EXIT_PSCI: @@ -1484,8 +1498,12 @@ int kvm_rec_enter(struct kvm_vcpu *vcpu) break; } - if (kvm_realm_state(vcpu->kvm) != REALM_STATE_ACTIVE) - return -EINVAL; + return 1; +} + +int kvm_rec_enter(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; return rmi_rec_enter(virt_to_phys(rec->rec_page), virt_to_phys(rec->run)); -- Gitee From f382147661171ed9dfa3a331cd058d67c5c7ea74 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Tue, 9 Sep 2025 21:12:18 +0800 Subject: [PATCH 84/84] arm64: RME: handle RIPAS changes before kvm_rec_enter community inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICX7FX?from=project-issue Reference: https://patchew.org/linux/20250820145606.180644-1-steven.price@arm.com/20250820145606.180644-16-steven.price@arm.com ------------------------ Each page within the protected region of the realm guest can be marked as either RAM or EMPTY. Allow the VMM to control this before the guest has started and provide the equivalent functions to change this (with the guest's approval) at runtime. When transitioning from RIPAS RAM (1) to RIPAS EMPTY (0) the memory is unmapped from the guest and undelegated allowing the memory to be reused by the host. When transitioning to RIPAS RAM the actual population of the leaf RTTs is done later on stage 2 fault, however it may be necessary to allocate additional RTTs to allow the RMM track the RIPAS for the requested range. When freeing a block mapping it is necessary to temporarily unfold the RTT which requires delegating an extra page to the RMM, this page can then be recovered once the contents of the block mapping have been freed. 
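To make the fold logic in the diff below easier to follow, here is a sketch of the granule arithmetic it relies on (an assumption-labelled illustration, not the kernel helper: it assumes 4KiB RMM granules and 512 entries per table, so a level-3 entry maps 4KiB, a level-2 entry 2MiB, a level-1 entry 1GiB). Folding at `level` collapses the block that a single entry at `level - 1` maps, which is why realm_rtt_fold() now aligns the address down to that block size first.

/* Sketch: bytes mapped by one RTT entry at the given level, assuming
 * RMM_PAGE_SHIFT == 12 and 9 index bits per table level. */
static unsigned long rtt_entry_mapsize(int level)
{
        return 1UL << (12 + 9 * (3 - level));
}

/* e.g. rtt_entry_mapsize(2) == 2MiB: folding a level-3 table back into
 * its level-2 parent needs the address aligned down to 2MiB. */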
Fixes: 4afc64441759 ("[v8-15-43]arm64: RME: Allow VMM to set RIPAS") Signed-off-by: Steven Price Signed-off-by: Xu Raoqing --- arch/arm64/kvm/rme.c | 132 ++++++++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 65 deletions(-) diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c index 2d1fbd145626..d29e4dd9abc5 100644 --- a/arch/arm64/kvm/rme.c +++ b/arch/arm64/kvm/rme.c @@ -213,6 +213,7 @@ static int realm_rtt_fold(struct realm *realm, unsigned long out_rtt; int ret; + addr = ALIGN_DOWN(addr, rme_rtt_level_mapsize(level - 1)); ret = rmi_rtt_fold(virt_to_phys(realm->rd), addr, level, &out_rtt); if (RMI_RETURN_STATUS(ret) == RMI_SUCCESS && rtt_granule) @@ -283,6 +284,61 @@ static int realm_unmap_private_page(struct realm *realm, return 0; } +/* + * Returns 0 on successful fold, a negative value on error, a positive value if + * we were not able to fold all tables at this level. + */ +static int realm_fold_rtt_level(struct realm *realm, int level, + unsigned long start, unsigned long end) +{ + int not_folded = 0; + ssize_t map_size; + unsigned long addr, next_addr; + + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return -EINVAL; + + map_size = rme_rtt_level_mapsize(level - 1); + + for (addr = start; addr < end; addr = next_addr) { + phys_addr_t rtt_granule; + int ret; + unsigned long align_addr = ALIGN(addr, map_size); + + next_addr = ALIGN(addr + 1, map_size); + + ret = realm_rtt_fold(realm, align_addr, level, &rtt_granule); + + switch (RMI_RETURN_STATUS(ret)) { + case RMI_SUCCESS: + free_delegated_granule(rtt_granule); + break; + case RMI_ERROR_RTT: + if (level == RMM_RTT_MAX_LEVEL || + RMI_RETURN_INDEX(ret) < level) { + not_folded++; + break; + } + /* Recurse a level deeper */ + ret = realm_fold_rtt_level(realm, + level + 1, + addr, + next_addr); + if (ret < 0) + return ret; + else if (ret == 0) + /* Try again at this level */ + next_addr = addr; + break; + default: + WARN_ON(1); + return -ENXIO; + } + } + + return not_folded; +} + static void realm_unmap_shared_range(struct kvm *kvm, int level, unsigned long start, @@ -339,6 +395,7 @@ static void realm_unmap_shared_range(struct kvm *kvm, cond_resched_rwlock_write(&kvm->mmu_lock); } + realm_fold_rtt_level(realm, get_start_level(realm) + 1, start, end); } static int realm_init_sve_param(struct kvm *kvm, struct realm_params *params) @@ -522,6 +579,7 @@ static int realm_create_rtt_levels(struct realm *realm, if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT && RMI_RETURN_INDEX(ret) == level - 1) { /* The RTT already exists, continue */ + free_delegated_granule(rtt); continue; } if (ret) { @@ -625,61 +683,6 @@ static int realm_tear_down_rtt_range(struct realm *realm, start, end); } -/* - * Returns 0 on successful fold, a negative value on error, a positive value if - * we were not able to fold all tables at this level. 
- */ -static int realm_fold_rtt_level(struct realm *realm, int level, - unsigned long start, unsigned long end) -{ - int not_folded = 0; - ssize_t map_size; - unsigned long addr, next_addr; - - if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) - return -EINVAL; - - map_size = rme_rtt_level_mapsize(level - 1); - - for (addr = start; addr < end; addr = next_addr) { - phys_addr_t rtt_granule; - int ret; - unsigned long align_addr = ALIGN(addr, map_size); - - next_addr = ALIGN(addr + 1, map_size); - - ret = realm_rtt_fold(realm, align_addr, level, &rtt_granule); - - switch (RMI_RETURN_STATUS(ret)) { - case RMI_SUCCESS: - free_delegated_granule(rtt_granule); - break; - case RMI_ERROR_RTT: - if (level == RMM_RTT_MAX_LEVEL || - RMI_RETURN_INDEX(ret) < level) { - not_folded++; - break; - } - /* Recurse a level deeper */ - ret = realm_fold_rtt_level(realm, - level + 1, - addr, - next_addr); - if (ret < 0) - return ret; - else if (ret == 0) - /* Try again at this level */ - next_addr = addr; - break; - default: - WARN_ON(1); - return -ENXIO; - } - } - - return not_folded; -} - void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits) { struct realm *realm = &kvm->arch.realm; @@ -1146,18 +1149,16 @@ static int realm_set_ipa_state(struct kvm_vcpu *vcpu, * If the RMM walk ended early then more tables are * needed to reach the required depth to set the RIPAS. */ - if (walk_level < level) { - ret = realm_create_rtt_levels(realm, ipa, + if (walk_level >= level) + return -EINVAL; + + ret = realm_create_rtt_levels(realm, ipa, walk_level, level, memcache); - /* Retry with RTTs created */ - if (!ret) - continue; - } else { - ret = -EINVAL; - } - + if (ret) + return ret; + /* Retry with the RTT levels in place */ break; } else { WARN(1, "Unexpected error in %s: %#x\n", __func__, @@ -1466,7 +1467,8 @@ static void kvm_complete_ripas_change(struct kvm_vcpu *vcpu) break; base = top_ipa; - } while (top_ipa < top); + } while (base < top); + rec->run->exit.ripas_base = base; } /* -- Gitee
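Taken together with the activation command added in PATCH 81/84, the end of the VMM-side realm lifecycle might look like the following sketch (hedged: it assumes, as in the upstream CCA proposals, that realm commands are issued via KVM_ENABLE_CAP on the VM fd with the sub-command in args[0]; the exact constants come from this series' uapi headers):

struct kvm_enable_cap cap = {
        .cap  = KVM_CAP_ARM_RME,
        .args = { KVM_CAP_ARM_RME_ACTIVATE_REALM },
};

/* After all initial memory is populated and every REC is finalized: */
if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
        perror("KVM_ENABLE_CAP(ACTIVATE_REALM)");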