diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index a66e4630d25aaf0b42df9b4d6158a7294f4407dc..7b78c14a6e5e657b70c47ce0fdbab3f2d10ea6ad 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -8345,4 +8345,5 @@ CONFIG_UB_UBASE=m CONFIG_UB_UMMU_CORE=y CONFIG_UB_UMMU_CORE_DRIVER=m CONFIG_UB_UMMU_PMU=m +CONFIG_UB_UMMU=m # end of UMMU diff --git a/drivers/iommu/hisilicon/Kconfig b/drivers/iommu/hisilicon/Kconfig index 76acd888b65e11f347cfcbf46012cba72568f683..60308ece229fe8e4f57a4d92298f4c6183bbb659 100644 --- a/drivers/iommu/hisilicon/Kconfig +++ b/drivers/iommu/hisilicon/Kconfig @@ -4,3 +4,20 @@ # source "drivers/iommu/hisilicon/ummu-core/Kconfig" + +config UB_UMMU + tristate "Hisilicon UB MMU Support" + depends on ARM64 && ARCH_HISI + depends on UB_UBUS && UB_UBFI && UB_UBRT_PLAT_DEV + default n + select IOMMU_API + select IOMMU_IO_PGTABLE_LPAE + select GENERIC_MSI_IRQ + select IOMMUFD_DRIVER if IOMMUFD + select UMMU_CORE + help + Support for implementations of the hisilicon UMMU architecture. + UMMU provides address translation for device access to the + local host. + Say Y here if your Soc includes an UMMU device implementing + the Hisilicon UMMU architecture. diff --git a/drivers/iommu/hisilicon/Makefile b/drivers/iommu/hisilicon/Makefile index e32879971f3fd702c4e0f85a6c8ce59fc69f3171..5064e5df09695c4da3b05f5761b4eb6afb63850e 100644 --- a/drivers/iommu/hisilicon/Makefile +++ b/drivers/iommu/hisilicon/Makefile @@ -1,3 +1,8 @@ # SPDX-License-Identifier: GPL-2.0+ obj-y += ummu-core/ +obj-$(CONFIG_UB_UMMU) += ummu.o +ummu-y := ummu_main.o \ + queue.o \ + interrupt.o \ + flush.o diff --git a/drivers/iommu/hisilicon/flush.c b/drivers/iommu/hisilicon/flush.c new file mode 100644 index 0000000000000000000000000000000000000000..464c93c8c2a3db874bc91e5a8af9a31dab6b1502 --- /dev/null +++ b/drivers/iommu/hisilicon/flush.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
+ */ + +#include +#include +#include +#include + +#include "ummu.h" +#include "queue.h" +#include "flush.h" + +enum ummu_tlbi_scene { + UMMU_TLBI_SCENE_DMA = 0, + UMMU_TLBI_SCENE_SVA, + UMMU_TLBI_SCENE_NUM, +}; + +enum ummu_tlbi_scope { + UMMU_TLBI_SCOPE_CTX = 0, + UMMU_TLBI_SCOPE_RNG, + UMMU_TLBI_SCOPE_NUM, +}; + +enum ummu_tlbi_type { + UMMU_TLBI_TYPE_S1E2H = 0, + UMMU_TLBI_TYPE_S1NH, + UMMU_TLBI_TYPE_S2, + UMMU_TLBI_TYPE_NUM, +}; + +const static +u8 ummu_tlbi_code_table[UMMU_TLBI_SCENE_NUM][UMMU_TLBI_SCOPE_NUM][UMMU_TLBI_TYPE_NUM] = { + [UMMU_TLBI_SCENE_DMA] = { + [UMMU_TLBI_SCOPE_CTX] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_TID, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_TID, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_S1S2_VMALL, + }, + [UMMU_TLBI_SCOPE_RNG] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_VA, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_TID, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_OS_VA, + }, + }, + [UMMU_TLBI_SCENE_SVA] = { + [UMMU_TLBI_SCOPE_CTX] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_ASID_U, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_ASID_U, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_OS_ASID_U, + }, + [UMMU_TLBI_SCOPE_RNG] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_VA_U, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_ASID_U, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_OS_VA_U, + }, + }, +}; + +static int ummu_domain_tlbi_cmd(struct ummu_domain *domain, + enum ummu_tlbi_scope scope, + enum ummu_tlbi_scene scene, + struct ummu_mcmdq_ent *cmd) +{ + struct ummu_s1_cfg *s1cfg = &domain->cfgs.s1_cfg; + struct ummu_s2_cfg *s2cfg = &domain->cfgs.s2_cfg; + struct ummu_device *ummu_dev; + enum ummu_tlbi_type type; + bool e2h; + + switch (domain->cfgs.stage) { + case UMMU_DOMAIN_S1: + if (scene == UMMU_TLBI_SCENE_DMA) { + if (domain->base_domain.tid == UMMU_INVALID_TID) + return -EINVAL; + + cmd->tlbi.tid = domain->base_domain.tid; + } else { + cmd->tlbi.asid = s1cfg->tct.asid; + } + + ummu_dev = core_to_ummu_device(domain->base_domain.core_dev); + e2h = !!(ummu_dev->cap.features & UMMU_FEAT_E2H); + type = e2h ? 
UMMU_TLBI_TYPE_S1E2H : UMMU_TLBI_TYPE_S1NH; + break; + case UMMU_DOMAIN_S2: + if (scene == UMMU_TLBI_SCENE_DMA) + cmd->tlbi.tect_tag = domain->cfgs.tecte_tag; + else + cmd->tlbi.vmid = s2cfg->vmid; + + type = UMMU_TLBI_TYPE_S2; + break; + default: + WARN(1, "get unexpected domain stage: %d", + (int)domain->cfgs.stage); + return -EINVAL; + } + + cmd->opcode = ummu_tlbi_code_table[scene][scope][type]; + return 0; +} + +static void ummu_range_tlbi_nofeat(struct ummu_device *ummu, + struct ummu_mcmdq_ent *cmd, + struct ummu_tlb_range *range) +{ + unsigned long rg_start = range->iova, rg_end = range->iova + range->size; + struct ummu_mcmdq_batch batch_cmds = {}; + + while (rg_start < rg_end) { + cmd->tlbi.addr = rg_start; + ummu_mcmdq_batch_add(ummu, &batch_cmds, cmd); + rg_start += range->granule; + } + ummu_mcmdq_batch_submit(ummu, &batch_cmds); +} + +/* `granule` is inv granule and `translation_granule` is the granule of page table */ +#define granule_to_lvl(granule, translation_granule) \ + (4 - (ilog2(granule) - 3) / ((translation_granule)-3)) +/* this function highly rely on pagetable format, follow arm implementation now */ +static void __ummu_tlbi_range(struct ummu_mcmdq_ent *cmd, + struct ummu_tlb_range *range, + struct ummu_domain *domain) +{ + struct ummu_device *ummu = core_to_ummu_device(domain->base_domain.core_dev); + unsigned long num_pages, gs, rg_start, rg_end, scale, num; + struct ummu_mcmdq_batch batch_cmds = {}; + size_t ranged; + + if (range->iova == ULONG_MAX || range->size == 0) + return; + + if (!(ummu->cap.features & UMMU_FEAT_RANGE_INV)) { + ummu_range_tlbi_nofeat(ummu, cmd, range); + return; + } + + rg_start = range->iova; + rg_end = rg_start + range->size; + /* tg will be 12, 14, 16, indicating 4K, 16K, 64K pgtable */ + gs = __ffs(domain->base_domain.domain.pgsize_bitmap); + num_pages = range->size >> gs; + + /* transfer 12,14,16 to 1,2,3, refer to the protocol */ + cmd->tlbi.gs = (gs - 10) >> 1; + cmd->tlbi.tl = granule_to_lvl(range->granule, gs); + + while (rg_start < rg_end) { + cmd->tlbi.addr = rg_start; + + scale = __ffs(num_pages); + cmd->tlbi.scale = scale; + + num = (num_pages >> scale) & CMD_TLBI_RANGE_NUM_MAX; + cmd->tlbi.num = num - 1; + + ummu_mcmdq_batch_add(ummu, &batch_cmds, cmd); + + ranged = num << (scale + gs); + num_pages -= num << scale; + rg_start += ranged; + } + ummu_mcmdq_batch_submit(ummu, &batch_cmds); +} + +static void ummu_tlbi_range(struct ummu_tlb_range *range, bool leaf, + struct ummu_domain *domain) +{ + struct ummu_mcmdq_ent cmd = {0}; + int err; + + err = ummu_domain_tlbi_cmd(domain, UMMU_TLBI_SCOPE_RNG, UMMU_TLBI_SCENE_DMA, &cmd); + if (err) + return; + + cmd.tlbi.leaf = leaf; + __ummu_tlbi_range(&cmd, range, domain); +} + +/* for io_pgtable */ +void ummu_tlbi_context(void *cookie) +{ + struct ummu_domain *domain = (struct ummu_domain *)cookie; + struct ummu_device *ummu = core_to_ummu_device( + domain->base_domain.core_dev); + struct ummu_mcmdq_ent cmd = {0}; + int err; + + err = ummu_domain_tlbi_cmd(domain, UMMU_TLBI_SCOPE_CTX, UMMU_TLBI_SCENE_DMA, &cmd); + if (err) + return; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); +} + +void ummu_tlbi_walk(unsigned long iova, size_t size, size_t granule, + void *cookie) +{ + struct ummu_domain *domain = (struct ummu_domain *)cookie; + struct ummu_tlb_range range = { + .iova = iova, + .size = size, + .granule = granule, + }; + + ummu_tlbi_range(&range, false, domain); +} + +void ummu_tlbi_page(struct iommu_iotlb_gather *gather, unsigned long iova, + size_t granule, void *cookie) +{ + 
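The (scale, num) chunking performed by __ummu_tlbi_range() above is easier to see in isolation. Below is a minimal userspace sketch of that loop, not part of the patch: RANGE_NUM_MAX, the helper name and the sample range are assumptions for illustration, and the real driver programs num - 1 into the 5-bit CMD_TLBI_0_NUM field.

```c
#include <stdint.h>
#include <stdio.h>

#define RANGE_NUM_MAX 31UL	/* assumed stand-in for CMD_TLBI_RANGE_NUM_MAX */

/* Emulate the chunking loop: each emitted command covers num blocks of
 * 2^scale pages, i.e. num << (scale + gs) bytes. gs is the page shift
 * (12/14/16); size is assumed page-aligned for the demo. */
static void emulate_range_tlbi(uint64_t iova, uint64_t size, unsigned int gs)
{
	uint64_t start = iova, end = iova + size;
	uint64_t num_pages = size >> gs;

	if (!num_pages)
		return;

	while (start < end) {
		unsigned int scale = __builtin_ctzll(num_pages);	/* __ffs() */
		uint64_t num = (num_pages >> scale) & RANGE_NUM_MAX;
		uint64_t bytes = num << (scale + gs);

		printf("TLBI addr=0x%llx scale=%u num=%llu (%llu bytes)\n",
		       (unsigned long long)start, scale,
		       (unsigned long long)num, (unsigned long long)bytes);

		num_pages -= num << scale;
		start += bytes;
	}
}

int main(void)
{
	emulate_range_tlbi(0x41000, 0x26000, 12);	/* 38 x 4K pages */
	return 0;
}
```

For the sample 0x26000-byte, 4K-mapped range this emits a single command with scale = 1 and num = 19, since 19 << (1 + 12) covers the whole range.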
struct ummu_domain *domain = (struct ummu_domain *)cookie; + + iommu_iotlb_gather_add_page(&domain->base_domain.domain, gather, iova, granule); +} + +void ummu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct ummu_domain *u_domain = to_ummu_domain(domain); + struct ummu_tlb_range range = { + .iova = gather->start, + .size = gather->end - gather->start + 1, + .granule = gather->pgsize, + }; + + ummu_tlbi_range(&range, true, u_domain); +} + +void ummu_non_agent_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct ummu_domain *u_domain = to_ummu_domain(domain); + struct ummu_tlb_range range = { + .iova = gather->start, + .size = gather->end - gather->start + 1, + .granule = gather->pgsize, + }; + + ummu_tlbi_range(&range, false, u_domain); +} + +void ummu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct ummu_domain *u_domain = to_ummu_domain(domain); + + ummu_tlbi_context(u_domain); +} + +void ummu_init_flush_iotlb(struct ummu_device *ummu) +{ + struct ummu_mcmdq_ent cmd; + + if (ummu->cap.features & UMMU_FEAT_HYP) { + cmd.opcode = CMD_TLBI_HYP_ALL; + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); + } + + cmd.opcode = CMD_TLBI_NS_OS_ALL; + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); +} + +void ummu_device_prefetch_cfg(struct ummu_device *ummu, u32 tecte_tag, + u32 tid) +{ + struct ummu_mcmdq_ent cmd_prefet = { + .opcode = CMD_PREFET_CFG, + .prefet = { + .tkv = (tid == UMMU_INVALID_TID) ? false : true, + .tid = tid, + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_prefet); +} + +void ummu_sync_tect_range(struct ummu_device *ummu, u32 tecte_tag, + u8 range) +{ + struct ummu_mcmdq_ent cmd_cfgi_tect_range = { + .opcode = CMD_CFGI_TECT_RANGE, + .cfgi = { + .range = range, + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tect_range); +} + +void ummu_sync_tect_all(struct ummu_device *ummu) +{ + ummu_sync_tect_range(ummu, 0, CMD_TLBI_RANGE_NUM_MAX); +} + +void ummu_device_sync_tect(struct ummu_device *ummu, u32 tecte_tag) +{ + struct ummu_mcmdq_ent cmd_cfgi_tect = { + .opcode = CMD_CFGI_TECT, + .cfgi = { + .leaf = true, + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tect); +} + +void ummu_sync_tct(struct ummu_device *ummu, u32 tecte_tag, u32 tid, + bool leaf) +{ + struct ummu_mcmdq_ent cmd_cfgi_tct = { + .opcode = CMD_CFGI_TCT, + .cfgi = { + .leaf = leaf, + .tid = tid, + .deid_0 = tecte_tag, + }, + }; + struct ummu_mcmdq_ent cmd_plbi_all = { + .opcode = CMD_PLBI_OS_EIDTID, + .plbi = { + .tid = tid, + .tecte_tag = tecte_tag, + }, + }; + + if (!ummu->cap.prod_ver) + ummu_mcmdq_issue_cmd(ummu, &cmd_plbi_all); + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tct); +} + +void ummu_sync_tct_all(struct ummu_device *ummu, u32 tecte_tag) +{ + struct ummu_mcmdq_ent cmd_cfgi_tct_all = { + .opcode = CMD_CFGI_TCT_ALL, + .cfgi = { + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tct_all); +} + +static u8 get_minist_log2size_range(size_t size) +{ + u8 index = 0; + + if (size > 0) + size -= 1; + + while (size > 0) { + size >>= 1; + index++; + } + + return index; +} + +int ummu_device_flush_plb(struct ummu_device *ummu, u32 tag, u32 tid, + u64 addr, size_t size) +{ + u32 plbi_num = (ummu->cap.options & UMMU_OPT_DOUBLE_PLBI) ? 
2 : 1; + struct ummu_mcmdq_ent cmd = { + .opcode = CMD_PLBI_OS_VA, + .plbi = { + .tid = tid, + .tecte_tag = (u16)tag, + .range = get_minist_log2size_range(size), + .addr = addr, + }, + }; + u32 idx; + int ret; + + for (idx = 0; idx < plbi_num; idx++) { + ret = ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); + if (ret) + dev_err(ummu->dev, + "issue plbi va cmd failed, idx = %u, ret = %d\n", idx, ret); + } + + return ret; +} + +void ummu_device_flush_plb_all(struct iommu_domain *domain) +{ + struct ummu_base_domain *base_domain = to_ummu_base_domain(domain); + struct ummu_device *ummu = core_to_ummu_device(base_domain->core_dev); + u32 plbi_num = (ummu->cap.options & UMMU_OPT_DOUBLE_PLBI) ? 2 : 1; + struct ummu_domain *u_domain = to_ummu_domain(domain); + struct ummu_mcmdq_ent cmd = { + .opcode = CMD_PLBI_OS_EIDTID, + .plbi = { + .tid = base_domain->tid, + .tecte_tag = u_domain->cfgs.tecte_tag, + }, + }; + u32 idx; + int ret; + + for (idx = 0; idx < plbi_num; idx++) { + ret = ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); + if (ret) + dev_err(ummu->dev, + "issue plbi tid cmd failed, idx = %u, ret = %d\n", idx, ret); + } +} + +int ummu_device_check_pa_continuity(struct ummu_device *ummu, u64 addr, + u32 size_order, u32 id) +{ + struct ummu_mcmdq_ent cmd_ent = { + .opcode = CMD_NULL_OP, + .null_op = { + .sub_op = SUB_CMD_NULL_CHECK_PA_CONTINUITY, + .check_pa_conti = { + .result = 0, + .flag = 0, + .size_order = size_order, + .id = id, + .addr = addr, + }, + }, + }; + + return ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_ent); +} diff --git a/drivers/iommu/hisilicon/flush.h b/drivers/iommu/hisilicon/flush.h new file mode 100644 index 0000000000000000000000000000000000000000..6799ec5595c953dab789bd29d51b153169fedcc6 --- /dev/null +++ b/drivers/iommu/hisilicon/flush.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
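get_minist_log2size_range() above reduces to ceil(log2(size)), i.e. the smallest order such that (1 << order) >= size, which ummu_device_flush_plb() then places in the CMD_PLBI_OS_VA range field. A standalone check of that equivalence, with a locally renamed copy of the helper (not part of the patch):

```c
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Local copy of get_minist_log2size_range() for the demo. */
static unsigned char log2size_range(size_t size)
{
	unsigned char index = 0;

	if (size > 0)
		size -= 1;
	while (size > 0) {
		size >>= 1;
		index++;
	}
	return index;
}

int main(void)
{
	assert(log2size_range(1) == 0);		/* 1 byte fits in 2^0   */
	assert(log2size_range(4096) == 12);	/* exactly one 4K page  */
	assert(log2size_range(4097) == 13);	/* rounds up            */
	assert(log2size_range(0x26000) == 18);
	printf("range order behaves as ceil(log2(size))\n");
	return 0;
}
```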
+ */ + +#ifndef __UMMU_FLUSH_H__ +#define __UMMU_FLUSH_H__ + +#include "ummu.h" + +struct ummu_tlb_range { + unsigned long iova; + size_t size; + size_t granule; +}; + +/* for ummu device reset */ +void ummu_init_flush_iotlb(struct ummu_device *ummu); + +/* for default_domain_ops */ +void ummu_flush_iotlb_all(struct iommu_domain *iommu_domain); +void ummu_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather); +void ummu_non_agent_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather); + +/* for io_pgtable */ +void ummu_tlbi_context(void *cookie); +void ummu_tlbi_walk(unsigned long iova, size_t size, size_t granule, + void *cookie); +void ummu_tlbi_page(struct iommu_iotlb_gather *gather, unsigned long iova, + size_t granule, void *cookie); + +void ummu_device_prefetch_cfg(struct ummu_device *ummu, u32 tecte_tag, + u32 tid); +void ummu_sync_tect_range(struct ummu_device *ummu, u32 tecte_tag, + u8 range); +void ummu_sync_tect_all(struct ummu_device *ummu); +void ummu_device_sync_tect(struct ummu_device *ummu, u32 tecte_tag); +void ummu_sync_tct(struct ummu_device *ummu, u32 tecte_tag, u32 tid, + bool leaf); +void ummu_sync_tct_all(struct ummu_device *ummu, u32 tecte_tag); +int ummu_device_flush_plb(struct ummu_device *ummu, u32 tag, u32 tid, + u64 addr, size_t size); +void ummu_device_flush_plb_all(struct iommu_domain *iommu_domain); +int ummu_device_check_pa_continuity(struct ummu_device *ummu, u64 addr, + u32 size_order, u32 id); +#endif /*__UMMU_FLUSH_H__*/ diff --git a/drivers/iommu/hisilicon/interrupt.c b/drivers/iommu/hisilicon/interrupt.c new file mode 100644 index 0000000000000000000000000000000000000000..7c2d8d3d942999560580e76a15c6a016b7e8cb4d --- /dev/null +++ b/drivers/iommu/hisilicon/interrupt.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: UMMU Interrupt Management + */ + +#define pr_fmt(fmt) "UMMU: " fmt +#include + +#include "ummu.h" +#include "queue.h" +#include "regs.h" +#include "interrupt.h" + +#define EVT_LOG_LIMIT_TIMEOUT 5000 + +enum ummu_evtq_evt_name { + EVT_UNKNOWN = 0x0, + /* unsupport translation type */ + EVT_UT, + /* dstEid overflow */ + EVT_BAD_DSTEID, + /* abort when visit tect, or addr overflow */ + EVT_TECT_FETCH, + /* TECT not valid, (V=0) */ + EVT_BAD_TECT, + /* tect ent lack tokenid */ + EVT_RESERVE_0 = 0x5, + /* reserved, no content */ + EVT_BAD_TOKENID, + /* + * 1. TECT.TCT_MAXNUM = 0, tokenid disable, + * 2. TECT.ST_MODE[0] = 0, stage 1 translation close. + * 3. tokenid > TECT.TCT_MAXNUM + * 4. 
lvl1 tct invalid in two-level tct + */ + EVT_TCT_FETCH, + /* invalid tct */ + EVT_BAD_TCT, + /* error when Address Table walk */ + EVT_A_PTW_EABT, + /* + * translation input bigger than max valid value, + * or no valid translation table descriptor + */ + EVT_A_TRANSLATION = 0xa, + /* address translation out put bigger than max valid value */ + EVT_A_ADDR_SIZE, + /* Access flag fault because of AF=0 */ + EVT_ACCESS, + /* address translation permission error */ + EVT_A_PERMISSION, + /* TLB or PLB conflicted in translation */ + EVT_TBU_CONFLICT, + /* config cache conflicted in translation */ + EVT_CFG_CONFLICT = 0xf, + /* error occurred when getting VMS */ + EVT_VMS_FETCH, + /* error when Permission Table walk */ + EVT_P_PTW_EABT, + /* abnormal software configuration in PTW */ + EVT_P_CFG_ERROR, + /* permission exception in PTW process */ + EVT_P_PERMISSION, + /* E-Bit verification failed */ + EVT_RESERVE_1 = 0x14, + /* reserved, no content */ + EVT_EBIT_DENY, + /* + * the UMMU hardware reports the execution result + * of the CMD_CREAT_DSTEID_TECT_RELATION command + * to the software. + */ + EVT_CREATE_DSTEID_TECT_RELATION_RESULT = 0x60, + /* + * the UMMU hardware reports the execution result + * of the CMD_DELETE_DSTEID_TECT_RELATION command + * to the software. + */ + EVT_DELETE_DSTEID_TECT_RELATION_RESULT, +}; + +static phys_addr_t ummu_msi_cfg[UMMU_MAX_MSIS][3] = { + [EVTQ_MSI_INDEX] = { + UMMU_EVENT_QUE_MSI_ADDR0, + UMMU_EVENT_QUE_MSI_DATA, + UMMU_EVENT_QUE_MSI_ATTR, + }, + [GERROR_MSI_INDEX] = { + UMMU_GLB_ERR_INT_MSI_ADDR0, + UMMU_GLB_ERR_INT_MSI_DATA, + UMMU_GLB_ERR_INT_MSI_ATTR, + }, +}; + +/* implementation is based on the ARM SMMU arm_smmu_cmdq_skip_err */ +static void ummu_device_mcmdq_skip_err(struct ummu_device *ummu, + struct ummu_queue *q) +{ + static const char * const cerror_str[] = { + [MCMDQ_CERROR_NONE_IDX] = "No error", + [MCMDQ_CERROR_ILL_IDX] = "Illegal command", + [MCMDQ_CERROR_ABT_IDX] = "Abort on command fetch", + }; + + u32 cons = readl_relaxed(q->cons_reg); + u32 rsn_idx = FIELD_GET(MCMDQ_CONS_ERR_REASON, cons); + struct ummu_mcmdq_ent cmd_sync = { + .opcode = CMD_SYNC, + }; + u64 cmd[MCMDQ_ENT_DWORDS]; + size_t i; + + dev_err_ratelimited(ummu->dev, "MCMDQ error (cons 0x%08x): %s\n", cons, + rsn_idx < ARRAY_SIZE(cerror_str) ? cerror_str[rsn_idx] : + "Unknown"); + + switch (rsn_idx) { + case MCMDQ_CERROR_ABT_IDX: + dev_err_ratelimited(ummu->dev, "retrying command fetch\n"); + return; + case MCMDQ_CERROR_NONE_IDX: + return; + case MCMDQ_CERROR_ILL_IDX: + break; + default: + break; + } + + /* + * We may have concurrent producers, so we need to be careful + * not to touch any of the shadow cmdq state. 
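For reference, this is how the error reason is pulled out of the MCMDQ CONS register in ummu_device_mcmdq_skip_err() above, redone as a standalone sketch. The bit positions follow MCMDQ_CONS_ERR_REASON (bits 26:24) from queue.h later in this patch; the sample register value is made up.

```c
#include <stdint.h>
#include <stdio.h>

#define MCMDQ_CONS_ERR_REASON_SHIFT 24
#define MCMDQ_CONS_ERR_REASON_MASK  (0x7u << MCMDQ_CONS_ERR_REASON_SHIFT)

static const char * const cerror_str[] = {
	"No error",			/* MCMDQ_CERROR_NONE_IDX */
	"Illegal command",		/* MCMDQ_CERROR_ILL_IDX  */
	"Abort on command fetch",	/* MCMDQ_CERROR_ABT_IDX  */
};

int main(void)
{
	uint32_t cons = 0x01800042;	/* arbitrary example register value */
	uint32_t reason = (cons & MCMDQ_CONS_ERR_REASON_MASK) >>
			  MCMDQ_CONS_ERR_REASON_SHIFT;

	printf("MCMDQ error (cons 0x%08x): %s\n", cons,
	       reason < sizeof(cerror_str) / sizeof(cerror_str[0]) ?
	       cerror_str[reason] : "Unknown");
	return 0;
}
```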
+ */ + ummu_queue_read(cmd, Q_ENT(q, cons), q->ent_dwords); + dev_err_ratelimited(ummu->dev, "skipping command in error state:\n"); + for (i = 0; i < ARRAY_SIZE(cmd); ++i) + dev_err_ratelimited(ummu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]); + + /* Convert the erroneous command into a CMD_SYNC */ + if (ummu_mcmdq_build_cmd(ummu, cmd, &cmd_sync)) { + dev_err_ratelimited(ummu->dev, "failed to convert to CMD_SYNC\n"); + return; + } + + ummu_queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); +} + +static void ummu_mcmdq_skip_err(struct ummu_device *ummu) +{ + struct ummu_mcmdq *mcmdq; + unsigned long flags; + u32 prod, cons; + u32 i; + + for (i = 0; i < ummu->nr_mcmdq; i++) { + mcmdq = *per_cpu_ptr(ummu->mcmdq, i); + prod = readl_relaxed(mcmdq->q.prod_reg); + cons = readl_relaxed(mcmdq->q.cons_reg); + if (((prod ^ cons) & MCMDQ_CONS_ERR) == 0) + continue; + + ummu_device_mcmdq_skip_err(ummu, &mcmdq->q); + + write_lock_irqsave(&mcmdq->mcmdq_lock, flags); + mcmdq->mcmdq_prod &= ~MCMDQ_PROD_ERRACK; + mcmdq->mcmdq_prod |= cons & MCMDQ_CONS_ERR; + + prod = readl_relaxed(mcmdq->q.prod_reg); + prod &= ~MCMDQ_PROD_ERRACK; + prod |= cons & MCMDQ_CONS_ERR; + writel(prod, mcmdq->q.prod_reg); + write_unlock_irqrestore(&mcmdq->mcmdq_lock, flags); + } +} + +static irqreturn_t ummu_gerror_handler(int irq, void *dev) +{ + struct ummu_device *ummu = (struct ummu_device *)dev; + u32 gerror, gerrorn, active; + + gerror = readl_relaxed(ummu->base + UMMU_GERROR); + gerrorn = readl_relaxed(ummu->base + UMMU_GERRORN); + + active = gerror ^ gerrorn; + if (!(active & GERROR_ERR_MASK)) + return IRQ_NONE; /* No errors pending */ + + dev_err_ratelimited( + ummu->dev, + "unexpected global error reported (0x%08x), this could be serious\n", + active); + + if (active & GERROR_MSI_GERR_ABT_ERR) + dev_err_ratelimited(ummu->dev, "GERROR MSI write aborted\n"); + + if (active & GERROR_MSI_UIEQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, "UIEQ MSI sync cmdq write aborted\n"); + + if (active & GERROR_MSI_EVTQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, "EVTQ MSI write aborted\n"); + + if (active & GERROR_MSI_MCMDQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, "CMDQ MSI write aborted\n"); + + if (active & GERROR_EVTQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, + "EVTQ write aborted -- events may have been lost\n"); + + if (active & GERROR_MCMDQ_ERR) + ummu_mcmdq_skip_err(ummu); + + writel(gerror, ummu->base + UMMU_GERRORN); + return IRQ_HANDLED; +} + +static void ummu_print_event(struct ummu_device *ummu, u8 code, u64 *evt) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + static u8 last_evt_code; + static u64 timeout; + int i; + + if (!__ratelimit(&rs)) + return; + + if (last_evt_code == code && time_is_after_jiffies64(timeout)) + return; + + last_evt_code = code; + timeout = get_jiffies_64() + msecs_to_jiffies(EVT_LOG_LIMIT_TIMEOUT); + dev_info(ummu->dev, "event 0x%02x received:\n", code); + for (i = 0; i < EVTQ_ENT_DWORDS; ++i) + dev_info(ummu->dev, "\t0x%016llx\n", (u64)evt[i]); +} + +/* implementation is based on the ARM SMMU arm_smmu_evtq_thread */ +static irqreturn_t ummu_evtq_thread(int irq, void *dev) +{ + struct ummu_device *ummu = (struct ummu_device *)dev; + struct ummu_queue *q = &ummu->evtq.q; + struct ummu_ll_queue *llq = &q->llq; + u64 evt[EVTQ_ENT_DWORDS]; + u32 tid; + u8 code; + + do { + while (!ummu_queue_remove_raw(q, evt)) { + code = FIELD_GET(EVTQ_ENT0_CODE, evt[0]); + tid = FIELD_GET(EVTQ_ENT0_TID, evt[0]); + + ummu_print_event(ummu, code, evt); + + cond_resched(); + 
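ummu_gerror_handler() above treats GERROR/GERRORN as a toggle handshake: new errors flip bits in GERROR, the handler acknowledges by writing the GERROR snapshot back to GERRORN, and the XOR of the two registers is the set of unacknowledged errors. A small userspace model of that protocol (demo bit positions only, not the real register layout):

```c
#include <stdint.h>
#include <stdio.h>

#define DEMO_EVTQ_ABT	(1u << 0)	/* invented bits for the demo */
#define DEMO_MCMDQ_ERR	(1u << 1)

static uint32_t gerror, gerrorn;	/* stand-ins for the two registers */

static void hw_raise(uint32_t bit)	{ gerror ^= bit; }		/* device side  */
static uint32_t sw_pending(void)	{ return gerror ^ gerrorn; }	/* handler side */
static void sw_ack(void)		{ gerrorn = gerror; }

int main(void)
{
	hw_raise(DEMO_MCMDQ_ERR);
	printf("pending: 0x%x\n", sw_pending());		/* 0x2 */
	sw_ack();
	printf("pending after ack: 0x%x\n", sw_pending());	/* 0x0 */
	hw_raise(DEMO_MCMDQ_ERR);	/* same error class reported again */
	printf("pending again: 0x%x\n", sw_pending());		/* 0x2 */
	return 0;
}
```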
} + + if (ummu_queue_sync_prod_in(q) == -EOVERFLOW) + dev_err(ummu->dev, + "EVTQ overflow detected -- events lost\n"); + } while (!ummu_queue_empty(llq)); + + if (likely(Q_OVF(llq->prod) == Q_OVF(llq->cons))) + goto handled; + + /* Sync overflow flag */ + llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | + Q_IDX(llq, llq->cons); + __iomb(); + writel_relaxed(q->llq.cons, q->cons_reg); +handled: + return IRQ_HANDLED; +} + +static void ummu_free_msis(void *data) +{ + struct device *dev = (struct device *)data; + + platform_msi_domain_free_irqs(dev); +} + +static void ummu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg) +{ + struct device *dev = msi_desc_to_dev(desc); + struct ummu_device *ummu = dev_get_drvdata(dev); + phys_addr_t msi_addr; + phys_addr_t *cfg; + + if (desc->msi_index > GERROR_MSI_INDEX) + return; + + cfg = ummu_msi_cfg[desc->msi_index]; + /* 32 bit addresses are converted to 64 bit addresses. */ + msi_addr = (((u64)msg->address_hi) << 32) | msg->address_lo; + msi_addr &= UMMU_MSI_ADDR_MASK; + + writeq_relaxed(msi_addr, ummu->base + cfg[0]); + writel_relaxed(msg->data, ummu->base + cfg[1]); + writel_relaxed(UMMU_MEMATTR_DEVICE_nGnRE, ummu->base + cfg[2]); +} + +static int ummu_device_setup_msis(struct ummu_device *ummu) +{ + struct device *dev = ummu->dev; + int ret; + + if (!(ummu->cap.features & UMMU_FEAT_MSI)) + return -EOPNOTSUPP; + + if (!dev->msi.domain) + return -ENODEV; + + /* Clear the MSI address regs */ + writeq_relaxed(0, ummu->base + UMMU_EVENT_QUE_MSI_ADDR0); + writeq_relaxed(0, ummu->base + UMMU_GLB_ERR_INT_MSI_ADDR0); + + /* Allocate MSIs for evtq, gerror */ + ret = platform_msi_domain_alloc_irqs(dev, UMMU_MAX_MSIS, ummu_write_msi_msg); + if (ret) { + dev_err(dev, "failed to allocate MSIs. ret = %d\n", ret); + return ret; + } + + /* Add callback to free MSIs on teardown */ + ret = devm_add_action_or_reset(dev, ummu_free_msis, dev); + if (ret) + dev_err(dev, "failed to add free msis action ret = %d.\n", ret); + + return ret; +} + +static inline void ummu_disable_irqs(struct ummu_device *ummu) +{ + writel_relaxed(0, ummu->base + UMMU_GLB_IRQ_EN); +} + +static inline void ummu_enable_irqs(struct ummu_device *ummu) +{ + u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN; + + writel_relaxed(irqen_flags, ummu->base + UMMU_GLB_IRQ_EN); +} + +static inline void ummu_init_evtq_irq(struct ummu_device *ummu, int irq) +{ + int ret = devm_request_threaded_irq(ummu->dev, irq, NULL, + ummu_evtq_thread, IRQF_ONESHOT, + "ummu-evtq", ummu); + if (ret < 0) + dev_warn(ummu->dev, "failed to enable evtq irq\n"); +} + +static inline void ummu_init_gerr_irq(struct ummu_device *ummu, int irq) +{ + int ret = devm_request_irq(ummu->dev, irq, ummu_gerror_handler, 0, + "ummu-gerror", ummu); + if (ret < 0) + dev_warn(ummu->dev, "failed to enable gerror irq\n"); +} + +void ummu_setup_irqs(struct ummu_device *ummu) +{ + u32 evtq_irq, gerr_irq; + int ret; + + ummu_disable_irqs(ummu); + + ret = ummu_device_setup_msis(ummu); + if (ret) { + dev_err(ummu->dev, "failed to setup msis. 
ret = %d\n", ret); + return; + } + + evtq_irq = msi_get_virq(ummu->dev, EVTQ_MSI_INDEX); + if (evtq_irq) + ummu_init_evtq_irq(ummu, evtq_irq); + else + dev_warn(ummu->dev, + "no evtq irq - events will not be reported!\n"); + + gerr_irq = msi_get_virq(ummu->dev, GERROR_MSI_INDEX); + if (gerr_irq) + ummu_init_gerr_irq(ummu, gerr_irq); + else + dev_warn(ummu->dev, + "no gerr irq - errors will not be reported!\n"); + + ummu_enable_irqs(ummu); +} diff --git a/drivers/iommu/hisilicon/interrupt.h b/drivers/iommu/hisilicon/interrupt.h new file mode 100644 index 0000000000000000000000000000000000000000..fe25b6ab11acb4afcd9d820d2d0741e53030facd --- /dev/null +++ b/drivers/iommu/hisilicon/interrupt.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: UMMU Interrupt interface + */ + +#ifndef __UMMU_INTERRUPT_H__ +#define __UMMU_INTERRUPT_H__ + +#include "ummu.h" + +void ummu_setup_irqs(struct ummu_device *ummu); + +#endif /* __UMMU_INTERRUPT_H__ */ diff --git a/drivers/iommu/hisilicon/queue.c b/drivers/iommu/hisilicon/queue.c new file mode 100644 index 0000000000000000000000000000000000000000..9755d64969dd67d842445a5a051d8f2469783a8f --- /dev/null +++ b/drivers/iommu/hisilicon/queue.c @@ -0,0 +1,1215 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * + * Description: UMMU Queue Resource Management. Somewhat based on arm-smmu-v3.c + * Copyright (C) 2025 ARM Limited + */ + +#define pr_fmt(fmt) "UMMU: " fmt + +#include +#include +#include + +#include "ummu.h" +#include "regs.h" +#include "queue.h" + +#define ENTRY_DWORDS_TO_SIZE(dwords) ((dwords) << 3) + +struct ummu_queue_poll { + ktime_t timeout; + u32 delay; + u32 spin_cnt; + bool wfe; +}; + +/* Low-level queue manipulation functions */ +static bool ummu_queue_has_space(struct ummu_ll_queue *q, u32 n) +{ + u32 space, prod, cons; + + prod = Q_IDX(q, q->prod); + cons = Q_IDX(q, q->cons); + if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons)) + space = (1UL << q->log2size) - (prod - cons); + else + space = cons - prod; + + return space >= n; +} + +static bool ummu_queue_full(struct ummu_ll_queue *q) +{ + return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && + Q_WRP(q, q->prod) != Q_WRP(q, q->cons); +} + +bool ummu_queue_empty(struct ummu_ll_queue *q) +{ + return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && + Q_WRP(q, q->prod) == Q_WRP(q, q->cons); +} + +static bool ummu_queue_consumed(struct ummu_ll_queue *q, u32 prod) +{ + return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) > Q_IDX(q, prod))) || + ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) <= Q_IDX(q, prod))); +} + +static void ummu_queue_sync_cons_out(struct ummu_queue *q) +{ + /* + * Ensure that all CPU accesses (reads and writes) to the queue + * are complete before we update the cons pointer. + */ + __iomb(); + writel_relaxed(q->llq.cons, q->cons_reg); +} + +static void ummu_queue_inc_cons(struct ummu_ll_queue *q) +{ + u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1; + + q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons); +} + +int ummu_queue_sync_prod_in(struct ummu_queue *q) +{ + u32 prod; + int ret = 0; + + prod = readl(q->prod_reg); + /* + * We can't use the variable _relaxed() here because we have to prevent + * speculative read of the queue before we determine The prod has moved. 
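The prod/cons helpers above (ummu_queue_has_space(), ummu_queue_full(), ummu_queue_empty()) all rely on the same encoding: the low log2size bits of prod/cons are the ring index and the next bit is a wrap flag, which is what lets a full ring be distinguished from an empty one. A standalone sketch of that arithmetic with an assumed 8-entry ring:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LOG2SIZE 3u	/* 8 entries, demo value only */
#define Q_IDX(p) ((p) & ((1u << LOG2SIZE) - 1))
#define Q_WRP(p) ((p) & (1u << LOG2SIZE))

static bool queue_empty(uint32_t prod, uint32_t cons)
{
	return Q_IDX(prod) == Q_IDX(cons) && Q_WRP(prod) == Q_WRP(cons);
}

static bool queue_full(uint32_t prod, uint32_t cons)
{
	return Q_IDX(prod) == Q_IDX(cons) && Q_WRP(prod) != Q_WRP(cons);
}

static uint32_t queue_space(uint32_t prod, uint32_t cons)
{
	if (Q_WRP(prod) == Q_WRP(cons))
		return (1u << LOG2SIZE) - (Q_IDX(prod) - Q_IDX(cons));
	return Q_IDX(cons) - Q_IDX(prod);
}

int main(void)
{
	uint32_t prod = 0, cons = 0;

	printf("empty=%d space=%u\n", queue_empty(prod, cons), queue_space(prod, cons));
	prod = 8;	/* produced 8 entries: index back to 0, wrap bit set */
	printf("full=%d space=%u\n", queue_full(prod, cons), queue_space(prod, cons));
	cons = 3;	/* consumer drained 3 entries */
	printf("space=%u\n", queue_space(prod, cons));
	return 0;
}
```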
+ */ + if (Q_OVF(prod) != Q_OVF(q->llq.prod)) + ret = -EOVERFLOW; + + q->llq.prod = prod; + + return ret; +} + +static u32 ummu_queue_inc_prod_n(struct ummu_ll_queue *q, int n) +{ + u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n; + + return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); +} + +static void ummu_queue_poll_init(struct ummu_device *ummu, + struct ummu_queue_poll *qp) +{ + qp->delay = 1; + qp->spin_cnt = 0; + qp->wfe = !!(ummu->cap.features & UMMU_FEAT_SEV); + qp->timeout = ktime_add_us(ktime_get(), UMMU_QUE_POLL_TIMEOUT_US); +} + +static int ummu_queue_poll(struct ummu_queue_poll *qp) +{ + if (ktime_compare(ktime_get(), qp->timeout) > 0) + return -ETIMEDOUT; + + if (qp->wfe) { + wfe(); + } else if (++qp->spin_cnt < UMMU_POLL_SPIN_COUNT) { + cpu_relax(); + } else { + udelay(qp->delay); + qp->delay *= 2; /* multiply the delay by 2 */ + qp->spin_cnt = 0; + } + + return 0; +} + +void ummu_queue_write(__le64 *dst, u64 *src, size_t n_dwords) +{ + size_t i; + + for (i = 0; i < n_dwords; ++i) + *dst++ = cpu_to_le64(*src++); +} + +void ummu_queue_read(u64 *dst, __le64 *src, size_t n_dwords) +{ + size_t i; + + for (i = 0; i < n_dwords; ++i) + *dst++ = le64_to_cpu(*src++); +} + +int ummu_queue_remove_raw(struct ummu_queue *queue, u64 *ent) +{ + if (ummu_queue_empty(&queue->llq)) + return -EAGAIN; + + ummu_queue_read(ent, Q_ENT(queue, queue->llq.cons), queue->ent_dwords); + ummu_queue_inc_cons(&queue->llq); + ummu_queue_sync_cons_out(queue); + + return 0; +} + +static int ummu_common_init_queue(struct ummu_device *ummu, + struct ummu_queue *q, size_t dwords) +{ + size_t qsz; + + q->base = NULL; + do { + qsz = ENTRY_DWORDS_TO_SIZE((1 << q->llq.log2size) * dwords); + if (get_order(qsz) <= MAX_ORDER) + q->base = (__le64 *)devm_get_free_pages(ummu->dev, GFP_KERNEL, + get_order(qsz)); + + q->llq.log2size--; + } while (!q->base && qsz > PAGE_SIZE); + + /* confirm right log2size after the loop */ + q->llq.log2size++; + + if (q->base) { + q->base_pa = virt_to_phys(q->base); + } else { + dev_err(ummu->dev, + "failed to allocate queue (0x%zx bytes)\n", qsz); + return -ENOMEM; + } + + q->ent_dwords = dwords; + q->q_base = Q_BASE_RWA; + q->q_base |= q->base_pa & Q_BASE_ADDR_MASK; + q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.log2size); + q->llq.prod = q->llq.cons = 0; + + return 0; +} + +static int ummu_mcmdq_allocate(struct ummu_device *ummu) +{ + struct ummu_mcmdq __percpu *mcmdqs; + struct ummu_mcmdq *mcmdq; + u32 cpu, host_cpu; + + mcmdqs = devm_alloc_percpu(ummu->dev, *mcmdq); + if (!mcmdqs) + return -ENOMEM; + + /* A core requires at most one ECMDQ */ + if (num_possible_cpus() < ummu->nr_mcmdq) + ummu->nr_mcmdq = num_possible_cpus(); + + for_each_possible_cpu(cpu) { + if (cpu < ummu->nr_mcmdq) { + mcmdq = per_cpu_ptr(mcmdqs, cpu); + mcmdq->configured = 0; + } else { + host_cpu = cpu % ummu->nr_mcmdq; + mcmdq = per_cpu_ptr(mcmdqs, host_cpu); + mcmdq->shared = 1; + } + *per_cpu_ptr(ummu->mcmdq, cpu) = mcmdq; + } + + return 0; +} + +static int ummu_mcmdq_cfg_para(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq) +{ + atomic_long_t *bitmap; + + mcmdq->mcmdq_prod = MCMDQ_PROD_EN; + + atomic_set(&mcmdq->owner_prod, 0); + rwlock_init(&mcmdq->mcmdq_lock); + + bitmap = (atomic_long_t *)devm_bitmap_zalloc( + ummu->dev, 1UL << mcmdq->q.llq.log2size, GFP_KERNEL); + if (!bitmap) { + dev_err(ummu->dev, "failed to zalloc mcmdq bitmap\n"); + return -ENOMEM; + } + mcmdq->valid_map = bitmap; + + return 0; +} + +static int ummu_mcmdq_init(struct ummu_device *ummu) +{ + struct ummu_mcmdq *mcmdq; 
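ummu_mcmdq_allocate() above gives each of the first nr_mcmdq CPUs its own command queue and maps the remaining CPUs onto existing queues with cpu % nr_mcmdq, flagging those queues as shared (the shared flag later selects the ownership-based submission path). A toy rendering of that mapping with arbitrary CPU/queue counts:

```c
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	const unsigned int nr_cpus = 8, nr_mcmdq = 3;	/* demo values */
	bool shared[3] = { false, false, false };
	unsigned int cpu, q;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		q = (cpu < nr_mcmdq) ? cpu : cpu % nr_mcmdq;
		if (cpu >= nr_mcmdq)
			shared[q] = true;	/* queue now serves >1 CPU */
		printf("cpu%u -> mcmdq%u\n", cpu, q);
	}
	for (q = 0; q < nr_mcmdq; q++)
		printf("mcmdq%u shared=%d\n", q, shared[q]);
	return 0;
}
```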
+ u32 shift; + u64 base_addr = 0; + int cpu, ret; + + ummu->nr_mcmdq = 1UL << ummu->cap.mcmdq_log2num; + ummu->nr_mcmdq -= 1; + shift = order_base_2(num_possible_cpus() / ummu->nr_mcmdq); + + ummu->mcmdq = devm_alloc_percpu(ummu->dev, struct ummu_mcmdq *); + if (!ummu->mcmdq) { + dev_err(ummu->dev, "alloc mcmdq ptr failed\n"); + goto err; + } + + ret = ummu_mcmdq_allocate(ummu); + if (ret) { + dev_err(ummu->dev, "mcmdq allocate failed\n"); + goto err; + } + + for_each_possible_cpu(cpu) { + mcmdq = *per_cpu_ptr(ummu->mcmdq, cpu); + /* prevent repeated init when it is shared to multiple CPUs. */ + if (!mcmdq || mcmdq->mcmdq_prod == MCMDQ_PROD_EN) + continue; + + mcmdq->q.llq.log2size = MCMDQ_MAX_SZ_SHIFT + shift; + mcmdq->base = ummu->base + UMMU_MCMDQ_OFFSET + base_addr; + mcmdq->q.prod_reg = (u32 *)(mcmdq->base + MCMDQ_PROD_OFFSET); + mcmdq->q.cons_reg = (u32 *)(mcmdq->base + MCMDQ_CONS_OFFSET); + ret = ummu_common_init_queue(ummu, &mcmdq->q, MCMDQ_ENT_DWORDS); + if (ret) + goto err; + ret = ummu_mcmdq_cfg_para(ummu, mcmdq); + if (ret) + goto err; + + base_addr += MCMDQ_ENT_SIZE; + } + + return 0; +err: + ummu->nr_mcmdq = 0; + return -ENOMEM; +} + +static int ummu_write_mcmdq_regs(struct ummu_device *ummu) +{ + struct ummu_mcmdq *mcmdq; + struct ummu_queue *q; + int i = 0, ret; + u32 cpu; + u32 reg; + + if (unlikely(!ummu->nr_mcmdq)) { + dev_err(ummu->dev, "have not mcmdq resource.\n"); + return -EINVAL; + } + + for_each_possible_cpu(cpu) { + mcmdq = *per_cpu_ptr(ummu->mcmdq, cpu); + if (mcmdq->configured == 1) + continue; + + q = &mcmdq->q; + i++; + if (WARN_ON(q->llq.prod != q->llq.cons)) { + q->llq.prod = 0; + q->llq.cons = 0; + } + /* + * In kdump kernel, the mcmdq should be turned off first to + * prevent "CMD_SYNC timeout" problem. + */ + reg = readl(q->prod_reg); + if (reg & MCMDQ_PROD_EN) { + writel(reg & ~MCMDQ_PROD_EN, q->prod_reg); + ret = readl_relaxed_poll_timeout(q->cons_reg, reg, + !(reg & MCMDQ_EN_RESP), 1, + UMMU_CONS_POLL_TIMEOUT_US); + if (ret) { + dev_warn(ummu->dev, + "mcmdq[%d] disable failed\n", i); + mcmdq->configured = 0; + return ret; + } + } + + /* close mcmdq_base write protection */ + writel_relaxed(q->llq.prod, q->prod_reg); + writel_relaxed(q->llq.cons, q->cons_reg); + writeq_relaxed(q->q_base, mcmdq->base); + + /* enable mcmdq and open write protection */ + writel_relaxed(MCMDQ_PROD_EN | q->llq.prod, mcmdq->q.prod_reg); + ret = readl_relaxed_poll_timeout(mcmdq->q.cons_reg, reg, + reg & MCMDQ_EN_RESP, 1, + UMMU_CONS_POLL_TIMEOUT_US); + if (ret) { + dev_err(ummu->dev, "prod_reg write timeout ret = %d.\n", + ret); + return ret; + } + mcmdq->configured = 1; + } + + return 0; +} + +int ummu_device_mcmdq_init_cfg(struct ummu_device *ummu) +{ + return ummu_write_mcmdq_regs(ummu); +} + +int ummu_write_evtq_regs(struct ummu_device *ummu) +{ + u32 cr0 = readl_relaxed(ummu->base + UMMU_CR0); + struct ummu_queue *q = &ummu->evtq.q; + + /* evtq disabled in function ummu_device_disable */ + writeq_relaxed(q->q_base, ummu->base + UMMU_EVTQ_OFFSET); + + writel_relaxed(q->llq.prod, ummu->base + UMMU_EVTQ_PROD_OFFSET); + writel_relaxed(q->llq.cons, ummu->base + UMMU_EVTQ_CONS_OFFSET); + + cr0 |= CR0_EVENTQ_EN; + + return ummu_write_reg_sync(ummu, cr0, UMMU_CR0, UMMU_CR0ACK); +} + +static int ummu_evtq_init(struct ummu_device *ummu) +{ + struct ummu_queue *q = &ummu->evtq.q; + + q->llq.log2size = min(EVTQ_MAX_SZ_SHIFT, ummu->cap.evtq_log2size); + q->prod_reg = (u32 *)(ummu->base + UMMU_EVTQ_PROD_OFFSET); + q->cons_reg = (u32 *)(ummu->base + UMMU_EVTQ_CONS_OFFSET); + return 
ummu_common_init_queue(ummu, q, EVTQ_ENT_DWORDS); +} + +int ummu_init_queues(struct ummu_device *ummu) +{ + if (!(ummu->cap.features & UMMU_FEAT_MCMDQ) || + !(ummu->cap.features & UMMU_FEAT_EVENTQ)) + return -EOPNOTSUPP; + + if (ummu_mcmdq_init(ummu)) + return -ENOMEM; + + if (ummu_evtq_init(ummu)) + return -ENOMEM; + + return 0; +} + +#define ummu_mcmdq_exclusive_trylock_irqsave(mcmdq, flags) \ + ({ \ + bool __ret; \ + local_irq_save(flags); \ + __ret = !atomic_cmpxchg_relaxed(&(mcmdq)->lock, 0, INT_MIN); \ + if (!__ret) \ + local_irq_restore(flags); \ + __ret; \ + }) + +#define ummu_mcmdq_exclusive_unlock_irqrestore(mcmdq, flags) \ + ({ \ + atomic_set_release(&(mcmdq)->lock, 0); \ + local_irq_restore(flags); \ + }) + +/* Wait for the command queue to become non-full */ +static int ummu_mcmdq_poll_until_not_full(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + struct ummu_queue_poll qp; + unsigned long flags; + int ret = 0; + + /* + * Try to update our copy of cons by grabbing exclusive mcmdq access. If + * that fails, spin until somebody else updates it for us. + */ + if (ummu_mcmdq_exclusive_trylock_irqsave(mcmdq, flags)) { + WRITE_ONCE(mcmdq->q.llq.cons, readl_relaxed(mcmdq->q.cons_reg)); + ummu_mcmdq_exclusive_unlock_irqrestore(mcmdq, flags); + llq->val = READ_ONCE(mcmdq->q.llq.val); + return 0; + } + + ummu_queue_poll_init(ummu, &qp); + do { + llq->val = READ_ONCE(mcmdq->q.llq.val); + if (!ummu_queue_full(llq)) + break; + + ret = ummu_queue_poll(&qp); + } while (!ret); + + return ret; +} + +/* + * The command queue is locked. + * This is a private form of rwlock with the following main variations: + * + * - The UNLOCK routine is supplemented by shared_tryunlock(), where + * If the caller appears to be the last lock holder (yes, this is + * All successful UNLOCK routines have RELEASE semantics. + * + * - The only LOCK routines are exclusive_trylock() and shared_lock(). + * Neither has barrier semantics, but only provides control. + * Dependency. + */ +static void ummu_mcmdq_shared_lock(struct ummu_mcmdq *mcmdq) +{ + int val; + + /* + * We can try to avoid the cmpxchg() loop by simply incrementing the + * lock counter. When held in exclusive state, the lock counter is set + * to INT_MIN so these increments won't hurt as the value will remain + * negative. + */ + if (atomic_fetch_inc_relaxed(&mcmdq->lock) >= 0) + return; + + do { + val = atomic_cond_read_relaxed(&mcmdq->lock, VAL >= 0); + } while (atomic_cmpxchg_relaxed(&mcmdq->lock, val, val + 1) != val); +} + +static void ummu_mcmdq_shared_unlock(struct ummu_mcmdq *mcmdq) +{ + (void)atomic_dec_return_release(&mcmdq->lock); +} + +static bool ummu_mcmdq_shared_tryunlock(struct ummu_mcmdq *mcmdq) +{ + if (atomic_read(&mcmdq->lock) == 1) + return false; + + ummu_mcmdq_shared_unlock(mcmdq); + + return true; +} + +static int ummu_mcmdq_build_nop_cmd(u64 *cmd, struct ummu_mcmdq_ent *ent) +{ + cmd[0] |= FIELD_PREP(CMD_NULL_OP_SUB_OP, ent->null_op.sub_op); + switch (ent->null_op.sub_op) { + case SUB_CMD_NULL_CHECK_PA_CONTINUITY: + cmd[0] |= FIELD_PREP(SUB_OP_CHECK_PA_CONTI_0_RESULT, + ent->null_op.check_pa_conti.result); + cmd[0] |= ent->null_op.check_pa_conti.flag ? 
+ SUB_OP_CHECK_PA_CONTI_0_FLAG : 0; + cmd[0] |= FIELD_PREP(SUB_OP_CHECK_PA_CONTI_0_SIZE, + ent->null_op.check_pa_conti.size_order); + cmd[0] |= FIELD_PREP(SUB_OP_CHECK_PA_CONTI_0_ID, + ent->null_op.check_pa_conti.id); + cmd[1] |= SUB_OP_CHECK_PA_CONTI_1_ADDR & + ent->null_op.check_pa_conti.addr; + break; + default: + return -EINVAL; + } + return 0; +} + +int ummu_mcmdq_build_cmd(struct ummu_device *ummu, u64 *cmd, + struct ummu_mcmdq_ent *ent) +{ + memset(cmd, 0, 1 << MCMDQ_ENT_SZ_SHIFT); + cmd[0] |= FIELD_PREP(CMD_0_OP, ent->opcode); + + /* build cmd method for different cmds */ + switch (ent->opcode) { + case CMD_SYNC: + if (ent->sync.msi_addr) { + cmd[0] |= FIELD_PREP(CMD_SYNC_0_CM, CMD_SYNC_0_CM_IRQ); + cmd[1] |= ent->sync.msi_addr & CMD_SYNC_1_MSIADDR; + } else if (ent->sync.support_sev) { + cmd[0] |= FIELD_PREP(CMD_SYNC_0_CM, CMD_SYNC_0_CM_SEV); + } else { + cmd[0] |= FIELD_PREP(CMD_SYNC_0_CM, CMD_SYNC_0_CM_NONE); + } + cmd[0] |= FIELD_PREP(CMD_SYNC_0_MSISH, UMMU_SH_ISH); + cmd[0] |= FIELD_PREP(CMD_SYNC_0_MSIATTR, UMMU_MEMATTR_OIWB); + break; + case CMD_STALL_RESUME: + cmd[0] |= ent->stall_resume.dsec ? CMD_STALL_0_DSEC : 0; + cmd[0] |= ent->stall_resume.retry ? CMD_STALL_0_RETRY : 0; + cmd[0] |= ent->stall_resume.abort ? CMD_STALL_0_ABORT : 0; + cmd[1] |= FIELD_PREP(CMD_STALL_1_TAG, ent->stall_resume.tag); + cmd[2] |= FIELD_PREP(CMD_STALL_2_TECT_TAG, + ent->stall_resume.tect_tag); + break; + case CMD_STALL_TERM: + cmd[2] |= FIELD_PREP(CMD_STALL_2_TECT_TAG, ent->stall_resume.tect_tag); + break; + case CMD_PREFET_CFG: + cmd[0] |= ent->prefet.tkv ? CMD_PREFET_0_TKV : 0; + cmd[0] |= FIELD_PREP(CMD_PREFET_0_TID, ent->prefet.tid); + cmd[2] |= FIELD_PREP(CMD_PREFET_2_DEID_0, ent->prefet.deid_0); + cmd[2] |= FIELD_PREP(CMD_PREFET_2_DEID_1, ent->prefet.deid_1); + cmd[3] |= FIELD_PREP(CMD_PREFET_3_DEID_0, ent->prefet.deid_2); + cmd[3] |= FIELD_PREP(CMD_PREFET_3_DEID_1, ent->prefet.deid_3); + break; + case CMD_CFGI_TECT: + cmd[0] |= ent->cfgi.leaf ? CMD_CFGI_0_LEAF : 0; + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_CFGI_TECT_RANGE: + cmd[0] |= FIELD_PREP(CMD_CFGI_0_RANGE, ent->cfgi.range); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_CFGI_TCT: + cmd[0] |= ent->cfgi.leaf ? CMD_CFGI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_CFGI_0_TID, ent->cfgi.tid); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_CFGI_TCT_ALL: + cmd[0] |= ent->cfgi.leaf ? 
CMD_CFGI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_CFGI_0_TID, ent->cfgi.tid); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_PLBI_OS_EID: + cmd[2] |= FIELD_PREP(CMD_PLBI_2_TECTE_TAG, ent->plbi.tecte_tag); + break; + case CMD_PLBI_OS_EIDTID: + cmd[0] |= FIELD_PREP(CMD_PLBI_0_TID, ent->plbi.tid); + cmd[2] |= FIELD_PREP(CMD_PLBI_2_TECTE_TAG, ent->plbi.tecte_tag); + break; + case CMD_PLBI_OS_VA: + cmd[0] |= FIELD_PREP(CMD_PLBI_0_TID, ent->plbi.tid); + cmd[0] |= FIELD_PREP(CMD_PLBI_0_RANGE, ent->plbi.range); + cmd[1] |= ent->plbi.addr & CMD_PLBI_1_ADDR_MASK; + cmd[2] |= FIELD_PREP(CMD_PLBI_2_TECTE_TAG, ent->plbi.tecte_tag); + break; + case CMD_TLBI_OS_ALL: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + break; + case CMD_TLBI_OS_TID: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + break; + case CMD_TLBI_OS_VA: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + fallthrough; + case CMD_TLBI_OS_VAA: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_HYP_TID: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + break; + case CMD_TLBI_HYP_VA: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + fallthrough; + case CMD_TLBI_HYP_VAA: + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_S1S2_VMALL: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + break; + case CMD_TLBI_S2_IPA: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_HYP_ALL: + case CMD_TLBI_NS_OS_ALL: + break; + case CMD_CREATE_KVTBL: + cmd[0] |= ent->create_kvtbl.evt_en ? + CMD_CREATE_KVTBL0_EVT_EN : 0; + cmd[0] |= FIELD_PREP(CMD_CREATE_KVTBL0_TAG_MASK, + ent->create_kvtbl.tecte_tag); + cmd[0] |= FIELD_PREP(CMD_CREATE_KVTBL0_KV_INDEX_MASK, + ent->create_kvtbl.kv_index); + cmd[1] |= ent->create_kvtbl.tect_base_addr & + CMD_CREATE_KVTBL1_ADDR_MASK; + cmd[2] |= FIELD_PREP(CMD_CREATE_KVTBL2_EID_LOW, + ent->create_kvtbl.eid_low); + cmd[3] |= FIELD_PREP(CMD_CREATE_KVTBL3_EID_HIGH, + ent->create_kvtbl.eid_high); + break; + case CMD_DELETE_KVTBL: + cmd[0] |= ent->delete_kvtbl.evt_en ? 
+ CMD_DELETE_KVTBL0_EVT_EN : 0; + cmd[0] |= FIELD_PREP(CMD_DELETE_KVTBL0_TAG_MASK, + ent->delete_kvtbl.tecte_tag); + cmd[0] |= FIELD_PREP(CMD_DELETE_KVTBL0_KV_INDEX_MASK, + ent->delete_kvtbl.kv_index); + cmd[2] |= FIELD_PREP(CMD_DELETE_KVTBL2_EID_LOW, + ent->delete_kvtbl.eid_low); + cmd[3] |= FIELD_PREP(CMD_DELETE_KVTBL3_EID_HIGH, + ent->delete_kvtbl.eid_high); + break; + case CMD_TLBI_OS_ALL_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + break; + case CMD_TLBI_OS_ASID_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + break; + case CMD_TLBI_OS_VA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + fallthrough; + case CMD_TLBI_OS_VAA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_HYP_ASID_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + break; + case CMD_TLBI_HYP_VA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_S1S2_VMALL_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + break; + case CMD_TLBI_S2_IPA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_IPA_MASK; + break; + case CMD_NULL_OP: + return ummu_mcmdq_build_nop_cmd(cmd, ent); + default: + return -EINVAL; + } + + return 0; +} + +/* + * Command queue insertion. + * This process became cumbersome as we aimed to achieve scalability + * due to the shared queue among all CPUs in the system. + * If you desire a combination of size concurrency, dependency order, + * and loose atoms, then you will absolutely adore this monstrous solution. + * The fundamental concept is to divide the queue into command ranges + * owned by each CPU. The owner may not have authored all the commands + * themselves but assumes responsibility for advancing the hardware product + * pointer under certain circumstances: when it's time. The algorithm can be + * summarized as follows: + * 1. Allocate space within the queue while also determining if another CPU + * currently owns the head of the queue or if we are its rightful owners. + * 2. Write our commands into the allocated slot within the queue. + * 3. Mark our slot as valid in ummu_mcmdq.valid_map. + * 4. If we are indeed the owner: + * A. Wait for completion by any previous owner. + * B. Declare that there is no current owner for this range, + * indicating our responsibility for publishing it. + * C. Await execution of all orders within our possession. + * D. Advance the hardware product pointer. + * E. Notify subsequent hosts that we have completed our tasks. + * 5. 
If we insert CMD_SYNC (whether or not we are its owner), + * then we must persist with it until completion: + * A. If MSI is available, UMMU can write back to CMD_SYNC and + * clear its first 4 bytes. + * B. Otherwise, rotate and wait until the hardware cons pointer points + * beyond our command. + * The devil lies in these intricate details-particularly regarding locking + * mechanisms-to ensure complete synchronization and efficient utilization of + * space within the queue before deeming it full. + */ +static void ummu_mcmdq_build_sync_cmd(u64 *cmd, struct ummu_device *ummu, + struct ummu_queue *q, u32 prod) +{ + struct ummu_mcmdq_ent ent = { + .opcode = CMD_SYNC, + }; + + /* + * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI + * payload, so the write will zero the entire command on that platform. + */ + if (ummu->cap.options & UMMU_OPT_MSIPOLL) + ent.sync.msi_addr = q->base_pa + Q_IDX(&q->llq, prod) * + ENTRY_DWORDS_TO_SIZE(q->ent_dwords); + ent.sync.support_sev = !!(ummu->cap.features & UMMU_FEAT_SEV); + (void)ummu_mcmdq_build_cmd(ummu, cmd, &ent); +} + +static void ummu_mcmdq_poll_set_valid_map(struct ummu_mcmdq *mcmdq, u32 sprod, u32 eprod, bool set) +{ + u32 swidx, sbidx, ewidx, ebidx; + struct ummu_ll_queue llq; + unsigned long valid; + unsigned long mask; + atomic_long_t *ptr; + u32 limit; + + llq.prod = sprod; + llq.log2size = mcmdq->q.llq.log2size; + + ewidx = BIT_WORD(Q_IDX(&llq, eprod)); + ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG; + + while (llq.prod != eprod) { + limit = BITS_PER_LONG; + swidx = BIT_WORD(Q_IDX(&llq, llq.prod)); + sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG; + + ptr = &mcmdq->valid_map[swidx]; + + if ((swidx == ewidx) && (sbidx < ebidx)) + limit = ebidx; + + mask = GENMASK(limit - 1, sbidx); + + if (set) { + atomic_long_xor(mask, ptr); + } else { /* Poll */ + /* + * The valid bit is equal to the wrap bit. + * This means that a queue initialized to 0 is invalid, + * and after all elements are marked as valid, causing a rollover, + * all elements become invalid again. + */ + + valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask; + atomic_long_cond_read_relaxed(ptr, + (VAL & mask) == valid); + } + + llq.prod = ummu_queue_inc_prod_n(&llq, limit - sbidx); + } +} + +/* Mark all entries in the range [sprod, eprod) as valid */ +static void ummu_mcmdq_set_valid_map(struct ummu_mcmdq *mcmdq, u32 sprod, + u32 eprod) +{ + ummu_mcmdq_poll_set_valid_map(mcmdq, sprod, eprod, true); +} + +/* Wait for all entries in the range [sprod, eprod) to become valid */ +static void ummu_mcmdq_poll_valid_map(struct ummu_mcmdq *mcmdq, u32 sprod, + u32 eprod) +{ + ummu_mcmdq_poll_set_valid_map(mcmdq, sprod, eprod, false); +} + +/* + * Wait until the UMMU signals a CMD_SYNC completion MSI. + */ +static int ummu_mcmdq_poll_until_msi(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + u32 *cmd = (u32 *)(Q_ENT(&mcmdq->q, llq->prod)); + struct ummu_queue_poll qp; + int ret = 0; + + ummu_queue_poll_init(ummu, &qp); + + /* + * The MSI won't generate an event, since it's being written back + * into the command queue. + */ + qp.wfe = false; + smp_cond_load_relaxed(cmd, !VAL || (ret = ummu_queue_poll(&qp))); + llq->cons = ret ? readl(mcmdq->q.cons_reg) : ummu_queue_inc_prod_n(llq, 1); + + return ret; +} + +/* + * Wait until the UMMU cons index passes llq->prod. 
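The valid-map bookkeeping in ummu_mcmdq_poll_set_valid_map() above reuses one bitmap across queue wraps: writers XOR the bits for their slots, and the polling owner derives the expected polarity from the wrap bit via (ULONG_MAX + !!wrap) & mask, so the map never needs explicit clearing. A minimal userspace model of that computation (the driver does this with atomic_long_* operations on a per-queue bitmap):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t map = 0;			/* zero-initialised valid_map word */
	uint64_t mask = 0x0fULL << 2;		/* slots 2..5 written in this pass */
	unsigned long wrap = 0;			/* wrap bit of the producing prod  */
	uint64_t valid;

	/* producer path (set == true): flip the slots it wrote */
	map ^= mask;

	/* owner path (poll): expected pattern is (ULONG_MAX + !!wrap) & mask */
	valid = ((uint64_t)-1 + !!wrap) & mask;
	printf("pass 0 ready=%d\n", (map & mask) == valid);	/* 1 */

	/* after the ring wraps, the same slots are reused: producers flip the
	 * bits back and the owner's expected polarity flips with the wrap bit */
	wrap = 1;
	map ^= mask;
	valid = ((uint64_t)-1 + !!wrap) & mask;
	printf("pass 1 ready=%d\n", (map & mask) == valid);	/* 1 */
	return 0;
}
```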
+ */ +static int ummu_mcmdq_poll_until_consumed(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + struct ummu_queue_poll qp; + u32 prod = llq->prod; + int ret = 0; + + ummu_queue_poll_init(ummu, &qp); + llq->val = READ_ONCE(mcmdq->q.llq.val); + do { + if (ummu_queue_consumed(llq, prod)) + break; + + ret = ummu_queue_poll(&qp); + + /* + * This needs to be a readl() so that our subsequent call + * to ummu_mcmdq_shared_tryunlock() can fail accurately. + * + * Specifically, we need to ensure that we observe all + * shared_lock()s by other CMD_SYNCs that share our owner, + * so that a failing call to tryunlock() means that we're + * the last one out and therefore we can safely advance + * mcmdq->q.llq.cons. Roughly speaking: + */ + llq->cons = readl(mcmdq->q.cons_reg); + } while (!ret); + + return ret; +} + +static int ummu_mcmdq_poll_until_sync(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + if (ummu->cap.options & UMMU_OPT_MSIPOLL) + return ummu_mcmdq_poll_until_msi(ummu, mcmdq, llq); + + return ummu_mcmdq_poll_until_consumed(ummu, mcmdq, llq); +} + +static void ummu_mcmdq_write_entries(struct ummu_mcmdq *mcmdq, u64 *cmds, + u32 prod, int n) +{ + struct ummu_ll_queue llq; + u64 *cmd; + int i; + + llq.prod = prod; + llq.log2size = mcmdq->q.llq.log2size; + + for (i = 0; i < n; ++i) { + cmd = &cmds[i * MCMDQ_ENT_DWORDS]; + prod = ummu_queue_inc_prod_n(&llq, i); + ummu_queue_write(Q_ENT(&mcmdq->q, prod), cmd, MCMDQ_ENT_DWORDS); + } +} + +static int check_pa_continuity_nop_exec(struct ummu_queue *q, u32 prod) +{ + u64 cmd = (u64)le64_to_cpu(Q_ENT(q, prod)); + + if (FIELD_GET(CMD_0_OP, cmd) == CMD_NULL_OP && + FIELD_GET(CMD_NULL_OP_SUB_OP, cmd) == + SUB_CMD_NULL_CHECK_PA_CONTINUITY) { + if (FIELD_GET(SUB_OP_CHECK_PA_CONTI_0_RESULT, cmd)) + return -ENOSPC; + if (FIELD_GET(SUB_OP_CHECK_PA_CONTI_0_FLAG, cmd) == 1 && + FIELD_GET(SUB_OP_CHECK_PA_CONTI_0_ID, cmd) != 0) + return -ERANGE; + } + return 0; +} + +static struct ummu_mcmdq *ummu_device_get_mcmdq(struct ummu_device *ummu, + u64 *cmd) +{ + return *this_cpu_ptr(ummu->mcmdq); +} + +static int ummu_mcmdq_exclusive_issue_cmdlist(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + u64 *cmds, int n, bool sync) +{ + u64 cmd_sync[MCMDQ_ENT_DWORDS], old; + struct ummu_ll_queue llq, head; + unsigned long flags; + u32 prod, sprod; + int ret = 0; + + llq.log2size = mcmdq->q.llq.log2size; + /* 1. Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(mcmdq->q.llq.val); + do { + while (!ummu_queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (ummu_mcmdq_poll_until_not_full(ummu, mcmdq, &llq)) { + dev_err_ratelimited(ummu->dev, "wait MCMDQ not full timeout.\n"); + return -ETIMEDOUT; + } + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = ummu_queue_inc_prod_n(&llq, n + (sync ? 1 : 0)); + + old = cmpxchg_relaxed(&mcmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + sprod = llq.prod; + + /* 2. Write our commands into the queue */ + ummu_mcmdq_write_entries(mcmdq, cmds, llq.prod, n); + if (sync) { + prod = ummu_queue_inc_prod_n(&llq, n); + ummu_mcmdq_build_sync_cmd(cmd_sync, ummu, &mcmdq->q, prod); + ummu_queue_write(Q_ENT(&mcmdq->q, prod), cmd_sync, MCMDQ_ENT_DWORDS); + } + + /* 3. Ensuring commands are visible first */ + dma_wmb(); + + /* 4. 
Advance the hardware prod pointer */ + read_lock(&mcmdq->mcmdq_lock); + writel_relaxed(head.prod | mcmdq->mcmdq_prod, mcmdq->q.prod_reg); + read_unlock(&mcmdq->mcmdq_lock); + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = ummu_queue_inc_prod_n(&llq, n); + ret = ummu_mcmdq_poll_until_sync(ummu, mcmdq, &llq); + if (ret) { + /* + * When sync times out, error handling cannot be performed more + * effectively and CIs need to be maintained. Therefore, continue. + */ + dev_err_ratelimited(ummu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, + readl_relaxed(mcmdq->q.prod_reg), + readl_relaxed(mcmdq->q.cons_reg)); + } + + ret = check_pa_continuity_nop_exec(&mcmdq->q, sprod); + /* + * Update mcmdq->q.llq.cons, to improve the success rate of + * ummu_queue_has_space() when some new commands are inserted next + * time. + */ + WRITE_ONCE(mcmdq->q.llq.cons, llq.cons); + } + + local_irq_restore(flags); + return ret; +} + +/* + * The actual insert function provides the following functionality for + * sorting guarantees to callers: + * - Prioritizing write ordering of data structures in memory + * by ensuring a dma_wmb() before publishing any + * command to the queue. + * - Sorting subsequent writes to memory + * (e.g., releasing the IOVA after CMD_SYNC is complete) through a + * control dependency when CMD_SYNC is finished. + * - Ensuring fully ordered command insertion, where if two CPUs + * compete with each other to insert their own command lists, one CPU's + * commands will always appear before any commands from another CPU. + */ +int ummu_mcmdq_issue_cmdlist(struct ummu_device *ummu, u64 *cmds, + int n, bool sync) +{ + struct ummu_mcmdq *mcmdq = ummu_device_get_mcmdq(ummu, cmds); + u64 cmd_sync[MCMDQ_ENT_DWORDS]; + struct ummu_ll_queue llq, head; + unsigned long flags; + u32 prod, sprod; + int ret = 0; + bool owner; + u64 old; + + if (unlikely(!mcmdq->shared)) + return ummu_mcmdq_exclusive_issue_cmdlist(ummu, mcmdq, cmds, + n, sync); + + llq.log2size = mcmdq->q.llq.log2size; + + /* 1. Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(mcmdq->q.llq.val); + do { + while (!ummu_queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (ummu_mcmdq_poll_until_not_full(ummu, mcmdq, &llq)) { + dev_err_ratelimited(ummu->dev, "wait MCMDQ not full timeout.\n"); + return -ETIMEDOUT; + } + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = ummu_queue_inc_prod_n(&llq, n + sync) | + MCMDQ_PROD_OWNED_FLAG; + + old = cmpxchg_relaxed(&mcmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + owner = !(llq.prod & MCMDQ_PROD_OWNED_FLAG); + head.prod &= ~MCMDQ_PROD_OWNED_FLAG; + llq.prod &= ~MCMDQ_PROD_OWNED_FLAG; + sprod = llq.prod; + /* + * 2. Write our commands into the queue + * Dependency ordering from the cmpxchg() loop above. + */ + ummu_mcmdq_write_entries(mcmdq, cmds, llq.prod, n); + if (sync) { + prod = ummu_queue_inc_prod_n(&llq, n); + ummu_mcmdq_build_sync_cmd(cmd_sync, ummu, &mcmdq->q, prod); + ummu_queue_write(Q_ENT(&mcmdq->q, prod), cmd_sync, MCMDQ_ENT_DWORDS); + + /* + * In order to determine completion of our CMD_SYNC, we must + * ensure that the queue can't wrap twice without us noticing. + * We achieve that by taking the mcmdq lock as shared before + * marking our slot as valid. + */ + ummu_mcmdq_shared_lock(mcmdq); + } + + /* 3. 
Mark our slots as valid, ensuring commands are visible first */ + dma_wmb(); + ummu_mcmdq_set_valid_map(mcmdq, llq.prod, head.prod); + + /* 4. If we are the owner, take control of the UMMU hardware */ + if (owner) { + /* a. Wait for previous owner to finish */ + atomic_cond_read_relaxed(&mcmdq->owner_prod, VAL == llq.prod); + + /* b. Stop gathering work by clearing the owned flag */ + prod = atomic_fetch_andnot_relaxed(MCMDQ_PROD_OWNED_FLAG, + &mcmdq->q.llq.atomic.prod); + prod &= ~MCMDQ_PROD_OWNED_FLAG; + + /* + * c. Wait for any gathered work to be written to the queue. + * Note that we read our own entries so that we have the control + * dependency required by (d). + */ + ummu_mcmdq_poll_valid_map(mcmdq, llq.prod, prod); + + /* + * d. Advance the hardware prod pointer + * Control dependency ordering from the entries becoming valid. + */ + read_lock(&mcmdq->mcmdq_lock); + writel_relaxed(prod | mcmdq->mcmdq_prod, mcmdq->q.prod_reg); + read_unlock(&mcmdq->mcmdq_lock); + + /* + * e. Tell the next owner we're done + * Make sure we've updated the hardware first, so that we don't + * race to update prod and potentially move it backwards. + */ + atomic_set_release(&mcmdq->owner_prod, prod); + } + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = ummu_queue_inc_prod_n(&llq, n); + + ret = ummu_mcmdq_poll_until_sync(ummu, mcmdq, &llq); + if (ret) + dev_err_ratelimited( + ummu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, readl_relaxed(mcmdq->q.prod_reg), + readl_relaxed(mcmdq->q.cons_reg)); + + ret = check_pa_continuity_nop_exec(&mcmdq->q, sprod); + /* + * Try to unlock the mcmdq lock. This will fail if we're the last + * reader, in which case we can safely update mcmdq->q.llq.cons + */ + if (!ummu_mcmdq_shared_tryunlock(mcmdq)) { + WRITE_ONCE(mcmdq->q.llq.cons, llq.cons); + ummu_mcmdq_shared_unlock(mcmdq); + } + } + + local_irq_restore(flags); + return ret; +} + +static int __ummu_mcmdq_issue_cmd(struct ummu_device *ummu, + struct ummu_mcmdq_ent *ent, bool sync) +{ + u64 cmd[MCMDQ_ENT_DWORDS]; + + if (unlikely(ummu_mcmdq_build_cmd(ummu, cmd, ent))) { + dev_warn(ummu->dev, "ignoring unknown MCMDQ opcode = 0x%x\n", + ent->opcode); + return -EINVAL; + } + + return ummu_mcmdq_issue_cmdlist(ummu, cmd, 1, sync); +} + +int ummu_mcmdq_issue_cmd(struct ummu_device *ummu, struct ummu_mcmdq_ent *ent) +{ + return __ummu_mcmdq_issue_cmd(ummu, ent, false); +} + +int ummu_mcmdq_issue_cmd_with_sync(struct ummu_device *ummu, + struct ummu_mcmdq_ent *ent) +{ + return __ummu_mcmdq_issue_cmd(ummu, ent, true); +} + +void ummu_mcmdq_batch_add(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds, + struct ummu_mcmdq_ent *cmd) +{ + int index; + + if (cmds->num == MCMDQ_BATCH_ENTRIES) { + (void)ummu_mcmdq_issue_cmdlist(ummu, cmds->cmds, cmds->num, false); + cmds->num = 0; + } + + index = cmds->num * MCMDQ_ENT_DWORDS; + if (unlikely(ummu_mcmdq_build_cmd(ummu, &cmds->cmds[index], cmd))) { + dev_warn(ummu->dev, "ignoring unknown MCMDQ opcode = 0x%x\n", + cmd->opcode); + return; + } + + cmds->num++; +} + +int ummu_mcmdq_batch_submit(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds) +{ + return ummu_mcmdq_issue_cmdlist(ummu, cmds->cmds, cmds->num, true); +} diff --git a/drivers/iommu/hisilicon/queue.h b/drivers/iommu/hisilicon/queue.h new file mode 100644 index 0000000000000000000000000000000000000000..2f60fd52ce8342d9af5dd24dd24109e303f46268 --- /dev/null +++ b/drivers/iommu/hisilicon/queue.h @@ -0,0 +1,324 @@ +/* 
SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: mcmdq/evtq/permq header file + */ + +#ifndef __UMMU_QUEUE_H__ +#define __UMMU_QUEUE_H__ + +#include "ummu.h" + +#define Q_IDX(llq, p) ((p) & ((1 << (llq)->log2size) - 1)) +#define Q_WRP(llq, p) ((p) & (1 << (llq)->log2size)) +#define Q_OVERFLOW_FLAG (1UL << 31) +#define Q_OVF(p) ((p)&Q_OVERFLOW_FLAG) +#define Q_ENT(q, p) ((q)->base + Q_IDX(&((q)->llq), p) * (q)->ent_dwords) + +/* + * Ensure DMA allocations are naturally aligned + * Hardware requirements base address by address length align + */ +#if IS_ENABLED(CONFIG_CMA_ALIGNMENT) +#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT) +#else +#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER) +#endif + +#define Q_BASE_RWA (1ULL << 63) +#define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5) +#define Q_BASE_LOG2SIZE GENMASK(4, 0) + +/* multiple command queue */ +#define MCMDQ_ENT_SZ_SHIFT 5 +#define MCMDQ_ENT_DWORDS ((1UL << MCMDQ_ENT_SZ_SHIFT) / sizeof(u64)) +#define MCMDQ_ENT_SIZE 16 +#define MCMDQ_MAX_SZ_SHIFT 8 + +#define UMMU_MCMDQ_OFFSET 0x100 +#define MCMDQ_PROD_OFFSET 0x8 +#define MCMDQ_CONS_OFFSET 0xC +#define MCMDQ_CONS_ERR (1UL << 23) +#define MCMDQ_PROD_ERRACK (1UL << 23) +#define MCMDQ_PROD_EN (1UL << 31) +#define MCMDQ_EN_RESP (1UL << 31) + +#define MCMDQ_CONS_ERR_REASON GENMASK(26, 24) +#define MCMDQ_CERROR_NONE_IDX 0 +#define MCMDQ_CERROR_ILL_IDX 1 +#define MCMDQ_CERROR_ABT_IDX 2 + +#define MCMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG + +#define MCMDQ_BATCH_ENTRIES 32 +#define CMD_0_OP GENMASK_ULL(7, 0) +#define CMD_0_SSV (1UL << 11) + +#define CMD_SYNC_0_CM GENMASK_ULL(13, 12) +#define CMD_SYNC_0_CM_NONE 0 +#define CMD_SYNC_0_CM_IRQ 1 +#define CMD_SYNC_0_CM_SEV 2 +#define CMD_SYNC_0_MSISH GENMASK_ULL(15, 14) +#define CMD_SYNC_0_MSIATTR GENMASK_ULL(19, 16) +#define CMD_SYNC_0_MSIDATA GENMASK_ULL(63, 32) +#define CMD_SYNC_1_MSIADDR GENMASK_ULL(51, 2) + +#define CMD_STALL_0_DSEC (1UL << 10) +#define CMD_STALL_0_RETRY (1UL << 12) +#define CMD_STALL_0_ABORT (1UL << 13) +#define CMD_STALL_1_TAG GENMASK_ULL(15, 0) +#define CMD_STALL_2_TECT_TAG GENMASK_ULL(15, 0) + +#define CMD_PREFET_0_TKV (1UL << 11) +#define CMD_PREFET_0_TID GENMASK_ULL(31, 12) +#define CMD_PREFET_0_SIZE GENMASK_ULL(56, 52) +#define CMD_PREFET_0_STRIDE GENMASK_ULL(61, 57) +#define CMD_PREFET_1_ADDR_MASK GENMASK_ULL(63, 12) +#define CMD_PREFET_2_TECTE_TAG GENMASK_ULL(15, 0) +#define CMD_PREFET_2_DEID_0 GENMASK_ULL(31, 0) +#define CMD_PREFET_2_DEID_1 GENMASK_ULL(63, 32) +#define CMD_PREFET_3_DEID_0 GENMASK_ULL(31, 0) +#define CMD_PREFET_3_DEID_1 GENMASK_ULL(63, 32) + +#define CMD_CFGI_0_LEAF (1UL << 8) +#define CMD_CFGI_0_TID GENMASK_ULL(31, 12) +#define CMD_CFGI_0_VMID GENMASK_ULL(47, 32) +#define CMD_CFGI_0_RANGE GENMASK_ULL(56, 52) +#define CMD_CFGI_2_TECTE_TAG GENMASK_ULL(15, 0) +#define CMD_CFGI_2_DEID_0 GENMASK_ULL(31, 0) +#define CMD_CFGI_2_DEID_1 GENMASK_ULL(63, 32) +#define CMD_CFGI_3_DEID_0 GENMASK_ULL(31, 0) +#define CMD_CFGI_3_DEID_1 GENMASK_ULL(63, 32) + +#define CMD_TLBI_0_LEAF (1UL << 8) +#define CMD_TLBI_0_ASID GENMASK_ULL(27, 12) +#define CMD_TLBI_0_TOKEN_ID GENMASK_ULL(31, 12) +#define CMD_TLBI_0_VMID GENMASK_ULL(47, 32) +#define CMD_TLBI_0_NUM GENMASK_ULL(56, 52) +#define CMD_TLBI_0_SCALE GENMASK_ULL(61, 57) +#define CMD_TLBI_0_TL GENMASK_ULL(63, 62) +#define CMD_TLBI_1_GS GENMASK_ULL(1, 0) +#define CMD_TLBI_1_VA_MASK GENMASK_ULL(63, 12) +#define CMD_TLBI_1_IPA_MASK GENMASK_ULL(51, 12) +#define CMD_TLBI_2_TECTE_TAG 
GENMASK_ULL(15, 0) +#define CMD_TLBI_RANGE_NUM_MAX 31 + +#define CMD_PLBI_0_TID GENMASK_ULL(31, 12) +#define CMD_PLBI_0_RANGE GENMASK_ULL(37, 32) +#define CMD_PLBI_1_ADDR_MASK GENMASK_ULL(63, 0) +#define CMD_PLBI_2_TECTE_TAG GENMASK_ULL(15, 0) + +#define CMD_CREATE_KVTBL0_EVT_EN BIT(8) +#define CMD_CREATE_KVTBL0_TAG_MASK GENMASK_ULL(31, 16) +#define CMD_CREATE_KVTBL0_KV_INDEX_MASK GENMASK_ULL(63, 32) +#define CMD_CREATE_KVTBL1_ADDR_MASK GENMASK_ULL(51, 6) +#define CMD_CREATE_KVTBL2_EID_LOW GENMASK_ULL(63, 0) +#define CMD_CREATE_KVTBL3_EID_HIGH GENMASK_ULL(63, 0) + +#define CMD_DELETE_KVTBL0_EVT_EN BIT(8) +#define CMD_DELETE_KVTBL0_TAG_MASK GENMASK_ULL(31, 16) +#define CMD_DELETE_KVTBL0_KV_INDEX_MASK GENMASK_ULL(63, 32) +#define CMD_DELETE_KVTBL2_EID_LOW GENMASK_ULL(63, 0) +#define CMD_DELETE_KVTBL3_EID_HIGH GENMASK_ULL(63, 0) + +#define CMD_NULL_OP_SUB_OP GENMASK(15, 8) +#define SUB_OP_CHECK_PA_CONTI_0_RESULT GENMASK(19, 16) +#define SUB_OP_CHECK_PA_CONTI_0_FLAG BIT(20) +#define SUB_OP_CHECK_PA_CONTI_0_SIZE GENMASK(29, 24) +#define SUB_OP_CHECK_PA_CONTI_0_ID GENMASK_ULL(40, 32) +#define SUB_OP_CHECK_PA_CONTI_1_ADDR GENMASK_ULL(63, 12) + +/* event queue */ +#define EVTQ_ENT_SZ_SHIFT 6 +#define EVTQ_ENT_DWORDS (1UL << EVTQ_ENT_SZ_SHIFT >> 3) +#define EVTQ_MAX_SZ_SHIFT (Q_MAX_SZ_SHIFT - EVTQ_ENT_SZ_SHIFT) + +#define UMMU_EVTQ_OFFSET 0x1100 +#define UMMU_EVTQ_PROD_OFFSET 0x1108 +#define UMMU_EVTQ_CONS_OFFSET 0x110C + +#define EVTQ_ENT0_CODE GENMASK(7, 0) +#define EVTQ_ENT0_RNW (1U << 11) +#define EVTQ_ENT0_IND (1U << 12) +#define EVTQ_ENT0_PNU (1U << 13) +#define EVTQ_ENT0_CLS GENMASK(15, 14) +#define EVTQ_ENT0_NSIPA (1U << 16) +#define EVTQ_ENT0_S2 (1U << 17) +#define EVTQ_ENT0_STALL (1U << 18) +#define EVTQ_ENT0_TTRNW (1U << 19) +#define EVTQ_ENT0_TID GENMASK_ULL(51, 32) + +#define EVTQ_ENT1_STAG GENMASK(15, 0) +#define EVTQ_ENT1_IMPL_DEF GENMASK(31, 16) +#define EVTQ_ENT1_REASON GENMASK_ULL(63, 32) + +#define EVTQ_ENT2_IPA GENMASK_ULL(51, 12) +#define EVTQ_ENT3_IADDR GENMASK_ULL(63, 0) +#define EVTQ_ENT4_TECTE_TAG GENMASK(15, 0) +#define EVTQ_ENT4_EID_LOW GENMASK_ULL(63, 0) +#define EVTQ_ENT5_EID_HIGH GENMASK_ULL(63, 0) +#define EVTQ_ENT6_FTADDR GENMASK_ULL(51, 3) + +struct ummu_mcmdq_batch { + u64 cmds[MCMDQ_BATCH_ENTRIES * MCMDQ_ENT_DWORDS]; + int num; +}; + +struct ummu_mcmdq_ent { + /* Common fields */ + u8 opcode; + + /* Command-specific fields */ + union { +#define CMD_SYNC 0x1 + struct { + u64 msi_addr; + bool support_sev; + } sync; + +#define CMD_STALL_RESUME 0x02 + struct { + bool dsec; + bool retry; + bool abort; + u16 tect_tag; + u16 tag; + } stall_resume; + +#define CMD_STALL_TERM 0x03 + struct { + u16 tect_tag; + } stall_term; + +#define CMD_PREFET_CFG 0x04 + struct { + bool tkv; + u32 tid; + u32 deid_0; + u32 deid_1; + u32 deid_2; + u32 deid_3; + } prefet; + +#define CMD_CFGI_TECT 0x08 +#define CMD_CFGI_TECT_RANGE 0x09 +#define CMD_CFGI_TCT 0x0A +#define CMD_CFGI_TCT_ALL 0x0B +#define CMD_CFGI_TECTS_PIDM 0x0C + struct { + bool leaf; + u32 tid; + u16 vmid; + u8 range; + u32 deid_0; + u32 deid_1; + u32 deid_2; + u32 deid_3; + } cfgi; + +#define CMD_PLBI_OS_EID 0x14 +#define CMD_PLBI_OS_EIDTID 0x15 +#define CMD_PLBI_OS_VA 0x16 + struct { + u32 tid; + u16 tecte_tag; + u8 range; + u64 addr; + } plbi; + +#define CMD_TLBI_OS_ALL 0x10 +#define CMD_TLBI_OS_TID 0x11 +#define CMD_TLBI_OS_VA 0x12 +#define CMD_TLBI_OS_VAA 0x13 +#define CMD_TLBI_HYP_ALL 0x18 +#define CMD_TLBI_HYP_TID 0x19 +#define CMD_TLBI_HYP_VA 0x1A +#define CMD_TLBI_HYP_VAA 0x1B +#define CMD_TLBI_S1S2_VMALL 0x28 +#define 
CMD_TLBI_S2_IPA 0x2A +#define CMD_TLBI_NS_OS_ALL 0x2C +#define CMD_TLBI_OS_ALL_U 0x90 +#define CMD_TLBI_OS_ASID_U 0x91 +#define CMD_TLBI_OS_VA_U 0x92 +#define CMD_TLBI_OS_VAA_U 0x93 +#define CMD_TLBI_HYP_ASID_U 0x99 +#define CMD_TLBI_HYP_VA_U 0x9A +#define CMD_TLBI_S1S2_VMALL_U 0xA8 +#define CMD_TLBI_S2_IPA_U 0xAA + struct { + bool leaf; + u16 asid; + u16 vmid; + u32 tid; + u16 tect_tag; + u8 num; + u8 scale; + u8 tl; + u8 gs; + u64 addr; + } tlbi; + +#define CMD_RESUME 0x44 + struct { + u32 deid; + u16 stag; + u8 resp; + } resume; + +#define CMD_CREATE_KVTBL 0x60 + struct { + bool evt_en; + u16 tecte_tag; + u32 kv_index; + u64 tect_base_addr; + u64 eid_low; + u64 eid_high; + } create_kvtbl; + +#define CMD_DELETE_KVTBL 0x61 + struct { + bool evt_en; + u16 tecte_tag; + u32 kv_index; + u64 eid_low; + u64 eid_high; + } delete_kvtbl; + +#define CMD_NULL_OP 0x62 + struct { + u8 sub_op; + union { +#define SUB_CMD_NULL_CHECK_PA_CONTINUITY 0x1 + struct { + u16 result; + u16 flag; + u32 size_order; + u32 id; + u64 addr; + } check_pa_conti; + }; + } null_op; + }; +}; + +void ummu_queue_write(__le64 *dst, u64 *src, size_t n_dwords); +void ummu_queue_read(u64 *dst, __le64 *src, size_t n_dwords); +int ummu_queue_remove_raw(struct ummu_queue *q, u64 *ent); +int ummu_queue_sync_prod_in(struct ummu_queue *q); +bool ummu_queue_empty(struct ummu_ll_queue *q); +int ummu_write_evtq_regs(struct ummu_device *ummu); +int ummu_init_queues(struct ummu_device *ummu); +int ummu_device_mcmdq_init_cfg(struct ummu_device *ummu); +int ummu_mcmdq_issue_cmd(struct ummu_device *ummu, struct ummu_mcmdq_ent *ent); +int ummu_mcmdq_build_cmd(struct ummu_device *ummu, u64 *cmd, + struct ummu_mcmdq_ent *ent); +int ummu_mcmdq_issue_cmdlist(struct ummu_device *ummu, u64 *cmds, + int n, bool sync); +int ummu_mcmdq_issue_cmd_with_sync(struct ummu_device *ummu, + struct ummu_mcmdq_ent *ent); +void ummu_mcmdq_batch_add(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds, + struct ummu_mcmdq_ent *cmd); +int ummu_mcmdq_batch_submit(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds); +#endif /* __UMMU_QUEUE_H__ */ diff --git a/drivers/iommu/hisilicon/regs.h b/drivers/iommu/hisilicon/regs.h new file mode 100644 index 0000000000000000000000000000000000000000..23d3b033c60958605f6330f9566fcf7ca16ae601 --- /dev/null +++ b/drivers/iommu/hisilicon/regs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
*/ + +#ifndef __UMMU_REGS_H__ +#define __UMMU_REGS_H__ + +#define UMMU_REG_SZ 0x5000 + +/* MMIO registers */ +#define UMMU_IIDR 0x0 +#define IIDR_PROD_ID GENMASK(19, 8) +#define IIDR_PROD_VARIANT GENMASK(7, 4) +#define UMMU_AIDR 0x4 + +#define UMMU_CAP0 0x10 +#define CAP0_TECT_LVL_BIT (1UL << 19) +#define CAP0_TECT_MODE_MASK GENMASK(18, 17) +#define CAP0_TCT_LVL_BIT (1UL << 16) +#define CAP0_S2_ATTR_TYPE (1UL << 15) +#define CAP0_ATTR_TYPES_OVR (1UL << 14) +#define CAP0_ATTR_PERMS_OVR (1UL << 13) +#define CAP0_TIDSIZE_MASK GENMASK(12, 8) +#define CAP0_DEIDSIZE_MASK GENMASK(7, 0) + +#define UMMU_CAP1 0x14 +#define CAP1_STALL_MAX GENMASK(31, 20) +#define CAP1_EVENT_GEN (1UL << 19) +#define CAP1_MCMDQ_SUPPORT (1UL << 18) +#define CAP1_MCMDQ_LOG2NUM GENMASK(17, 14) +#define CAP1_MCMDQ_LOG2SIZE GENMASK(13, 10) +#define MCMDQ_MAX_LOG2SIZE 15 +#define CAP1_EVENTQ_SUPPORT (1UL << 9) +#define CAP1_EVENTQ_LOG2NUM GENMASK(8, 5) +#define CAP1_EVENTQ_LOG2SIZE GENMASK(4, 0) +#define EVTQ_MAX_LOG2SIZE 19 + +#define UMMU_CAP2 0x18 +#define CAP2_TTF_MASK GENMASK(15, 14) +#define CAP2_TTF_IAS_40 40 +#define CAP2_TTF_AARCH64 2 +#define CAP2_TTF_AARCH32_64 3 +#define CAP2_TRANS_SMALL_BIT (1UL << 13) +#define CAP2_S1P_BIT (1UL << 12) +#define CAP2_S2P_BIT (1UL << 11) +#define CAP2_VA_EXT_MASK GENMASK(10, 9) +#define CAP2_VA_EXT_52 1 +#define CAP2_GRAN64K_BIT (1UL << 8) +#define CAP2_GRAN16K_BIT (1UL << 7) +#define CAP2_GRAN4K_BIT (1UL << 6) +#define CAP2_OAS_MASK GENMASK(5, 3) +#define CAP2_OAS_32_BIT 0 +#define CAP2_OAS_36_BIT 1 +#define CAP2_OAS_40_BIT 2 +#define CAP2_OAS_42_BIT 3 +#define CAP2_OAS_44_BIT 4 +#define CAP2_OAS_48_BIT 5 +#define CAP2_TTF_OAS_32 32 +#define CAP2_TTF_OAS_36 36 +#define CAP2_TTF_OAS_40 40 +#define CAP2_TTF_OAS_42 42 +#define CAP2_TTF_OAS_44 44 +#define CAP2_TTF_OAS_48 48 +#define CAP2_RTLBI_BIT (1UL << 2) +#define CAP2_BTLBI_BIT (1UL << 1) +#define CAP2_VMIDTLBI_BIT (1UL << 0) + +#define UMMU_CAP3 0x1C +#define CAP3_SATIMAX_MASK GENMASK(20, 15) +#define CAP3_TERM_MODEL_BIT (1UL << 14) +#define CAP3_STALL_MODEL_MASK GENMASK(13, 12) +#define CAP3_STALL_MODE 0 +#define CAP3_STALL_MODE_FORCE 2 +#define CAP3_MSI_SUPPORT_BIT (1UL << 11) +#define CAP3_HYP_S1CTX_BIT (1UL << 10) +#define CAP3_HTTU_MASK GENMASK(9, 8) +#define CAP3_HTTU_ACCESS_DIRTY 2 +#define CAP3_HTTU_ACCESS 1 +#define CAP3_MTM_BIT (1UL << 7) +#define CAP3_TTENDIAN_MASK GENMASK(6, 5) +#define CAP3_TTENDIAN_MIXED 0 +#define CAP3_TTENDIAN_LE 2 +#define CAP3_TTENDIAN_BE 3 +#define CAP3_COHACC_BIT (1UL << 4) +#define CAP3_BBML_MASK GENMASK(3, 2) +#define CAP3_BBML0 0 +#define CAP3_BBML1 1 +#define CAP3_BBML2 2 +#define CAP3_S2_EXE_NEVER_CTRL_BIT (1UL << 1) +#define CAP3_HIER_ARRT_DISABLE_BIT (1UL << 0) + +#define UMMU_CAP4 0x20 +#define CAP4_UEQ_SUPPORT (1UL << 24) +#define CAP4_UEQ_LOG2NUM GENMASK(23, 20) +#define CAP4_UEQ_LOG2SIZE GENMASK(19, 16) +#define CAP4_UCPLQ_LOG2SIZE GENMASK(15, 12) +#define CAP4_UCMDQ_LOG2SIZE GENMASK(11, 8) +#define CAP4_UCMDQ_CPLQ_LOG2NUM GENMASK(7, 0) + +#define UMMU_CAP5 0x24 +#define CAP5_BRDCAST_PLBI_BIT (1UL << 9) +#define CAP5_RANGE_PLBI_BIT (1UL << 8) +#define CAP5_TKVALCHK_MOD GENMASK(7, 6) +#define CAP5_TKVALCHK_BIT (1UL << 5) +#define CAP5_PT_GRAN4K_BIT (1UL << 4) +#define CAP5_PT_GRAN2M_BIT (1UL << 3) +#define CAP5_MAPT_MODE_MASK GENMASK(2, 1) +#define CAP5_MAPT_SUPPORT (1UL << 0) + +#define UMMU_CAP6 0x28 +#define CAP6_MTM_GP_MAX GENMASK(23, 16) +#define CAP6_MTM_ID_MAX GENMASK(15, 0) + +#define UMMU_CR0 0x30 +#define UMMU_CR0ACK 0x34 +#define CR0_MAPT_EN (1UL << 5) +#define 
CR0_VMID_WILDCARD_MASK GENMASK(4, 2) +#define CR0_EVENTQ_EN (1UL << 1) +#define CR0_UMMU_EN (1UL << 0) + +#define UMMU_CR1 0x38 +#define CR1_TECT_MODE_SEL (1UL << 15) +#define CR1_PRIVATE_TLB (1UL << 14) +#define CR1_BAD_EID_RECORD (1UL << 13) +#define CR1_E2H (1UL << 12) +#define CR1_TABLE_SH GENMASK(11, 10) +#define CR1_TABLE_OC GENMASK(9, 8) +#define CR1_TABLE_IC GENMASK(7, 6) +#define CR1_QUEUE_SH GENMASK(5, 4) +#define CR1_QUEUE_OC GENMASK(3, 2) +#define CR1_QUEUE_IC GENMASK(1, 0) + +#define UMMU_CR2 0x3C +#define CR2_PRIVATE_PLB (1UL << 6) +#define CR2_UE_QUEUE_SH GENMASK(5, 4) +#define CR2_UE_QUEUE_OC GENMASK(3, 2) +#define CR2_UE_QUEUE_IC GENMASK(1, 0) + +#define UMMU_CR3 0x40 +#define CR3_UPDATE_FLAG (1UL << 31) +#define CR3_TRANS_MTM_GP GENMASK(23, 16) +#define CR3_TRANS_MTM_ID GENMASK(15, 0) + +#define UMMU_GBPA 0x50 +#define GBPA_UPDATE_BIT (1UL << 31) +#define GBPA_ABORT_BIT (1UL << 15) + +#define UMMU_EVENT_QUE_MSI_ADDR0 0x1110 +#define UMMU_MSI_ADDR1_OFFSET 0x04 +#define UMMU_MSI_ADDR_MASK GENMASK_ULL(51, 2) +#define UMMU_EVENT_QUE_MSI_DATA 0x1118 +#define UMMU_EVENT_QUE_MSI_ATTR 0x111C + +#define UMMU_GLB_IRQ_EN 0x1130 +#define IRQ_CTRL_EVTQ_IRQEN (1UL << 1) +#define IRQ_CTRL_GERROR_IRQEN (1UL << 0) + +#define UMMU_GERROR 0x1134 +#define GERROR_MSI_GERR_ABT_ERR (1UL << 7) +#define GERROR_MSI_UIEQ_ABT_ERR (1UL << 4) +#define GERROR_MSI_EVTQ_ABT_ERR (1UL << 3) +#define GERROR_MSI_MCMDQ_ABT_ERR (1UL << 2) +#define GERROR_EVTQ_ABT_ERR (1UL << 1) +#define GERROR_MCMDQ_ERR (1UL << 0) +#define GERROR_ERR_MASK GENMASK(6, 0) +#define UMMU_GERRORN 0x1138 + +#define UMMU_GLB_ERR_INT_MSI_ADDR0 0x1140 +#define UMMU_GLB_ERR_INT_MSI_DATA 0x1148 +#define UMMU_GLB_ERR_INT_MSI_ATTR 0x114C + +/* Common memory attribute values */ +#define UMMU_SH_NSH 0 +#define UMMU_CACHE_WB 1 +#define UMMU_SH_OSH 2 +#define UMMU_SH_ISH 3 +#define UMMU_MEMATTR_DEVICE_nGnRE 0x1 +#define UMMU_MEMATTR_OIWB 0xf + +#define UMMU_REG_POLL_TIMEOUT_US 5 +#define UMMU_QUE_POLL_TIMEOUT_US 100000 +#define UMMU_POLL_SPIN_COUNT 10 + +#define PERMQ_RELEASE_TIMEOUT_US 100 +#define UMMU_CONS_POLL_TIMEOUT_US 5 + +#endif /* __UMMU_REGS_H__ */ diff --git a/drivers/iommu/hisilicon/ummu.h b/drivers/iommu/hisilicon/ummu.h new file mode 100644 index 0000000000000000000000000000000000000000..7eb8089eeb2db145b047f72878d368678df98574 --- /dev/null +++ b/drivers/iommu/hisilicon/ummu.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
+ * Description: UMMU Device's implementations + */ + +#ifndef __UMMU_H__ +#define __UMMU_H__ + +#include +#include +#include +#include +#include + +extern struct platform_driver ummu_driver; + +enum ummu_device_msi_index { + EVTQ_MSI_INDEX, + GERROR_MSI_INDEX, + UMMU_MAX_MSIS, +}; + +struct ummu_tct_desc { + u32 asid; +}; + +/* translation stage1 table config */ +struct ummu_s1_cfg { + struct ummu_tct_desc tct; +}; + +/* translation stage2 table config */ +struct ummu_s2_cfg { + u16 vmid; +}; + +enum ummu_domain_stage { + UMMU_DOMAIN_S1 = 0, + UMMU_DOMAIN_S2, +}; + +struct ummu_ll_queue { + union { + u64 val; + struct { + u32 prod; + u32 cons; + }; + struct { + atomic_t prod; + atomic_t cons; + } atomic; + u8 __pad[SMP_CACHE_BYTES]; + } ____cacheline_aligned_in_smp; + u32 log2size; +}; + +struct ummu_queue { + struct ummu_ll_queue llq; + __le64 *base; + phys_addr_t base_pa; + u64 q_base; + + size_t ent_dwords; + u32 __iomem *prod_reg; + u32 __iomem *cons_reg; +}; + +struct ummu_mcmdq { + struct ummu_queue q; + atomic_long_t *valid_map; + atomic_t owner_prod; + u32 mcmdq_prod; + rwlock_t mcmdq_lock; + atomic_t lock; + int configured; + int shared; + void __iomem *base; +}; + +struct ummu_evtq { + struct ummu_queue q; + u32 max_stalls; +}; + +struct ummu_capability { +#define UMMU_FEAT_2_LVL_TECT BIT(0) +#define UMMU_FEAT_2_LVL_TCT BIT(1) +#define UMMU_FEAT_MCMDQ BIT(2) +#define UMMU_FEAT_EVENTQ BIT(3) +#define UMMU_FEAT_SEV BIT(4) +#define UMMU_FEAT_TRANS_S1 BIT(5) +#define UMMU_FEAT_TRANS_S2 BIT(6) +#define UMMU_FEAT_RANGE_INV BIT(7) +#define UMMU_FEAT_STALLS BIT(8) +#define UMMU_FEAT_STALL_FORCE BIT(9) +#define UMMU_FEAT_MSI BIT(10) +#define UMMU_FEAT_HYP BIT(11) +#define UMMU_FEAT_HA BIT(12) +#define UMMU_FEAT_HD BIT(13) +#define UMMU_FEAT_MTM BIT(14) +#define UMMU_FEAT_TT_LE BIT(15) +#define UMMU_FEAT_TT_BE BIT(16) +#define UMMU_FEAT_COHERENCY BIT(17) +#define UMMU_FEAT_BBML1 BIT(18) +#define UMMU_FEAT_BBML2 BIT(19) +#define UMMU_FEAT_VAX BIT(20) +#define UMMU_FEAT_BTM BIT(21) +#define UMMU_FEAT_SVA BIT(22) +#define UMMU_FEAT_E2H BIT(23) +#define UMMU_FEAT_MAPT BIT(24) +#define UMMU_FEAT_RANGE_PLBI BIT(25) +#define UMMU_FEAT_TOKEN_CHK BIT(26) +#define UMMU_FEAT_PERMQ BIT(27) +#define UMMU_FEAT_NESTING BIT(28) + + u32 features; + u32 deid_bits; + u32 tid_bits; + u64 pgsize_bitmap; + u32 ias; + u32 oas; + u64 ptsize_bitmap; +#define UMMU_OPT_MSIPOLL (1UL << 0) +#define UMMU_OPT_DOUBLE_PLBI (1UL << 1) + u32 options; + +#define UMMU_MAX_ASIDS (1UL << 16) + unsigned int asid_bits; +#define UMMU_MAX_VMIDS (1UL << 16) + unsigned int vmid_bits; + + bool support_mapt; + u32 mcmdq_log2num; + u32 mcmdq_log2size; + u32 evtq_log2num; + u32 evtq_log2size; + u32 permq_num; + struct { + u32 cmdq_num; + u32 cplq_num; + } permq_ent_num; + u32 mtm_gp_max; + u32 mtm_id_max; + u16 prod_ver; +}; + +struct ummu_device { + struct device *dev; + void __iomem *base; + + struct ummu_capability cap; + + u32 nr_mcmdq; + struct ummu_mcmdq *__percpu *mcmdq; + struct ummu_evtq evtq; + + struct ummu_core_device core_dev; + const struct ummu_device_helper *helper_ops; + struct list_head list; +}; + +struct ummu_domain_cfgs { + enum ummu_domain_stage stage; + + u32 tecte_tag; + + union { + struct ummu_s1_cfg s1_cfg; + struct ummu_s2_cfg s2_cfg; + }; +}; + +struct ummu_domain { + struct ummu_base_domain base_domain; + struct ummu_domain_cfgs cfgs; +}; + +static inline +struct ummu_device *core_to_ummu_device(struct ummu_core_device *ummu_core_dev) +{ + return container_of(ummu_core_dev, struct ummu_device, core_dev); +} 
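+
+/*
+ * to_ummu_domain() below recovers the driver-private ummu_domain from a
+ * generic iommu_domain: the iommu_domain is embedded in ummu_base_domain,
+ * which is in turn embedded in ummu_domain, hence the two container_of()
+ * steps.
+ */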
+ +static inline struct ummu_domain *to_ummu_domain(struct iommu_domain *dom) +{ + struct ummu_base_domain *base_dom = + container_of(dom, struct ummu_base_domain, domain); + + return container_of(base_dom, struct ummu_domain, base_domain); +} + +int ummu_write_reg_sync(struct ummu_device *ummu, u32 val, + u32 reg_off, u32 ack_off); + +#endif /* __UMMU_H__ */ diff --git a/drivers/iommu/hisilicon/ummu_main.c b/drivers/iommu/hisilicon/ummu_main.c new file mode 100644 index 0000000000000000000000000000000000000000..3992e248dc5988d82336a87f420a6758d4e20f83 --- /dev/null +++ b/drivers/iommu/hisilicon/ummu_main.c @@ -0,0 +1,670 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: UMMU Device's implementations + */ + +#define pr_fmt(fmt) "UMMU: " fmt + +#include +#include +#include +#include + +#include "interrupt.h" +#include "queue.h" +#include "regs.h" +#include "flush.h" +#include "ummu.h" + +#define UMMU_DRV_NAME "ummu" + +int ummu_write_reg_sync(struct ummu_device *ummu, u32 val, + u32 reg_off, u32 ack_off) +{ + u32 reg; + + writel_relaxed(val, ummu->base + reg_off); + return readl_relaxed_poll_timeout(ummu->base + ack_off, reg, reg == val, + 1, UMMU_REG_POLL_TIMEOUT_US); +} + +static int ummu_update_gbpa(struct ummu_device *ummu, u32 set, + u32 clr) +{ + void __iomem *gbpa = ummu->base + UMMU_GBPA; + u32 reg; + int ret; + + ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE_BIT), 1, + UMMU_REG_POLL_TIMEOUT_US); + if (ret) + return ret; + + reg &= ~clr; + reg |= set; + writel_relaxed(reg | GBPA_UPDATE_BIT, gbpa); + ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE_BIT), 1, + UMMU_REG_POLL_TIMEOUT_US); + if (ret) + dev_err(ummu->dev, "GBPA not responding to update\n"); + return ret; +} + +static int ummu_ioremap(struct ummu_device *ummu, resource_size_t start, + resource_size_t size) +{ + struct resource res = DEFINE_RES_MEM(start, size); + + ummu->base = devm_ioremap_resource(ummu->dev, &res); + if (IS_ERR(ummu->base)) + return PTR_ERR(ummu->base); + + return 0; +} + +static int ummu_device_register(struct ummu_device *ummu) +{ + int ret; + + ret = iommu_device_sysfs_add(&ummu->core_dev.iommu, ummu->dev, NULL, + "%s", dev_name(ummu->dev)); + if (ret) + dev_err(ummu->dev, "add iommu sysfs failed, ret = %d.\n", ret); + + return ret; +} + +static void ummu_device_unregister(struct ummu_device *ummu) +{ + iommu_device_sysfs_remove(&ummu->core_dev.iommu); +} + +static int ummu_init_structures(struct ummu_device *ummu) +{ + int ret; + + ret = ummu_init_queues(ummu); + if (ret) { + dev_err(ummu->dev, "init queues failed\n"); + return ret; + } + + return 0; +} + +static void ummu_device_hw_probe_ver(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_IIDR); + + ummu->cap.prod_ver = (u16)FIELD_GET(IIDR_PROD_ID, reg); + /* + * On the hisi chip with IIDR_PROD_ID set to 0, + * ummu enables special_identify to perform some + * specialized operations. 
+ */ + if (!ummu->cap.prod_ver) { + ummu->cap.options |= UMMU_OPT_DOUBLE_PLBI; + ummu->cap.features &= ~UMMU_FEAT_STALLS; + } +} + +static void ummu_device_hw_probe_cap0(struct ummu_device *ummu) +{ + u32 reg, pasids, ubrt_pasids, cap_pasids; + + reg = readl_relaxed(ummu->base + UMMU_CAP0); + + /* 2-level tect structures */ + if (reg & CAP0_TECT_LVL_BIT) + ummu->cap.features |= UMMU_FEAT_2_LVL_TECT; + + /* 2-level tct structures */ + if (reg & CAP0_TCT_LVL_BIT) + ummu->cap.features |= UMMU_FEAT_2_LVL_TCT; + + /* TID size */ + ummu->cap.tid_bits = FIELD_GET(CAP0_TIDSIZE_MASK, reg); + /* The tid cap should follow the UB protocol */ + ubrt_pasids = ummu->core_dev.iommu.max_pasids; + cap_pasids = 1 << ummu->cap.tid_bits; + if (ubrt_pasids > cap_pasids) + pr_warn("ubrt max_pasids[%u] beyond capacity.\n", ubrt_pasids); + pasids = min(cap_pasids, (1UL << UB_MAX_TID_BITS)); + ummu->core_dev.iommu.max_pasids = min(ubrt_pasids, pasids); + /* TECTE_TAG size */ + ummu->cap.deid_bits = FIELD_GET(CAP0_DEIDSIZE_MASK, reg); +} + +static void ummu_device_hw_probe_cap1(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP1); + + /* Maximum number of outstanding stalls */ + ummu->evtq.max_stalls = FIELD_GET(CAP1_STALL_MAX, reg); + + /* Support generation of WFE wake-up events to PE */ + if (reg & CAP1_EVENT_GEN) + ummu->cap.features |= UMMU_FEAT_SEV; + + /* MCMDQ's support, numbers and depth */ + if (reg & CAP1_MCMDQ_SUPPORT) { + ummu->cap.features |= UMMU_FEAT_MCMDQ; + ummu->cap.mcmdq_log2num = FIELD_GET(CAP1_MCMDQ_LOG2NUM, reg); + ummu->cap.mcmdq_log2size = min(FIELD_GET(CAP1_MCMDQ_LOG2SIZE, reg), + MCMDQ_MAX_LOG2SIZE); + } + + /* EVENTQ's support, numbers and depth */ + if (reg & CAP1_EVENTQ_SUPPORT) { + ummu->cap.features |= UMMU_FEAT_EVENTQ; + ummu->cap.evtq_log2num = FIELD_GET(CAP1_EVENTQ_LOG2NUM, reg); + ummu->cap.evtq_log2size = min(FIELD_GET(CAP1_EVENTQ_LOG2SIZE, reg), + EVTQ_MAX_LOG2SIZE); + + } +} + +static int ummu_device_get_ttf(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP2_TTF_MASK, reg)) { + case CAP2_TTF_AARCH32_64: + ummu->cap.ias = CAP2_TTF_IAS_40; + break; + case CAP2_TTF_AARCH64: + break; + default: + dev_err(ummu->dev, "page table format not supported!\n"); + return -ENXIO; + } + return 0; +} + +static int ummu_device_get_trans_stage(struct ummu_device *ummu, u32 reg) +{ + if (!(reg & (CAP2_S1P_BIT | CAP2_S2P_BIT))) { + dev_err(ummu->dev, "no translation stage support!\n"); + return -ENXIO; + } + + if (reg & CAP2_S1P_BIT) + ummu->cap.features |= UMMU_FEAT_TRANS_S1; + + if (reg & CAP2_S2P_BIT) + ummu->cap.features |= UMMU_FEAT_TRANS_S2; + + if ((ummu->cap.features & UMMU_FEAT_TRANS_S1) && + (ummu->cap.features & UMMU_FEAT_TRANS_S2)) + ummu->cap.features |= UMMU_FEAT_NESTING; + + return 0; +} + +static void ummu_device_get_pgsize(struct ummu_device *ummu, u32 reg) +{ + /* page sizes */ + if (reg & CAP2_GRAN64K_BIT) + ummu->cap.pgsize_bitmap |= SZ_64K | SZ_512M; + if (reg & CAP2_GRAN16K_BIT) + ummu->cap.pgsize_bitmap |= SZ_16K | SZ_32M; + if (reg & CAP2_GRAN4K_BIT) + ummu->cap.pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G; +} + +static void ummu_device_get_oas(struct ummu_device *ummu, u32 reg) +{ + /* output address size */ + switch (FIELD_GET(CAP2_OAS_MASK, reg)) { + case CAP2_OAS_32_BIT: + ummu->cap.oas = CAP2_TTF_OAS_32; + break; + case CAP2_OAS_36_BIT: + ummu->cap.oas = CAP2_TTF_OAS_36; + break; + case CAP2_OAS_40_BIT: + ummu->cap.oas = CAP2_TTF_OAS_40; + break; + case CAP2_OAS_42_BIT: + ummu->cap.oas = CAP2_TTF_OAS_42; + break; + case CAP2_OAS_44_BIT: 
+ ummu->cap.oas = CAP2_TTF_OAS_44; + break; + default: + dev_warn(ummu->dev, + "unknown output address size. truncating to 48-bit\n"); + fallthrough; + case CAP2_OAS_48_BIT: + ummu->cap.oas = CAP2_TTF_OAS_48; + break; + } +} + +static int ummu_device_hw_probe_cap2(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP2); + int ret = ummu_device_get_ttf(ummu, reg); + + if (ret) + return ret; + + ret = ummu_device_get_trans_stage(ummu, reg); + if (ret) + return ret; + + /* input address size */ + if (FIELD_GET(CAP2_VA_EXT_MASK, reg) == CAP2_VA_EXT_52) + ummu->cap.features |= UMMU_FEAT_VAX; + + ummu_device_get_pgsize(ummu, reg); + + ummu_device_get_oas(ummu, reg); + + ummu->cap.ias = max(ummu->cap.ias, ummu->cap.oas); + + if (FIELD_GET(CAP2_RTLBI_BIT, reg)) + ummu->cap.features |= UMMU_FEAT_RANGE_INV; + + if (FIELD_GET(CAP2_BTLBI_BIT, reg)) + ummu->cap.features |= UMMU_FEAT_BTM; + else + dev_warn(ummu->dev, "don't support BTM!\n"); + return 0; +} + +static void ummu_device_get_stall_model(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_STALL_MODEL_MASK, reg)) { + case CAP3_STALL_MODE_FORCE: + ummu->cap.features |= UMMU_FEAT_STALL_FORCE; + fallthrough; + case CAP3_STALL_MODE: + ummu->cap.features |= UMMU_FEAT_STALLS; + default: + break; + } +} + +static void ummu_device_get_httu(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_HTTU_MASK, reg)) { + case CAP3_HTTU_ACCESS_DIRTY: + ummu->cap.features |= UMMU_FEAT_HD; + fallthrough; + case CAP3_HTTU_ACCESS: + ummu->cap.features |= UMMU_FEAT_HA; + default: + break; + } +} + +static int ummu_device_get_ttendian(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_TTENDIAN_MASK, reg)) { + case CAP3_TTENDIAN_MIXED: + ummu->cap.features |= UMMU_FEAT_TT_LE | UMMU_FEAT_TT_BE; + break; +#ifdef __BIG_ENDIAN + case CAP3_TTENDIAN_BE: + break; +#else + case CAP3_TTENDIAN_LE: + break; +#endif + default: + dev_err(ummu->dev, "unknown/unsupported TT endianness!\n"); + return -ENXIO; + } + return 0; +} + +static void ummu_device_get_bbm_level(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_BBML_MASK, reg)) { + case CAP3_BBML0: + break; + case CAP3_BBML1: + ummu->cap.features |= UMMU_FEAT_BBML1; + break; + case CAP3_BBML2: + ummu->cap.features |= UMMU_FEAT_BBML2; + break; + default: + dev_warn(ummu->dev, "unknown/unsupported BBM behavior level\n"); + } +} + +static int ummu_device_hw_probe_cap3(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP3); + int ret; + + ummu_device_get_stall_model(ummu, reg); + + if (reg & CAP3_MSI_SUPPORT_BIT) + ummu->cap.features |= UMMU_FEAT_MSI; + + if (reg & CAP3_HYP_S1CTX_BIT) { + ummu->cap.features |= UMMU_FEAT_HYP; + if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) { + ummu->cap.features |= UMMU_FEAT_E2H; + pr_debug("support hypervisor and E2H\n"); + } + } + + ummu_device_get_httu(ummu, reg); + + if (reg & CAP3_MTM_BIT) + ummu->cap.features |= UMMU_FEAT_MTM; + + ret = ummu_device_get_ttendian(ummu, reg); + if (ret) + return ret; + + if (reg & CAP3_COHACC_BIT) { + ummu->cap.features |= UMMU_FEAT_COHERENCY; + if (ummu->cap.features & UMMU_FEAT_MSI) + ummu->cap.options |= UMMU_OPT_MSIPOLL; + } + + ummu_device_get_bbm_level(ummu, reg); + + return 0; +} + +static int ummu_device_hw_probe_cap4(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP4); + int hw_permq_ent; + + hw_permq_ent = 1 << FIELD_GET(CAP4_UCMDQ_LOG2SIZE, reg); + ummu->cap.permq_ent_num.cmdq_num = hw_permq_ent; + + hw_permq_ent = 1 << 
FIELD_GET(CAP4_UCPLQ_LOG2SIZE, reg); + ummu->cap.permq_ent_num.cplq_num = hw_permq_ent; + + return 0; +} + +static void ummu_device_hw_probe_cap5(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP5); + + if (reg & CAP5_RANGE_PLBI_BIT) + ummu->cap.features |= UMMU_FEAT_RANGE_PLBI; + + if (reg & CAP5_MAPT_SUPPORT) + ummu->cap.support_mapt = true; + + if (reg & CAP5_PT_GRAN4K_BIT) + ummu->cap.ptsize_bitmap |= SZ_4K; + + if (reg & CAP5_PT_GRAN2M_BIT) + ummu->cap.ptsize_bitmap |= SZ_2M; + + if (reg & CAP5_TKVALCHK_BIT) + ummu->cap.features |= UMMU_FEAT_TOKEN_CHK; + + /* + * the ASID and VMID capabilities are determined based on + * the bit widths of the ASID and VMID in the configuration table. + */ + ummu->cap.asid_bits = ilog2(UMMU_MAX_ASIDS); + ummu->cap.vmid_bits = ilog2(UMMU_MAX_VMIDS); + + dev_info(ummu->dev, "ias = %u-bit, oas = %u-bit, features = 0x%08x.\n", + ummu->cap.ias, ummu->cap.oas, ummu->cap.features); +} + +static void ummu_device_hw_probe_cap6(struct ummu_device *ummu) +{ + u32 reg; + + if (ummu->cap.features & UMMU_FEAT_MTM) { + reg = readl_relaxed(ummu->base + UMMU_CAP6); + ummu->cap.mtm_id_max = FIELD_GET(CAP6_MTM_ID_MAX, reg); + ummu->cap.mtm_gp_max = FIELD_GET(CAP6_MTM_GP_MAX, reg); + } + dev_dbg(ummu->dev, "partid_max %u, pmg_max %u.\n", ummu->cap.mtm_id_max, + ummu->cap.mtm_gp_max); +} + +static int ummu_device_hw_init(struct ummu_device *ummu) +{ + int ret; + + ummu_device_hw_probe_cap0(ummu); + ummu_device_hw_probe_cap1(ummu); + + ret = ummu_device_hw_probe_cap2(ummu); + if (ret) + return ret; + + ret = ummu_device_hw_probe_cap3(ummu); + if (ret) + return ret; + + ret = ummu_device_hw_probe_cap4(ummu); + if (ret) + return ret; + + ummu_device_hw_probe_cap5(ummu); + ummu_device_hw_probe_cap6(ummu); + ummu_device_hw_probe_ver(ummu); + + return 0; +} + +static void ummu_device_sync(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CR0); + + if (reg & CR0_UMMU_EN) { + dev_warn(ummu->dev, "ummu currently enabled! 
Resetting...\n"); + ummu_update_gbpa(ummu, GBPA_ABORT_BIT, 0); + } +} + +static int ummu_device_disable(struct ummu_device *ummu) +{ + int ret; + + ret = ummu_write_reg_sync(ummu, 0, UMMU_CR0, UMMU_CR0ACK); + if (ret) + dev_err(ummu->dev, "disable ummu interface failed, ret = %d.\n", ret); + + return ret; +} + +static int ummu_device_enable(struct ummu_device *ummu) +{ + int ret; + u32 cr0; + + cr0 = readl_relaxed(ummu->base + UMMU_CR0); + cr0 |= CR0_UMMU_EN; + ret = ummu_write_reg_sync(ummu, cr0, UMMU_CR0, UMMU_CR0ACK); + if (ret) + dev_err(ummu->dev, "enable ummu interface failed.\n"); + + return ret; +} + +static void ummu_device_set_mem_attr(struct ummu_device *ummu) +{ + u32 reg; + + reg = CR1_TECT_MODE_SEL | CR1_E2H | + FIELD_PREP(CR1_TABLE_SH, UMMU_SH_ISH) | + FIELD_PREP(CR1_TABLE_OC, UMMU_CACHE_WB) | + FIELD_PREP(CR1_TABLE_IC, UMMU_CACHE_WB) | + FIELD_PREP(CR1_QUEUE_SH, UMMU_SH_ISH) | + FIELD_PREP(CR1_QUEUE_OC, UMMU_CACHE_WB) | + FIELD_PREP(CR1_QUEUE_IC, UMMU_CACHE_WB); + + writel_relaxed(reg, ummu->base + UMMU_CR1); +} + +static int ummu_device_reset(struct ummu_device *ummu) +{ + int ret; + + ummu_device_sync(ummu); + + ret = ummu_device_disable(ummu); + if (ret) + return ret; + + /* set configuration table and queue memory attributes */ + ummu_device_set_mem_attr(ummu); + + ret = ummu_device_mcmdq_init_cfg(ummu); + if (ret) + return ret; + + ret = ummu_write_evtq_regs(ummu); + if (ret) + return ret; + + ummu_setup_irqs(ummu); + ummu_sync_tect_all(ummu); + ummu_init_flush_iotlb(ummu); + + return ummu_device_enable(ummu); +} + +static int ummu_device_ubrt_probe(struct ummu_device *ummu) +{ + struct fwnode_handle *fwnode = dev_fwnode(ummu->dev); + struct ubrt_fwnode *fw; + struct ummu_node *node; + + if (!fwnode) + return -EINVAL; + + fw = ubrt_fwnode_get(fwnode); + if (!fw) { + dev_err(ummu->dev, "get ubrt fwnode failed!\n"); + return -ENXIO; + } + + if (fw->type != UBRT_UMMU) { + dev_err(ummu->dev, "get invalid ubct type!\n"); + return -ESPIPE; + } + + node = (struct ummu_node *)fw->ubrt_node; + + ummu->core_dev.iommu.min_pasids = node->min_tid; + ummu->core_dev.iommu.max_pasids = node->max_tid; + + return 0; +} + +static int ummu_device_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct ummu_device *ummu; + struct resource *res; + int ret; + + ummu = devm_kzalloc(dev, sizeof(*ummu), GFP_KERNEL); + if (!ummu) + return -ENOMEM; + + ummu->dev = dev; + + ret = ummu_device_ubrt_probe(ummu); + if (ret) { + dev_err(dev, "failed to probe ummu_node: %d\n", ret); + return ret; + } + + /* Base address */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "IO resource is null\n"); + return -EINVAL; + } + + /* + * Don't map the IMPLEMENTATION DEFINED regions, since they may contain + * the root registers which are reserved by the bios. 
+ */ + ret = ummu_ioremap(ummu, res->start, UMMU_REG_SZ); + if (ret) + return ret; + + /* hardware init */ + ret = ummu_device_hw_init(ummu); + if (ret) + return ret; + + /* Initialise in-memory data structures */ + ret = ummu_init_structures(ummu); + if (ret) + return ret; + + /* record ummu device */ + platform_set_drvdata(pdev, ummu); + + ret = ummu_device_reset(ummu); + if (ret) + return ret; + + ret = ummu_device_register(ummu); + if (ret) + dev_err(dev, "probe ummu device failed, ret = %d.\n", ret); + + return ret; +} + +static int ummu_device_remove(struct platform_device *pdev) +{ + struct ummu_device *ummu = platform_get_drvdata(pdev); + + ummu_device_disable(ummu); + ummu_device_unregister(ummu); + + dev_dbg(&pdev->dev, "Remove ummu successful!\n"); + return 0; +} + +static void ummu_device_shutdown(struct platform_device *pdev) +{ + struct ummu_device *ummu = platform_get_drvdata(pdev); + + ummu_device_disable(ummu); +} + +static const struct of_device_id hisi_ummu_of_match[] = { + { .compatible = "ub,ummu", }, + { } +}; +MODULE_DEVICE_TABLE(of, hisi_ummu_of_match); + +static const struct acpi_device_id hisi_ummu_acpi_match[] = { + { "HISI0551", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_ummu_acpi_match); + +struct platform_driver ummu_driver = { + .driver = { + .name = UMMU_DRV_NAME, + .suppress_bind_attrs = true, + .of_match_table = hisi_ummu_of_match, + .acpi_match_table = hisi_ummu_acpi_match, + }, + .probe = ummu_device_probe, + .remove = ummu_device_remove, + .shutdown = ummu_device_shutdown, +}; + +module_driver(ummu_driver, platform_driver_register, platform_driver_unregister); + +MODULE_IMPORT_NS(UMMU_CORE_DRIVER); +MODULE_DESCRIPTION("Hisilicon ummu driver"); +MODULE_AUTHOR("HiSilicon Tech. Co., Ltd."); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:" UMMU_DRV_NAME);
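
For reference, a minimal sketch of how a caller inside this driver might use the MCMDQ batch helpers introduced above (ummu_mcmdq_batch_add()/ummu_mcmdq_batch_submit()) to push a run of per-page CMD_TLBI_OS_VA invalidations followed by a CMD_SYNC. The wrapper function name and the specific command fields chosen here are illustrative assumptions, not part of the patch:

/*
 * Hypothetical illustration only: batch CMD_TLBI_OS_VA commands covering
 * [iova, iova + size) and submit them with a trailing CMD_SYNC.
 */
static void ummu_example_flush_range(struct ummu_device *ummu, u32 tid,
				     unsigned long iova, size_t size,
				     size_t granule)
{
	struct ummu_mcmdq_batch cmds = {};
	struct ummu_mcmdq_ent cmd = {
		.opcode = CMD_TLBI_OS_VA,
		.tlbi = {
			.tid = tid,	/* assumed: DMA-scene flush keyed by TID */
			.leaf = true,
		},
	};
	unsigned long end = iova + size;

	while (iova < end) {
		cmd.tlbi.addr = iova;
		/* Issues the batch automatically once MCMDQ_BATCH_ENTRIES are queued. */
		ummu_mcmdq_batch_add(ummu, &cmds, &cmd);
		iova += granule;
	}

	/* Issue whatever remains and wait for the CMD_SYNC to complete. */
	ummu_mcmdq_batch_submit(ummu, &cmds);
}

The batch helpers keep the hot path cheap: commands are only copied into the local buffer until it fills or the caller submits, so the lock-free MCMDQ insertion in ummu_mcmdq_issue_cmdlist() is entered at most once per MCMDQ_BATCH_ENTRIES commands.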