diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index a66e4630d25aaf0b42df9b4d6158a7294f4407dc..7b78c14a6e5e657b70c47ce0fdbab3f2d10ea6ad 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -8345,4 +8345,5 @@ CONFIG_UB_UBASE=m CONFIG_UB_UMMU_CORE=y CONFIG_UB_UMMU_CORE_DRIVER=m CONFIG_UB_UMMU_PMU=m +CONFIG_UB_UMMU=m # end of UMMU diff --git a/drivers/iommu/hisilicon/Kconfig b/drivers/iommu/hisilicon/Kconfig index 76acd888b65e11f347cfcbf46012cba72568f683..60308ece229fe8e4f57a4d92298f4c6183bbb659 100644 --- a/drivers/iommu/hisilicon/Kconfig +++ b/drivers/iommu/hisilicon/Kconfig @@ -4,3 +4,20 @@ # source "drivers/iommu/hisilicon/ummu-core/Kconfig" + +config UB_UMMU + tristate "Hisilicon UB MMU Support" + depends on ARM64 && ARCH_HISI + depends on UB_UBUS && UB_UBFI && UB_UBRT_PLAT_DEV + default n + select IOMMU_API + select IOMMU_IO_PGTABLE_LPAE + select GENERIC_MSI_IRQ + select IOMMUFD_DRIVER if IOMMUFD + select UMMU_CORE + help + Support for implementations of the hisilicon UMMU architecture. + UMMU provides address translation for device access to the + local host. + Say Y here if your Soc includes an UMMU device implementing + the Hisilicon UMMU architecture. diff --git a/drivers/iommu/hisilicon/Makefile b/drivers/iommu/hisilicon/Makefile index e32879971f3fd702c4e0f85a6c8ce59fc69f3171..5064e5df09695c4da3b05f5761b4eb6afb63850e 100644 --- a/drivers/iommu/hisilicon/Makefile +++ b/drivers/iommu/hisilicon/Makefile @@ -1,3 +1,8 @@ # SPDX-License-Identifier: GPL-2.0+ obj-y += ummu-core/ +obj-$(CONFIG_UB_UMMU) += ummu.o +ummu-y := ummu_main.o \ + queue.o \ + interrupt.o \ + flush.o diff --git a/drivers/iommu/hisilicon/flush.c b/drivers/iommu/hisilicon/flush.c new file mode 100644 index 0000000000000000000000000000000000000000..464c93c8c2a3db874bc91e5a8af9a31dab6b1502 --- /dev/null +++ b/drivers/iommu/hisilicon/flush.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
+ */ + +#include +#include +#include +#include + +#include "ummu.h" +#include "queue.h" +#include "flush.h" + +enum ummu_tlbi_scene { + UMMU_TLBI_SCENE_DMA = 0, + UMMU_TLBI_SCENE_SVA, + UMMU_TLBI_SCENE_NUM, +}; + +enum ummu_tlbi_scope { + UMMU_TLBI_SCOPE_CTX = 0, + UMMU_TLBI_SCOPE_RNG, + UMMU_TLBI_SCOPE_NUM, +}; + +enum ummu_tlbi_type { + UMMU_TLBI_TYPE_S1E2H = 0, + UMMU_TLBI_TYPE_S1NH, + UMMU_TLBI_TYPE_S2, + UMMU_TLBI_TYPE_NUM, +}; + +const static +u8 ummu_tlbi_code_table[UMMU_TLBI_SCENE_NUM][UMMU_TLBI_SCOPE_NUM][UMMU_TLBI_TYPE_NUM] = { + [UMMU_TLBI_SCENE_DMA] = { + [UMMU_TLBI_SCOPE_CTX] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_TID, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_TID, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_S1S2_VMALL, + }, + [UMMU_TLBI_SCOPE_RNG] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_VA, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_TID, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_OS_VA, + }, + }, + [UMMU_TLBI_SCENE_SVA] = { + [UMMU_TLBI_SCOPE_CTX] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_ASID_U, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_ASID_U, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_OS_ASID_U, + }, + [UMMU_TLBI_SCOPE_RNG] = { + [UMMU_TLBI_TYPE_S1E2H] = CMD_TLBI_HYP_VA_U, + [UMMU_TLBI_TYPE_S1NH] = CMD_TLBI_OS_ASID_U, + [UMMU_TLBI_TYPE_S2] = CMD_TLBI_OS_VA_U, + }, + }, +}; + +static int ummu_domain_tlbi_cmd(struct ummu_domain *domain, + enum ummu_tlbi_scope scope, + enum ummu_tlbi_scene scene, + struct ummu_mcmdq_ent *cmd) +{ + struct ummu_s1_cfg *s1cfg = &domain->cfgs.s1_cfg; + struct ummu_s2_cfg *s2cfg = &domain->cfgs.s2_cfg; + struct ummu_device *ummu_dev; + enum ummu_tlbi_type type; + bool e2h; + + switch (domain->cfgs.stage) { + case UMMU_DOMAIN_S1: + if (scene == UMMU_TLBI_SCENE_DMA) { + if (domain->base_domain.tid == UMMU_INVALID_TID) + return -EINVAL; + + cmd->tlbi.tid = domain->base_domain.tid; + } else { + cmd->tlbi.asid = s1cfg->tct.asid; + } + + ummu_dev = core_to_ummu_device(domain->base_domain.core_dev); + e2h = !!(ummu_dev->cap.features & UMMU_FEAT_E2H); + type = e2h ? 
UMMU_TLBI_TYPE_S1E2H : UMMU_TLBI_TYPE_S1NH; + break; + case UMMU_DOMAIN_S2: + if (scene == UMMU_TLBI_SCENE_DMA) + cmd->tlbi.tect_tag = domain->cfgs.tecte_tag; + else + cmd->tlbi.vmid = s2cfg->vmid; + + type = UMMU_TLBI_TYPE_S2; + break; + default: + WARN(1, "get unexpected domain stage: %d", + (int)domain->cfgs.stage); + return -EINVAL; + } + + cmd->opcode = ummu_tlbi_code_table[scene][scope][type]; + return 0; +} + +static void ummu_range_tlbi_nofeat(struct ummu_device *ummu, + struct ummu_mcmdq_ent *cmd, + struct ummu_tlb_range *range) +{ + unsigned long rg_start = range->iova, rg_end = range->iova + range->size; + struct ummu_mcmdq_batch batch_cmds = {}; + + while (rg_start < rg_end) { + cmd->tlbi.addr = rg_start; + ummu_mcmdq_batch_add(ummu, &batch_cmds, cmd); + rg_start += range->granule; + } + ummu_mcmdq_batch_submit(ummu, &batch_cmds); +} + +/* `granule` is inv granule and `translation_granule` is the granule of page table */ +#define granule_to_lvl(granule, translation_granule) \ + (4 - (ilog2(granule) - 3) / ((translation_granule)-3)) +/* this function highly rely on pagetable format, follow arm implementation now */ +static void __ummu_tlbi_range(struct ummu_mcmdq_ent *cmd, + struct ummu_tlb_range *range, + struct ummu_domain *domain) +{ + struct ummu_device *ummu = core_to_ummu_device(domain->base_domain.core_dev); + unsigned long num_pages, gs, rg_start, rg_end, scale, num; + struct ummu_mcmdq_batch batch_cmds = {}; + size_t ranged; + + if (range->iova == ULONG_MAX || range->size == 0) + return; + + if (!(ummu->cap.features & UMMU_FEAT_RANGE_INV)) { + ummu_range_tlbi_nofeat(ummu, cmd, range); + return; + } + + rg_start = range->iova; + rg_end = rg_start + range->size; + /* tg will be 12, 14, 16, indicating 4K, 16K, 64K pgtable */ + gs = __ffs(domain->base_domain.domain.pgsize_bitmap); + num_pages = range->size >> gs; + + /* transfer 12,14,16 to 1,2,3, refer to the protocol */ + cmd->tlbi.gs = (gs - 10) >> 1; + cmd->tlbi.tl = granule_to_lvl(range->granule, gs); + + while (rg_start < rg_end) { + cmd->tlbi.addr = rg_start; + + scale = __ffs(num_pages); + cmd->tlbi.scale = scale; + + num = (num_pages >> scale) & CMD_TLBI_RANGE_NUM_MAX; + cmd->tlbi.num = num - 1; + + ummu_mcmdq_batch_add(ummu, &batch_cmds, cmd); + + ranged = num << (scale + gs); + num_pages -= num << scale; + rg_start += ranged; + } + ummu_mcmdq_batch_submit(ummu, &batch_cmds); +} + +static void ummu_tlbi_range(struct ummu_tlb_range *range, bool leaf, + struct ummu_domain *domain) +{ + struct ummu_mcmdq_ent cmd = {0}; + int err; + + err = ummu_domain_tlbi_cmd(domain, UMMU_TLBI_SCOPE_RNG, UMMU_TLBI_SCENE_DMA, &cmd); + if (err) + return; + + cmd.tlbi.leaf = leaf; + __ummu_tlbi_range(&cmd, range, domain); +} + +/* for io_pgtable */ +void ummu_tlbi_context(void *cookie) +{ + struct ummu_domain *domain = (struct ummu_domain *)cookie; + struct ummu_device *ummu = core_to_ummu_device( + domain->base_domain.core_dev); + struct ummu_mcmdq_ent cmd = {0}; + int err; + + err = ummu_domain_tlbi_cmd(domain, UMMU_TLBI_SCOPE_CTX, UMMU_TLBI_SCENE_DMA, &cmd); + if (err) + return; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); +} + +void ummu_tlbi_walk(unsigned long iova, size_t size, size_t granule, + void *cookie) +{ + struct ummu_domain *domain = (struct ummu_domain *)cookie; + struct ummu_tlb_range range = { + .iova = iova, + .size = size, + .granule = granule, + }; + + ummu_tlbi_range(&range, false, domain); +} + +void ummu_tlbi_page(struct iommu_iotlb_gather *gather, unsigned long iova, + size_t granule, void *cookie) +{ + 
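The (scale, num) chunking performed by __ummu_tlbi_range() above is easier to see in isolation. Below is a minimal userspace sketch of that loop, not part of the patch: RANGE_NUM_MAX, the helper name and the sample range are assumptions for illustration, and the real driver programs num - 1 into the 5-bit CMD_TLBI_0_NUM field.

```c
#include <stdint.h>
#include <stdio.h>

#define RANGE_NUM_MAX 31UL	/* assumed stand-in for CMD_TLBI_RANGE_NUM_MAX */

/* Emulate the chunking loop: each emitted command covers num blocks of
 * 2^scale pages, i.e. num << (scale + gs) bytes. gs is the page shift
 * (12/14/16); size is assumed page-aligned for the demo. */
static void emulate_range_tlbi(uint64_t iova, uint64_t size, unsigned int gs)
{
	uint64_t start = iova, end = iova + size;
	uint64_t num_pages = size >> gs;

	if (!num_pages)
		return;

	while (start < end) {
		unsigned int scale = __builtin_ctzll(num_pages);	/* __ffs() */
		uint64_t num = (num_pages >> scale) & RANGE_NUM_MAX;
		uint64_t bytes = num << (scale + gs);

		printf("TLBI addr=0x%llx scale=%u num=%llu (%llu bytes)\n",
		       (unsigned long long)start, scale,
		       (unsigned long long)num, (unsigned long long)bytes);

		num_pages -= num << scale;
		start += bytes;
	}
}

int main(void)
{
	emulate_range_tlbi(0x41000, 0x26000, 12);	/* 38 x 4K pages */
	return 0;
}
```

For the sample 0x26000-byte, 4K-mapped range this emits a single command with scale = 1 and num = 19, since 19 << (1 + 12) covers the whole range.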
struct ummu_domain *domain = (struct ummu_domain *)cookie; + + iommu_iotlb_gather_add_page(&domain->base_domain.domain, gather, iova, granule); +} + +void ummu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct ummu_domain *u_domain = to_ummu_domain(domain); + struct ummu_tlb_range range = { + .iova = gather->start, + .size = gather->end - gather->start + 1, + .granule = gather->pgsize, + }; + + ummu_tlbi_range(&range, true, u_domain); +} + +void ummu_non_agent_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct ummu_domain *u_domain = to_ummu_domain(domain); + struct ummu_tlb_range range = { + .iova = gather->start, + .size = gather->end - gather->start + 1, + .granule = gather->pgsize, + }; + + ummu_tlbi_range(&range, false, u_domain); +} + +void ummu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct ummu_domain *u_domain = to_ummu_domain(domain); + + ummu_tlbi_context(u_domain); +} + +void ummu_init_flush_iotlb(struct ummu_device *ummu) +{ + struct ummu_mcmdq_ent cmd; + + if (ummu->cap.features & UMMU_FEAT_HYP) { + cmd.opcode = CMD_TLBI_HYP_ALL; + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); + } + + cmd.opcode = CMD_TLBI_NS_OS_ALL; + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); +} + +void ummu_device_prefetch_cfg(struct ummu_device *ummu, u32 tecte_tag, + u32 tid) +{ + struct ummu_mcmdq_ent cmd_prefet = { + .opcode = CMD_PREFET_CFG, + .prefet = { + .tkv = (tid == UMMU_INVALID_TID) ? false : true, + .tid = tid, + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_prefet); +} + +void ummu_sync_tect_range(struct ummu_device *ummu, u32 tecte_tag, + u8 range) +{ + struct ummu_mcmdq_ent cmd_cfgi_tect_range = { + .opcode = CMD_CFGI_TECT_RANGE, + .cfgi = { + .range = range, + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tect_range); +} + +void ummu_sync_tect_all(struct ummu_device *ummu) +{ + ummu_sync_tect_range(ummu, 0, CMD_TLBI_RANGE_NUM_MAX); +} + +void ummu_device_sync_tect(struct ummu_device *ummu, u32 tecte_tag) +{ + struct ummu_mcmdq_ent cmd_cfgi_tect = { + .opcode = CMD_CFGI_TECT, + .cfgi = { + .leaf = true, + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tect); +} + +void ummu_sync_tct(struct ummu_device *ummu, u32 tecte_tag, u32 tid, + bool leaf) +{ + struct ummu_mcmdq_ent cmd_cfgi_tct = { + .opcode = CMD_CFGI_TCT, + .cfgi = { + .leaf = leaf, + .tid = tid, + .deid_0 = tecte_tag, + }, + }; + struct ummu_mcmdq_ent cmd_plbi_all = { + .opcode = CMD_PLBI_OS_EIDTID, + .plbi = { + .tid = tid, + .tecte_tag = tecte_tag, + }, + }; + + if (!ummu->cap.prod_ver) + ummu_mcmdq_issue_cmd(ummu, &cmd_plbi_all); + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tct); +} + +void ummu_sync_tct_all(struct ummu_device *ummu, u32 tecte_tag) +{ + struct ummu_mcmdq_ent cmd_cfgi_tct_all = { + .opcode = CMD_CFGI_TCT_ALL, + .cfgi = { + .deid_0 = tecte_tag, + }, + }; + + ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_cfgi_tct_all); +} + +static u8 get_minist_log2size_range(size_t size) +{ + u8 index = 0; + + if (size > 0) + size -= 1; + + while (size > 0) { + size >>= 1; + index++; + } + + return index; +} + +int ummu_device_flush_plb(struct ummu_device *ummu, u32 tag, u32 tid, + u64 addr, size_t size) +{ + u32 plbi_num = (ummu->cap.options & UMMU_OPT_DOUBLE_PLBI) ? 
2 : 1; + struct ummu_mcmdq_ent cmd = { + .opcode = CMD_PLBI_OS_VA, + .plbi = { + .tid = tid, + .tecte_tag = (u16)tag, + .range = get_minist_log2size_range(size), + .addr = addr, + }, + }; + u32 idx; + int ret; + + for (idx = 0; idx < plbi_num; idx++) { + ret = ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); + if (ret) + dev_err(ummu->dev, + "issue plbi va cmd failed, idx = %u, ret = %d\n", idx, ret); + } + + return ret; +} + +void ummu_device_flush_plb_all(struct iommu_domain *domain) +{ + struct ummu_base_domain *base_domain = to_ummu_base_domain(domain); + struct ummu_device *ummu = core_to_ummu_device(base_domain->core_dev); + u32 plbi_num = (ummu->cap.options & UMMU_OPT_DOUBLE_PLBI) ? 2 : 1; + struct ummu_domain *u_domain = to_ummu_domain(domain); + struct ummu_mcmdq_ent cmd = { + .opcode = CMD_PLBI_OS_EIDTID, + .plbi = { + .tid = base_domain->tid, + .tecte_tag = u_domain->cfgs.tecte_tag, + }, + }; + u32 idx; + int ret; + + for (idx = 0; idx < plbi_num; idx++) { + ret = ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd); + if (ret) + dev_err(ummu->dev, + "issue plbi tid cmd failed, idx = %u, ret = %d\n", idx, ret); + } +} + +int ummu_device_check_pa_continuity(struct ummu_device *ummu, u64 addr, + u32 size_order, u32 id) +{ + struct ummu_mcmdq_ent cmd_ent = { + .opcode = CMD_NULL_OP, + .null_op = { + .sub_op = SUB_CMD_NULL_CHECK_PA_CONTINUITY, + .check_pa_conti = { + .result = 0, + .flag = 0, + .size_order = size_order, + .id = id, + .addr = addr, + }, + }, + }; + + return ummu_mcmdq_issue_cmd_with_sync(ummu, &cmd_ent); +} diff --git a/drivers/iommu/hisilicon/flush.h b/drivers/iommu/hisilicon/flush.h new file mode 100644 index 0000000000000000000000000000000000000000..6799ec5595c953dab789bd29d51b153169fedcc6 --- /dev/null +++ b/drivers/iommu/hisilicon/flush.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
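get_minist_log2size_range() above reduces to ceil(log2(size)), i.e. the smallest order such that (1 << order) >= size, which ummu_device_flush_plb() then places in the CMD_PLBI_OS_VA range field. A standalone check of that equivalence, with a locally renamed copy of the helper (not part of the patch):

```c
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Local copy of get_minist_log2size_range() for the demo. */
static unsigned char log2size_range(size_t size)
{
	unsigned char index = 0;

	if (size > 0)
		size -= 1;
	while (size > 0) {
		size >>= 1;
		index++;
	}
	return index;
}

int main(void)
{
	assert(log2size_range(1) == 0);		/* 1 byte fits in 2^0   */
	assert(log2size_range(4096) == 12);	/* exactly one 4K page  */
	assert(log2size_range(4097) == 13);	/* rounds up            */
	assert(log2size_range(0x26000) == 18);
	printf("range order behaves as ceil(log2(size))\n");
	return 0;
}
```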
+ */ + +#ifndef __UMMU_FLUSH_H__ +#define __UMMU_FLUSH_H__ + +#include "ummu.h" + +struct ummu_tlb_range { + unsigned long iova; + size_t size; + size_t granule; +}; + +/* for ummu device reset */ +void ummu_init_flush_iotlb(struct ummu_device *ummu); + +/* for default_domain_ops */ +void ummu_flush_iotlb_all(struct iommu_domain *iommu_domain); +void ummu_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather); +void ummu_non_agent_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather); + +/* for io_pgtable */ +void ummu_tlbi_context(void *cookie); +void ummu_tlbi_walk(unsigned long iova, size_t size, size_t granule, + void *cookie); +void ummu_tlbi_page(struct iommu_iotlb_gather *gather, unsigned long iova, + size_t granule, void *cookie); + +void ummu_device_prefetch_cfg(struct ummu_device *ummu, u32 tecte_tag, + u32 tid); +void ummu_sync_tect_range(struct ummu_device *ummu, u32 tecte_tag, + u8 range); +void ummu_sync_tect_all(struct ummu_device *ummu); +void ummu_device_sync_tect(struct ummu_device *ummu, u32 tecte_tag); +void ummu_sync_tct(struct ummu_device *ummu, u32 tecte_tag, u32 tid, + bool leaf); +void ummu_sync_tct_all(struct ummu_device *ummu, u32 tecte_tag); +int ummu_device_flush_plb(struct ummu_device *ummu, u32 tag, u32 tid, + u64 addr, size_t size); +void ummu_device_flush_plb_all(struct iommu_domain *iommu_domain); +int ummu_device_check_pa_continuity(struct ummu_device *ummu, u64 addr, + u32 size_order, u32 id); +#endif /*__UMMU_FLUSH_H__*/ diff --git a/drivers/iommu/hisilicon/interrupt.c b/drivers/iommu/hisilicon/interrupt.c new file mode 100644 index 0000000000000000000000000000000000000000..7c2d8d3d942999560580e76a15c6a016b7e8cb4d --- /dev/null +++ b/drivers/iommu/hisilicon/interrupt.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: UMMU Interrupt Management + */ + +#define pr_fmt(fmt) "UMMU: " fmt +#include + +#include "ummu.h" +#include "queue.h" +#include "regs.h" +#include "interrupt.h" + +#define EVT_LOG_LIMIT_TIMEOUT 5000 + +enum ummu_evtq_evt_name { + EVT_UNKNOWN = 0x0, + /* unsupport translation type */ + EVT_UT, + /* dstEid overflow */ + EVT_BAD_DSTEID, + /* abort when visit tect, or addr overflow */ + EVT_TECT_FETCH, + /* TECT not valid, (V=0) */ + EVT_BAD_TECT, + /* tect ent lack tokenid */ + EVT_RESERVE_0 = 0x5, + /* reserved, no content */ + EVT_BAD_TOKENID, + /* + * 1. TECT.TCT_MAXNUM = 0, tokenid disable, + * 2. TECT.ST_MODE[0] = 0, stage 1 translation close. + * 3. tokenid > TECT.TCT_MAXNUM + * 4. 
lvl1 tct invalid in two-level tct + */ + EVT_TCT_FETCH, + /* invalid tct */ + EVT_BAD_TCT, + /* error when Address Table walk */ + EVT_A_PTW_EABT, + /* + * translation input bigger than max valid value, + * or no valid translation table descriptor + */ + EVT_A_TRANSLATION = 0xa, + /* address translation out put bigger than max valid value */ + EVT_A_ADDR_SIZE, + /* Access flag fault because of AF=0 */ + EVT_ACCESS, + /* address translation permission error */ + EVT_A_PERMISSION, + /* TLB or PLB conflicted in translation */ + EVT_TBU_CONFLICT, + /* config cache conflicted in translation */ + EVT_CFG_CONFLICT = 0xf, + /* error occurred when getting VMS */ + EVT_VMS_FETCH, + /* error when Permission Table walk */ + EVT_P_PTW_EABT, + /* abnormal software configuration in PTW */ + EVT_P_CFG_ERROR, + /* permission exception in PTW process */ + EVT_P_PERMISSION, + /* E-Bit verification failed */ + EVT_RESERVE_1 = 0x14, + /* reserved, no content */ + EVT_EBIT_DENY, + /* + * the UMMU hardware reports the execution result + * of the CMD_CREAT_DSTEID_TECT_RELATION command + * to the software. + */ + EVT_CREATE_DSTEID_TECT_RELATION_RESULT = 0x60, + /* + * the UMMU hardware reports the execution result + * of the CMD_DELETE_DSTEID_TECT_RELATION command + * to the software. + */ + EVT_DELETE_DSTEID_TECT_RELATION_RESULT, +}; + +static phys_addr_t ummu_msi_cfg[UMMU_MAX_MSIS][3] = { + [EVTQ_MSI_INDEX] = { + UMMU_EVENT_QUE_MSI_ADDR0, + UMMU_EVENT_QUE_MSI_DATA, + UMMU_EVENT_QUE_MSI_ATTR, + }, + [GERROR_MSI_INDEX] = { + UMMU_GLB_ERR_INT_MSI_ADDR0, + UMMU_GLB_ERR_INT_MSI_DATA, + UMMU_GLB_ERR_INT_MSI_ATTR, + }, +}; + +/* implementation is based on the ARM SMMU arm_smmu_cmdq_skip_err */ +static void ummu_device_mcmdq_skip_err(struct ummu_device *ummu, + struct ummu_queue *q) +{ + static const char * const cerror_str[] = { + [MCMDQ_CERROR_NONE_IDX] = "No error", + [MCMDQ_CERROR_ILL_IDX] = "Illegal command", + [MCMDQ_CERROR_ABT_IDX] = "Abort on command fetch", + }; + + u32 cons = readl_relaxed(q->cons_reg); + u32 rsn_idx = FIELD_GET(MCMDQ_CONS_ERR_REASON, cons); + struct ummu_mcmdq_ent cmd_sync = { + .opcode = CMD_SYNC, + }; + u64 cmd[MCMDQ_ENT_DWORDS]; + size_t i; + + dev_err_ratelimited(ummu->dev, "MCMDQ error (cons 0x%08x): %s\n", cons, + rsn_idx < ARRAY_SIZE(cerror_str) ? cerror_str[rsn_idx] : + "Unknown"); + + switch (rsn_idx) { + case MCMDQ_CERROR_ABT_IDX: + dev_err_ratelimited(ummu->dev, "retrying command fetch\n"); + return; + case MCMDQ_CERROR_NONE_IDX: + return; + case MCMDQ_CERROR_ILL_IDX: + break; + default: + break; + } + + /* + * We may have concurrent producers, so we need to be careful + * not to touch any of the shadow cmdq state. 
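For reference, this is how the error reason is pulled out of the MCMDQ CONS register in ummu_device_mcmdq_skip_err() above, redone as a standalone sketch. The bit positions follow MCMDQ_CONS_ERR_REASON (bits 26:24) from queue.h later in this patch; the sample register value is made up.

```c
#include <stdint.h>
#include <stdio.h>

#define MCMDQ_CONS_ERR_REASON_SHIFT 24
#define MCMDQ_CONS_ERR_REASON_MASK  (0x7u << MCMDQ_CONS_ERR_REASON_SHIFT)

static const char * const cerror_str[] = {
	"No error",			/* MCMDQ_CERROR_NONE_IDX */
	"Illegal command",		/* MCMDQ_CERROR_ILL_IDX  */
	"Abort on command fetch",	/* MCMDQ_CERROR_ABT_IDX  */
};

int main(void)
{
	uint32_t cons = 0x01800042;	/* arbitrary example register value */
	uint32_t reason = (cons & MCMDQ_CONS_ERR_REASON_MASK) >>
			  MCMDQ_CONS_ERR_REASON_SHIFT;

	printf("MCMDQ error (cons 0x%08x): %s\n", cons,
	       reason < sizeof(cerror_str) / sizeof(cerror_str[0]) ?
	       cerror_str[reason] : "Unknown");
	return 0;
}
```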
+ */ + ummu_queue_read(cmd, Q_ENT(q, cons), q->ent_dwords); + dev_err_ratelimited(ummu->dev, "skipping command in error state:\n"); + for (i = 0; i < ARRAY_SIZE(cmd); ++i) + dev_err_ratelimited(ummu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]); + + /* Convert the erroneous command into a CMD_SYNC */ + if (ummu_mcmdq_build_cmd(ummu, cmd, &cmd_sync)) { + dev_err_ratelimited(ummu->dev, "failed to convert to CMD_SYNC\n"); + return; + } + + ummu_queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); +} + +static void ummu_mcmdq_skip_err(struct ummu_device *ummu) +{ + struct ummu_mcmdq *mcmdq; + unsigned long flags; + u32 prod, cons; + u32 i; + + for (i = 0; i < ummu->nr_mcmdq; i++) { + mcmdq = *per_cpu_ptr(ummu->mcmdq, i); + prod = readl_relaxed(mcmdq->q.prod_reg); + cons = readl_relaxed(mcmdq->q.cons_reg); + if (((prod ^ cons) & MCMDQ_CONS_ERR) == 0) + continue; + + ummu_device_mcmdq_skip_err(ummu, &mcmdq->q); + + write_lock_irqsave(&mcmdq->mcmdq_lock, flags); + mcmdq->mcmdq_prod &= ~MCMDQ_PROD_ERRACK; + mcmdq->mcmdq_prod |= cons & MCMDQ_CONS_ERR; + + prod = readl_relaxed(mcmdq->q.prod_reg); + prod &= ~MCMDQ_PROD_ERRACK; + prod |= cons & MCMDQ_CONS_ERR; + writel(prod, mcmdq->q.prod_reg); + write_unlock_irqrestore(&mcmdq->mcmdq_lock, flags); + } +} + +static irqreturn_t ummu_gerror_handler(int irq, void *dev) +{ + struct ummu_device *ummu = (struct ummu_device *)dev; + u32 gerror, gerrorn, active; + + gerror = readl_relaxed(ummu->base + UMMU_GERROR); + gerrorn = readl_relaxed(ummu->base + UMMU_GERRORN); + + active = gerror ^ gerrorn; + if (!(active & GERROR_ERR_MASK)) + return IRQ_NONE; /* No errors pending */ + + dev_err_ratelimited( + ummu->dev, + "unexpected global error reported (0x%08x), this could be serious\n", + active); + + if (active & GERROR_MSI_GERR_ABT_ERR) + dev_err_ratelimited(ummu->dev, "GERROR MSI write aborted\n"); + + if (active & GERROR_MSI_UIEQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, "UIEQ MSI sync cmdq write aborted\n"); + + if (active & GERROR_MSI_EVTQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, "EVTQ MSI write aborted\n"); + + if (active & GERROR_MSI_MCMDQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, "CMDQ MSI write aborted\n"); + + if (active & GERROR_EVTQ_ABT_ERR) + dev_err_ratelimited(ummu->dev, + "EVTQ write aborted -- events may have been lost\n"); + + if (active & GERROR_MCMDQ_ERR) + ummu_mcmdq_skip_err(ummu); + + writel(gerror, ummu->base + UMMU_GERRORN); + return IRQ_HANDLED; +} + +static void ummu_print_event(struct ummu_device *ummu, u8 code, u64 *evt) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + static u8 last_evt_code; + static u64 timeout; + int i; + + if (!__ratelimit(&rs)) + return; + + if (last_evt_code == code && time_is_after_jiffies64(timeout)) + return; + + last_evt_code = code; + timeout = get_jiffies_64() + msecs_to_jiffies(EVT_LOG_LIMIT_TIMEOUT); + dev_info(ummu->dev, "event 0x%02x received:\n", code); + for (i = 0; i < EVTQ_ENT_DWORDS; ++i) + dev_info(ummu->dev, "\t0x%016llx\n", (u64)evt[i]); +} + +/* implementation is based on the ARM SMMU arm_smmu_evtq_thread */ +static irqreturn_t ummu_evtq_thread(int irq, void *dev) +{ + struct ummu_device *ummu = (struct ummu_device *)dev; + struct ummu_queue *q = &ummu->evtq.q; + struct ummu_ll_queue *llq = &q->llq; + u64 evt[EVTQ_ENT_DWORDS]; + u32 tid; + u8 code; + + do { + while (!ummu_queue_remove_raw(q, evt)) { + code = FIELD_GET(EVTQ_ENT0_CODE, evt[0]); + tid = FIELD_GET(EVTQ_ENT0_TID, evt[0]); + + ummu_print_event(ummu, code, evt); + + cond_resched(); + 
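ummu_gerror_handler() above treats GERROR/GERRORN as a toggle handshake: new errors flip bits in GERROR, the handler acknowledges by writing the GERROR snapshot back to GERRORN, and the XOR of the two registers is the set of unacknowledged errors. A small userspace model of that protocol (demo bit positions only, not the real register layout):

```c
#include <stdint.h>
#include <stdio.h>

#define DEMO_EVTQ_ABT	(1u << 0)	/* invented bits for the demo */
#define DEMO_MCMDQ_ERR	(1u << 1)

static uint32_t gerror, gerrorn;	/* stand-ins for the two registers */

static void hw_raise(uint32_t bit)	{ gerror ^= bit; }		/* device side  */
static uint32_t sw_pending(void)	{ return gerror ^ gerrorn; }	/* handler side */
static void sw_ack(void)		{ gerrorn = gerror; }

int main(void)
{
	hw_raise(DEMO_MCMDQ_ERR);
	printf("pending: 0x%x\n", sw_pending());		/* 0x2 */
	sw_ack();
	printf("pending after ack: 0x%x\n", sw_pending());	/* 0x0 */
	hw_raise(DEMO_MCMDQ_ERR);	/* same error class reported again */
	printf("pending again: 0x%x\n", sw_pending());		/* 0x2 */
	return 0;
}
```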
} + + if (ummu_queue_sync_prod_in(q) == -EOVERFLOW) + dev_err(ummu->dev, + "EVTQ overflow detected -- events lost\n"); + } while (!ummu_queue_empty(llq)); + + if (likely(Q_OVF(llq->prod) == Q_OVF(llq->cons))) + goto handled; + + /* Sync overflow flag */ + llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | + Q_IDX(llq, llq->cons); + __iomb(); + writel_relaxed(q->llq.cons, q->cons_reg); +handled: + return IRQ_HANDLED; +} + +static void ummu_free_msis(void *data) +{ + struct device *dev = (struct device *)data; + + platform_msi_domain_free_irqs(dev); +} + +static void ummu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg) +{ + struct device *dev = msi_desc_to_dev(desc); + struct ummu_device *ummu = dev_get_drvdata(dev); + phys_addr_t msi_addr; + phys_addr_t *cfg; + + if (desc->msi_index > GERROR_MSI_INDEX) + return; + + cfg = ummu_msi_cfg[desc->msi_index]; + /* 32 bit addresses are converted to 64 bit addresses. */ + msi_addr = (((u64)msg->address_hi) << 32) | msg->address_lo; + msi_addr &= UMMU_MSI_ADDR_MASK; + + writeq_relaxed(msi_addr, ummu->base + cfg[0]); + writel_relaxed(msg->data, ummu->base + cfg[1]); + writel_relaxed(UMMU_MEMATTR_DEVICE_nGnRE, ummu->base + cfg[2]); +} + +static int ummu_device_setup_msis(struct ummu_device *ummu) +{ + struct device *dev = ummu->dev; + int ret; + + if (!(ummu->cap.features & UMMU_FEAT_MSI)) + return -EOPNOTSUPP; + + if (!dev->msi.domain) + return -ENODEV; + + /* Clear the MSI address regs */ + writeq_relaxed(0, ummu->base + UMMU_EVENT_QUE_MSI_ADDR0); + writeq_relaxed(0, ummu->base + UMMU_GLB_ERR_INT_MSI_ADDR0); + + /* Allocate MSIs for evtq, gerror */ + ret = platform_msi_domain_alloc_irqs(dev, UMMU_MAX_MSIS, ummu_write_msi_msg); + if (ret) { + dev_err(dev, "failed to allocate MSIs. ret = %d\n", ret); + return ret; + } + + /* Add callback to free MSIs on teardown */ + ret = devm_add_action_or_reset(dev, ummu_free_msis, dev); + if (ret) + dev_err(dev, "failed to add free msis action ret = %d.\n", ret); + + return ret; +} + +static inline void ummu_disable_irqs(struct ummu_device *ummu) +{ + writel_relaxed(0, ummu->base + UMMU_GLB_IRQ_EN); +} + +static inline void ummu_enable_irqs(struct ummu_device *ummu) +{ + u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN; + + writel_relaxed(irqen_flags, ummu->base + UMMU_GLB_IRQ_EN); +} + +static inline void ummu_init_evtq_irq(struct ummu_device *ummu, int irq) +{ + int ret = devm_request_threaded_irq(ummu->dev, irq, NULL, + ummu_evtq_thread, IRQF_ONESHOT, + "ummu-evtq", ummu); + if (ret < 0) + dev_warn(ummu->dev, "failed to enable evtq irq\n"); +} + +static inline void ummu_init_gerr_irq(struct ummu_device *ummu, int irq) +{ + int ret = devm_request_irq(ummu->dev, irq, ummu_gerror_handler, 0, + "ummu-gerror", ummu); + if (ret < 0) + dev_warn(ummu->dev, "failed to enable gerror irq\n"); +} + +void ummu_setup_irqs(struct ummu_device *ummu) +{ + u32 evtq_irq, gerr_irq; + int ret; + + ummu_disable_irqs(ummu); + + ret = ummu_device_setup_msis(ummu); + if (ret) { + dev_err(ummu->dev, "failed to setup msis. 
ret = %d\n", ret); + return; + } + + evtq_irq = msi_get_virq(ummu->dev, EVTQ_MSI_INDEX); + if (evtq_irq) + ummu_init_evtq_irq(ummu, evtq_irq); + else + dev_warn(ummu->dev, + "no evtq irq - events will not be reported!\n"); + + gerr_irq = msi_get_virq(ummu->dev, GERROR_MSI_INDEX); + if (gerr_irq) + ummu_init_gerr_irq(ummu, gerr_irq); + else + dev_warn(ummu->dev, + "no gerr irq - errors will not be reported!\n"); + + ummu_enable_irqs(ummu); +} diff --git a/drivers/iommu/hisilicon/interrupt.h b/drivers/iommu/hisilicon/interrupt.h new file mode 100644 index 0000000000000000000000000000000000000000..fe25b6ab11acb4afcd9d820d2d0741e53030facd --- /dev/null +++ b/drivers/iommu/hisilicon/interrupt.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: UMMU Interrupt interface + */ + +#ifndef __UMMU_INTERRUPT_H__ +#define __UMMU_INTERRUPT_H__ + +#include "ummu.h" + +void ummu_setup_irqs(struct ummu_device *ummu); + +#endif /* __UMMU_INTERRUPT_H__ */ diff --git a/drivers/iommu/hisilicon/queue.c b/drivers/iommu/hisilicon/queue.c new file mode 100644 index 0000000000000000000000000000000000000000..9755d64969dd67d842445a5a051d8f2469783a8f --- /dev/null +++ b/drivers/iommu/hisilicon/queue.c @@ -0,0 +1,1215 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * + * Description: UMMU Queue Resource Management. Somewhat based on arm-smmu-v3.c + * Copyright (C) 2025 ARM Limited + */ + +#define pr_fmt(fmt) "UMMU: " fmt + +#include +#include +#include + +#include "ummu.h" +#include "regs.h" +#include "queue.h" + +#define ENTRY_DWORDS_TO_SIZE(dwords) ((dwords) << 3) + +struct ummu_queue_poll { + ktime_t timeout; + u32 delay; + u32 spin_cnt; + bool wfe; +}; + +/* Low-level queue manipulation functions */ +static bool ummu_queue_has_space(struct ummu_ll_queue *q, u32 n) +{ + u32 space, prod, cons; + + prod = Q_IDX(q, q->prod); + cons = Q_IDX(q, q->cons); + if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons)) + space = (1UL << q->log2size) - (prod - cons); + else + space = cons - prod; + + return space >= n; +} + +static bool ummu_queue_full(struct ummu_ll_queue *q) +{ + return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && + Q_WRP(q, q->prod) != Q_WRP(q, q->cons); +} + +bool ummu_queue_empty(struct ummu_ll_queue *q) +{ + return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && + Q_WRP(q, q->prod) == Q_WRP(q, q->cons); +} + +static bool ummu_queue_consumed(struct ummu_ll_queue *q, u32 prod) +{ + return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) > Q_IDX(q, prod))) || + ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) <= Q_IDX(q, prod))); +} + +static void ummu_queue_sync_cons_out(struct ummu_queue *q) +{ + /* + * Ensure that all CPU accesses (reads and writes) to the queue + * are complete before we update the cons pointer. + */ + __iomb(); + writel_relaxed(q->llq.cons, q->cons_reg); +} + +static void ummu_queue_inc_cons(struct ummu_ll_queue *q) +{ + u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1; + + q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons); +} + +int ummu_queue_sync_prod_in(struct ummu_queue *q) +{ + u32 prod; + int ret = 0; + + prod = readl(q->prod_reg); + /* + * We can't use the variable _relaxed() here because we have to prevent + * speculative read of the queue before we determine The prod has moved. 
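The prod/cons helpers above (ummu_queue_has_space(), ummu_queue_full(), ummu_queue_empty()) all rely on the same encoding: the low log2size bits of prod/cons are the ring index and the next bit is a wrap flag, which is what lets a full ring be distinguished from an empty one. A standalone sketch of that arithmetic with an assumed 8-entry ring:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LOG2SIZE 3u	/* 8 entries, demo value only */
#define Q_IDX(p) ((p) & ((1u << LOG2SIZE) - 1))
#define Q_WRP(p) ((p) & (1u << LOG2SIZE))

static bool queue_empty(uint32_t prod, uint32_t cons)
{
	return Q_IDX(prod) == Q_IDX(cons) && Q_WRP(prod) == Q_WRP(cons);
}

static bool queue_full(uint32_t prod, uint32_t cons)
{
	return Q_IDX(prod) == Q_IDX(cons) && Q_WRP(prod) != Q_WRP(cons);
}

static uint32_t queue_space(uint32_t prod, uint32_t cons)
{
	if (Q_WRP(prod) == Q_WRP(cons))
		return (1u << LOG2SIZE) - (Q_IDX(prod) - Q_IDX(cons));
	return Q_IDX(cons) - Q_IDX(prod);
}

int main(void)
{
	uint32_t prod = 0, cons = 0;

	printf("empty=%d space=%u\n", queue_empty(prod, cons), queue_space(prod, cons));
	prod = 8;	/* produced 8 entries: index back to 0, wrap bit set */
	printf("full=%d space=%u\n", queue_full(prod, cons), queue_space(prod, cons));
	cons = 3;	/* consumer drained 3 entries */
	printf("space=%u\n", queue_space(prod, cons));
	return 0;
}
```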
+ */ + if (Q_OVF(prod) != Q_OVF(q->llq.prod)) + ret = -EOVERFLOW; + + q->llq.prod = prod; + + return ret; +} + +static u32 ummu_queue_inc_prod_n(struct ummu_ll_queue *q, int n) +{ + u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n; + + return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); +} + +static void ummu_queue_poll_init(struct ummu_device *ummu, + struct ummu_queue_poll *qp) +{ + qp->delay = 1; + qp->spin_cnt = 0; + qp->wfe = !!(ummu->cap.features & UMMU_FEAT_SEV); + qp->timeout = ktime_add_us(ktime_get(), UMMU_QUE_POLL_TIMEOUT_US); +} + +static int ummu_queue_poll(struct ummu_queue_poll *qp) +{ + if (ktime_compare(ktime_get(), qp->timeout) > 0) + return -ETIMEDOUT; + + if (qp->wfe) { + wfe(); + } else if (++qp->spin_cnt < UMMU_POLL_SPIN_COUNT) { + cpu_relax(); + } else { + udelay(qp->delay); + qp->delay *= 2; /* multiply the delay by 2 */ + qp->spin_cnt = 0; + } + + return 0; +} + +void ummu_queue_write(__le64 *dst, u64 *src, size_t n_dwords) +{ + size_t i; + + for (i = 0; i < n_dwords; ++i) + *dst++ = cpu_to_le64(*src++); +} + +void ummu_queue_read(u64 *dst, __le64 *src, size_t n_dwords) +{ + size_t i; + + for (i = 0; i < n_dwords; ++i) + *dst++ = le64_to_cpu(*src++); +} + +int ummu_queue_remove_raw(struct ummu_queue *queue, u64 *ent) +{ + if (ummu_queue_empty(&queue->llq)) + return -EAGAIN; + + ummu_queue_read(ent, Q_ENT(queue, queue->llq.cons), queue->ent_dwords); + ummu_queue_inc_cons(&queue->llq); + ummu_queue_sync_cons_out(queue); + + return 0; +} + +static int ummu_common_init_queue(struct ummu_device *ummu, + struct ummu_queue *q, size_t dwords) +{ + size_t qsz; + + q->base = NULL; + do { + qsz = ENTRY_DWORDS_TO_SIZE((1 << q->llq.log2size) * dwords); + if (get_order(qsz) <= MAX_ORDER) + q->base = (__le64 *)devm_get_free_pages(ummu->dev, GFP_KERNEL, + get_order(qsz)); + + q->llq.log2size--; + } while (!q->base && qsz > PAGE_SIZE); + + /* confirm right log2size after the loop */ + q->llq.log2size++; + + if (q->base) { + q->base_pa = virt_to_phys(q->base); + } else { + dev_err(ummu->dev, + "failed to allocate queue (0x%zx bytes)\n", qsz); + return -ENOMEM; + } + + q->ent_dwords = dwords; + q->q_base = Q_BASE_RWA; + q->q_base |= q->base_pa & Q_BASE_ADDR_MASK; + q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.log2size); + q->llq.prod = q->llq.cons = 0; + + return 0; +} + +static int ummu_mcmdq_allocate(struct ummu_device *ummu) +{ + struct ummu_mcmdq __percpu *mcmdqs; + struct ummu_mcmdq *mcmdq; + u32 cpu, host_cpu; + + mcmdqs = devm_alloc_percpu(ummu->dev, *mcmdq); + if (!mcmdqs) + return -ENOMEM; + + /* A core requires at most one ECMDQ */ + if (num_possible_cpus() < ummu->nr_mcmdq) + ummu->nr_mcmdq = num_possible_cpus(); + + for_each_possible_cpu(cpu) { + if (cpu < ummu->nr_mcmdq) { + mcmdq = per_cpu_ptr(mcmdqs, cpu); + mcmdq->configured = 0; + } else { + host_cpu = cpu % ummu->nr_mcmdq; + mcmdq = per_cpu_ptr(mcmdqs, host_cpu); + mcmdq->shared = 1; + } + *per_cpu_ptr(ummu->mcmdq, cpu) = mcmdq; + } + + return 0; +} + +static int ummu_mcmdq_cfg_para(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq) +{ + atomic_long_t *bitmap; + + mcmdq->mcmdq_prod = MCMDQ_PROD_EN; + + atomic_set(&mcmdq->owner_prod, 0); + rwlock_init(&mcmdq->mcmdq_lock); + + bitmap = (atomic_long_t *)devm_bitmap_zalloc( + ummu->dev, 1UL << mcmdq->q.llq.log2size, GFP_KERNEL); + if (!bitmap) { + dev_err(ummu->dev, "failed to zalloc mcmdq bitmap\n"); + return -ENOMEM; + } + mcmdq->valid_map = bitmap; + + return 0; +} + +static int ummu_mcmdq_init(struct ummu_device *ummu) +{ + struct ummu_mcmdq *mcmdq; 
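ummu_mcmdq_allocate() above gives each of the first nr_mcmdq CPUs its own command queue and maps the remaining CPUs onto existing queues with cpu % nr_mcmdq, flagging those queues as shared (the shared flag later selects the ownership-based submission path). A toy rendering of that mapping with arbitrary CPU/queue counts:

```c
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	const unsigned int nr_cpus = 8, nr_mcmdq = 3;	/* demo values */
	bool shared[3] = { false, false, false };
	unsigned int cpu, q;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		q = (cpu < nr_mcmdq) ? cpu : cpu % nr_mcmdq;
		if (cpu >= nr_mcmdq)
			shared[q] = true;	/* queue now serves >1 CPU */
		printf("cpu%u -> mcmdq%u\n", cpu, q);
	}
	for (q = 0; q < nr_mcmdq; q++)
		printf("mcmdq%u shared=%d\n", q, shared[q]);
	return 0;
}
```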
+ u32 shift; + u64 base_addr = 0; + int cpu, ret; + + ummu->nr_mcmdq = 1UL << ummu->cap.mcmdq_log2num; + ummu->nr_mcmdq -= 1; + shift = order_base_2(num_possible_cpus() / ummu->nr_mcmdq); + + ummu->mcmdq = devm_alloc_percpu(ummu->dev, struct ummu_mcmdq *); + if (!ummu->mcmdq) { + dev_err(ummu->dev, "alloc mcmdq ptr failed\n"); + goto err; + } + + ret = ummu_mcmdq_allocate(ummu); + if (ret) { + dev_err(ummu->dev, "mcmdq allocate failed\n"); + goto err; + } + + for_each_possible_cpu(cpu) { + mcmdq = *per_cpu_ptr(ummu->mcmdq, cpu); + /* prevent repeated init when it is shared to multiple CPUs. */ + if (!mcmdq || mcmdq->mcmdq_prod == MCMDQ_PROD_EN) + continue; + + mcmdq->q.llq.log2size = MCMDQ_MAX_SZ_SHIFT + shift; + mcmdq->base = ummu->base + UMMU_MCMDQ_OFFSET + base_addr; + mcmdq->q.prod_reg = (u32 *)(mcmdq->base + MCMDQ_PROD_OFFSET); + mcmdq->q.cons_reg = (u32 *)(mcmdq->base + MCMDQ_CONS_OFFSET); + ret = ummu_common_init_queue(ummu, &mcmdq->q, MCMDQ_ENT_DWORDS); + if (ret) + goto err; + ret = ummu_mcmdq_cfg_para(ummu, mcmdq); + if (ret) + goto err; + + base_addr += MCMDQ_ENT_SIZE; + } + + return 0; +err: + ummu->nr_mcmdq = 0; + return -ENOMEM; +} + +static int ummu_write_mcmdq_regs(struct ummu_device *ummu) +{ + struct ummu_mcmdq *mcmdq; + struct ummu_queue *q; + int i = 0, ret; + u32 cpu; + u32 reg; + + if (unlikely(!ummu->nr_mcmdq)) { + dev_err(ummu->dev, "have not mcmdq resource.\n"); + return -EINVAL; + } + + for_each_possible_cpu(cpu) { + mcmdq = *per_cpu_ptr(ummu->mcmdq, cpu); + if (mcmdq->configured == 1) + continue; + + q = &mcmdq->q; + i++; + if (WARN_ON(q->llq.prod != q->llq.cons)) { + q->llq.prod = 0; + q->llq.cons = 0; + } + /* + * In kdump kernel, the mcmdq should be turned off first to + * prevent "CMD_SYNC timeout" problem. + */ + reg = readl(q->prod_reg); + if (reg & MCMDQ_PROD_EN) { + writel(reg & ~MCMDQ_PROD_EN, q->prod_reg); + ret = readl_relaxed_poll_timeout(q->cons_reg, reg, + !(reg & MCMDQ_EN_RESP), 1, + UMMU_CONS_POLL_TIMEOUT_US); + if (ret) { + dev_warn(ummu->dev, + "mcmdq[%d] disable failed\n", i); + mcmdq->configured = 0; + return ret; + } + } + + /* close mcmdq_base write protection */ + writel_relaxed(q->llq.prod, q->prod_reg); + writel_relaxed(q->llq.cons, q->cons_reg); + writeq_relaxed(q->q_base, mcmdq->base); + + /* enable mcmdq and open write protection */ + writel_relaxed(MCMDQ_PROD_EN | q->llq.prod, mcmdq->q.prod_reg); + ret = readl_relaxed_poll_timeout(mcmdq->q.cons_reg, reg, + reg & MCMDQ_EN_RESP, 1, + UMMU_CONS_POLL_TIMEOUT_US); + if (ret) { + dev_err(ummu->dev, "prod_reg write timeout ret = %d.\n", + ret); + return ret; + } + mcmdq->configured = 1; + } + + return 0; +} + +int ummu_device_mcmdq_init_cfg(struct ummu_device *ummu) +{ + return ummu_write_mcmdq_regs(ummu); +} + +int ummu_write_evtq_regs(struct ummu_device *ummu) +{ + u32 cr0 = readl_relaxed(ummu->base + UMMU_CR0); + struct ummu_queue *q = &ummu->evtq.q; + + /* evtq disabled in function ummu_device_disable */ + writeq_relaxed(q->q_base, ummu->base + UMMU_EVTQ_OFFSET); + + writel_relaxed(q->llq.prod, ummu->base + UMMU_EVTQ_PROD_OFFSET); + writel_relaxed(q->llq.cons, ummu->base + UMMU_EVTQ_CONS_OFFSET); + + cr0 |= CR0_EVENTQ_EN; + + return ummu_write_reg_sync(ummu, cr0, UMMU_CR0, UMMU_CR0ACK); +} + +static int ummu_evtq_init(struct ummu_device *ummu) +{ + struct ummu_queue *q = &ummu->evtq.q; + + q->llq.log2size = min(EVTQ_MAX_SZ_SHIFT, ummu->cap.evtq_log2size); + q->prod_reg = (u32 *)(ummu->base + UMMU_EVTQ_PROD_OFFSET); + q->cons_reg = (u32 *)(ummu->base + UMMU_EVTQ_CONS_OFFSET); + return 
ummu_common_init_queue(ummu, q, EVTQ_ENT_DWORDS); +} + +int ummu_init_queues(struct ummu_device *ummu) +{ + if (!(ummu->cap.features & UMMU_FEAT_MCMDQ) || + !(ummu->cap.features & UMMU_FEAT_EVENTQ)) + return -EOPNOTSUPP; + + if (ummu_mcmdq_init(ummu)) + return -ENOMEM; + + if (ummu_evtq_init(ummu)) + return -ENOMEM; + + return 0; +} + +#define ummu_mcmdq_exclusive_trylock_irqsave(mcmdq, flags) \ + ({ \ + bool __ret; \ + local_irq_save(flags); \ + __ret = !atomic_cmpxchg_relaxed(&(mcmdq)->lock, 0, INT_MIN); \ + if (!__ret) \ + local_irq_restore(flags); \ + __ret; \ + }) + +#define ummu_mcmdq_exclusive_unlock_irqrestore(mcmdq, flags) \ + ({ \ + atomic_set_release(&(mcmdq)->lock, 0); \ + local_irq_restore(flags); \ + }) + +/* Wait for the command queue to become non-full */ +static int ummu_mcmdq_poll_until_not_full(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + struct ummu_queue_poll qp; + unsigned long flags; + int ret = 0; + + /* + * Try to update our copy of cons by grabbing exclusive mcmdq access. If + * that fails, spin until somebody else updates it for us. + */ + if (ummu_mcmdq_exclusive_trylock_irqsave(mcmdq, flags)) { + WRITE_ONCE(mcmdq->q.llq.cons, readl_relaxed(mcmdq->q.cons_reg)); + ummu_mcmdq_exclusive_unlock_irqrestore(mcmdq, flags); + llq->val = READ_ONCE(mcmdq->q.llq.val); + return 0; + } + + ummu_queue_poll_init(ummu, &qp); + do { + llq->val = READ_ONCE(mcmdq->q.llq.val); + if (!ummu_queue_full(llq)) + break; + + ret = ummu_queue_poll(&qp); + } while (!ret); + + return ret; +} + +/* + * The command queue is locked. + * This is a private form of rwlock with the following main variations: + * + * - The UNLOCK routine is supplemented by shared_tryunlock(), where + * If the caller appears to be the last lock holder (yes, this is + * All successful UNLOCK routines have RELEASE semantics. + * + * - The only LOCK routines are exclusive_trylock() and shared_lock(). + * Neither has barrier semantics, but only provides control. + * Dependency. + */ +static void ummu_mcmdq_shared_lock(struct ummu_mcmdq *mcmdq) +{ + int val; + + /* + * We can try to avoid the cmpxchg() loop by simply incrementing the + * lock counter. When held in exclusive state, the lock counter is set + * to INT_MIN so these increments won't hurt as the value will remain + * negative. + */ + if (atomic_fetch_inc_relaxed(&mcmdq->lock) >= 0) + return; + + do { + val = atomic_cond_read_relaxed(&mcmdq->lock, VAL >= 0); + } while (atomic_cmpxchg_relaxed(&mcmdq->lock, val, val + 1) != val); +} + +static void ummu_mcmdq_shared_unlock(struct ummu_mcmdq *mcmdq) +{ + (void)atomic_dec_return_release(&mcmdq->lock); +} + +static bool ummu_mcmdq_shared_tryunlock(struct ummu_mcmdq *mcmdq) +{ + if (atomic_read(&mcmdq->lock) == 1) + return false; + + ummu_mcmdq_shared_unlock(mcmdq); + + return true; +} + +static int ummu_mcmdq_build_nop_cmd(u64 *cmd, struct ummu_mcmdq_ent *ent) +{ + cmd[0] |= FIELD_PREP(CMD_NULL_OP_SUB_OP, ent->null_op.sub_op); + switch (ent->null_op.sub_op) { + case SUB_CMD_NULL_CHECK_PA_CONTINUITY: + cmd[0] |= FIELD_PREP(SUB_OP_CHECK_PA_CONTI_0_RESULT, + ent->null_op.check_pa_conti.result); + cmd[0] |= ent->null_op.check_pa_conti.flag ? 
+ SUB_OP_CHECK_PA_CONTI_0_FLAG : 0; + cmd[0] |= FIELD_PREP(SUB_OP_CHECK_PA_CONTI_0_SIZE, + ent->null_op.check_pa_conti.size_order); + cmd[0] |= FIELD_PREP(SUB_OP_CHECK_PA_CONTI_0_ID, + ent->null_op.check_pa_conti.id); + cmd[1] |= SUB_OP_CHECK_PA_CONTI_1_ADDR & + ent->null_op.check_pa_conti.addr; + break; + default: + return -EINVAL; + } + return 0; +} + +int ummu_mcmdq_build_cmd(struct ummu_device *ummu, u64 *cmd, + struct ummu_mcmdq_ent *ent) +{ + memset(cmd, 0, 1 << MCMDQ_ENT_SZ_SHIFT); + cmd[0] |= FIELD_PREP(CMD_0_OP, ent->opcode); + + /* build cmd method for different cmds */ + switch (ent->opcode) { + case CMD_SYNC: + if (ent->sync.msi_addr) { + cmd[0] |= FIELD_PREP(CMD_SYNC_0_CM, CMD_SYNC_0_CM_IRQ); + cmd[1] |= ent->sync.msi_addr & CMD_SYNC_1_MSIADDR; + } else if (ent->sync.support_sev) { + cmd[0] |= FIELD_PREP(CMD_SYNC_0_CM, CMD_SYNC_0_CM_SEV); + } else { + cmd[0] |= FIELD_PREP(CMD_SYNC_0_CM, CMD_SYNC_0_CM_NONE); + } + cmd[0] |= FIELD_PREP(CMD_SYNC_0_MSISH, UMMU_SH_ISH); + cmd[0] |= FIELD_PREP(CMD_SYNC_0_MSIATTR, UMMU_MEMATTR_OIWB); + break; + case CMD_STALL_RESUME: + cmd[0] |= ent->stall_resume.dsec ? CMD_STALL_0_DSEC : 0; + cmd[0] |= ent->stall_resume.retry ? CMD_STALL_0_RETRY : 0; + cmd[0] |= ent->stall_resume.abort ? CMD_STALL_0_ABORT : 0; + cmd[1] |= FIELD_PREP(CMD_STALL_1_TAG, ent->stall_resume.tag); + cmd[2] |= FIELD_PREP(CMD_STALL_2_TECT_TAG, + ent->stall_resume.tect_tag); + break; + case CMD_STALL_TERM: + cmd[2] |= FIELD_PREP(CMD_STALL_2_TECT_TAG, ent->stall_resume.tect_tag); + break; + case CMD_PREFET_CFG: + cmd[0] |= ent->prefet.tkv ? CMD_PREFET_0_TKV : 0; + cmd[0] |= FIELD_PREP(CMD_PREFET_0_TID, ent->prefet.tid); + cmd[2] |= FIELD_PREP(CMD_PREFET_2_DEID_0, ent->prefet.deid_0); + cmd[2] |= FIELD_PREP(CMD_PREFET_2_DEID_1, ent->prefet.deid_1); + cmd[3] |= FIELD_PREP(CMD_PREFET_3_DEID_0, ent->prefet.deid_2); + cmd[3] |= FIELD_PREP(CMD_PREFET_3_DEID_1, ent->prefet.deid_3); + break; + case CMD_CFGI_TECT: + cmd[0] |= ent->cfgi.leaf ? CMD_CFGI_0_LEAF : 0; + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_CFGI_TECT_RANGE: + cmd[0] |= FIELD_PREP(CMD_CFGI_0_RANGE, ent->cfgi.range); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_CFGI_TCT: + cmd[0] |= ent->cfgi.leaf ? CMD_CFGI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_CFGI_0_TID, ent->cfgi.tid); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_CFGI_TCT_ALL: + cmd[0] |= ent->cfgi.leaf ? 
CMD_CFGI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_CFGI_0_TID, ent->cfgi.tid); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_0, ent->cfgi.deid_0); + cmd[2] |= FIELD_PREP(CMD_CFGI_2_DEID_1, ent->cfgi.deid_1); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_0, ent->cfgi.deid_2); + cmd[3] |= FIELD_PREP(CMD_CFGI_3_DEID_1, ent->cfgi.deid_3); + break; + case CMD_PLBI_OS_EID: + cmd[2] |= FIELD_PREP(CMD_PLBI_2_TECTE_TAG, ent->plbi.tecte_tag); + break; + case CMD_PLBI_OS_EIDTID: + cmd[0] |= FIELD_PREP(CMD_PLBI_0_TID, ent->plbi.tid); + cmd[2] |= FIELD_PREP(CMD_PLBI_2_TECTE_TAG, ent->plbi.tecte_tag); + break; + case CMD_PLBI_OS_VA: + cmd[0] |= FIELD_PREP(CMD_PLBI_0_TID, ent->plbi.tid); + cmd[0] |= FIELD_PREP(CMD_PLBI_0_RANGE, ent->plbi.range); + cmd[1] |= ent->plbi.addr & CMD_PLBI_1_ADDR_MASK; + cmd[2] |= FIELD_PREP(CMD_PLBI_2_TECTE_TAG, ent->plbi.tecte_tag); + break; + case CMD_TLBI_OS_ALL: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + break; + case CMD_TLBI_OS_TID: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + break; + case CMD_TLBI_OS_VA: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + fallthrough; + case CMD_TLBI_OS_VAA: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_HYP_TID: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + break; + case CMD_TLBI_HYP_VA: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TOKEN_ID, ent->tlbi.tid); + fallthrough; + case CMD_TLBI_HYP_VAA: + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_S1S2_VMALL: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + break; + case CMD_TLBI_S2_IPA: + cmd[2] |= FIELD_PREP(CMD_TLBI_2_TECTE_TAG, ent->tlbi.tect_tag); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_HYP_ALL: + case CMD_TLBI_NS_OS_ALL: + break; + case CMD_CREATE_KVTBL: + cmd[0] |= ent->create_kvtbl.evt_en ? + CMD_CREATE_KVTBL0_EVT_EN : 0; + cmd[0] |= FIELD_PREP(CMD_CREATE_KVTBL0_TAG_MASK, + ent->create_kvtbl.tecte_tag); + cmd[0] |= FIELD_PREP(CMD_CREATE_KVTBL0_KV_INDEX_MASK, + ent->create_kvtbl.kv_index); + cmd[1] |= ent->create_kvtbl.tect_base_addr & + CMD_CREATE_KVTBL1_ADDR_MASK; + cmd[2] |= FIELD_PREP(CMD_CREATE_KVTBL2_EID_LOW, + ent->create_kvtbl.eid_low); + cmd[3] |= FIELD_PREP(CMD_CREATE_KVTBL3_EID_HIGH, + ent->create_kvtbl.eid_high); + break; + case CMD_DELETE_KVTBL: + cmd[0] |= ent->delete_kvtbl.evt_en ? 
+ CMD_DELETE_KVTBL0_EVT_EN : 0; + cmd[0] |= FIELD_PREP(CMD_DELETE_KVTBL0_TAG_MASK, + ent->delete_kvtbl.tecte_tag); + cmd[0] |= FIELD_PREP(CMD_DELETE_KVTBL0_KV_INDEX_MASK, + ent->delete_kvtbl.kv_index); + cmd[2] |= FIELD_PREP(CMD_DELETE_KVTBL2_EID_LOW, + ent->delete_kvtbl.eid_low); + cmd[3] |= FIELD_PREP(CMD_DELETE_KVTBL3_EID_HIGH, + ent->delete_kvtbl.eid_high); + break; + case CMD_TLBI_OS_ALL_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + break; + case CMD_TLBI_OS_ASID_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + break; + case CMD_TLBI_OS_VA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + fallthrough; + case CMD_TLBI_OS_VAA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_HYP_ASID_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + break; + case CMD_TLBI_HYP_VA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_ASID, ent->tlbi.asid); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_VA_MASK; + break; + case CMD_TLBI_S1S2_VMALL_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + break; + case CMD_TLBI_S2_IPA_U: + cmd[0] |= FIELD_PREP(CMD_TLBI_0_VMID, ent->tlbi.vmid); + cmd[0] |= ent->tlbi.leaf ? CMD_TLBI_0_LEAF : 0; + cmd[0] |= FIELD_PREP(CMD_TLBI_0_NUM, ent->tlbi.num); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_SCALE, ent->tlbi.scale); + cmd[0] |= FIELD_PREP(CMD_TLBI_0_TL, ent->tlbi.tl); + cmd[1] |= FIELD_PREP(CMD_TLBI_1_GS, ent->tlbi.gs); + cmd[1] |= ent->tlbi.addr & CMD_TLBI_1_IPA_MASK; + break; + case CMD_NULL_OP: + return ummu_mcmdq_build_nop_cmd(cmd, ent); + default: + return -EINVAL; + } + + return 0; +} + +/* + * Command queue insertion. + * This process became cumbersome as we aimed to achieve scalability + * due to the shared queue among all CPUs in the system. + * If you desire a combination of size concurrency, dependency order, + * and loose atoms, then you will absolutely adore this monstrous solution. + * The fundamental concept is to divide the queue into command ranges + * owned by each CPU. The owner may not have authored all the commands + * themselves but assumes responsibility for advancing the hardware product + * pointer under certain circumstances: when it's time. The algorithm can be + * summarized as follows: + * 1. Allocate space within the queue while also determining if another CPU + * currently owns the head of the queue or if we are its rightful owners. + * 2. Write our commands into the allocated slot within the queue. + * 3. Mark our slot as valid in ummu_mcmdq.valid_map. + * 4. If we are indeed the owner: + * A. Wait for completion by any previous owner. + * B. Declare that there is no current owner for this range, + * indicating our responsibility for publishing it. + * C. Await execution of all orders within our possession. + * D. Advance the hardware product pointer. + * E. Notify subsequent hosts that we have completed our tasks. + * 5. 
If we insert CMD_SYNC (whether or not we are its owner), + * then we must persist with it until completion: + * A. If MSI is available, UMMU can write back to CMD_SYNC and + * clear its first 4 bytes. + * B. Otherwise, rotate and wait until the hardware cons pointer points + * beyond our command. + * The devil lies in these intricate details-particularly regarding locking + * mechanisms-to ensure complete synchronization and efficient utilization of + * space within the queue before deeming it full. + */ +static void ummu_mcmdq_build_sync_cmd(u64 *cmd, struct ummu_device *ummu, + struct ummu_queue *q, u32 prod) +{ + struct ummu_mcmdq_ent ent = { + .opcode = CMD_SYNC, + }; + + /* + * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI + * payload, so the write will zero the entire command on that platform. + */ + if (ummu->cap.options & UMMU_OPT_MSIPOLL) + ent.sync.msi_addr = q->base_pa + Q_IDX(&q->llq, prod) * + ENTRY_DWORDS_TO_SIZE(q->ent_dwords); + ent.sync.support_sev = !!(ummu->cap.features & UMMU_FEAT_SEV); + (void)ummu_mcmdq_build_cmd(ummu, cmd, &ent); +} + +static void ummu_mcmdq_poll_set_valid_map(struct ummu_mcmdq *mcmdq, u32 sprod, u32 eprod, bool set) +{ + u32 swidx, sbidx, ewidx, ebidx; + struct ummu_ll_queue llq; + unsigned long valid; + unsigned long mask; + atomic_long_t *ptr; + u32 limit; + + llq.prod = sprod; + llq.log2size = mcmdq->q.llq.log2size; + + ewidx = BIT_WORD(Q_IDX(&llq, eprod)); + ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG; + + while (llq.prod != eprod) { + limit = BITS_PER_LONG; + swidx = BIT_WORD(Q_IDX(&llq, llq.prod)); + sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG; + + ptr = &mcmdq->valid_map[swidx]; + + if ((swidx == ewidx) && (sbidx < ebidx)) + limit = ebidx; + + mask = GENMASK(limit - 1, sbidx); + + if (set) { + atomic_long_xor(mask, ptr); + } else { /* Poll */ + /* + * The valid bit is equal to the wrap bit. + * This means that a queue initialized to 0 is invalid, + * and after all elements are marked as valid, causing a rollover, + * all elements become invalid again. + */ + + valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask; + atomic_long_cond_read_relaxed(ptr, + (VAL & mask) == valid); + } + + llq.prod = ummu_queue_inc_prod_n(&llq, limit - sbidx); + } +} + +/* Mark all entries in the range [sprod, eprod) as valid */ +static void ummu_mcmdq_set_valid_map(struct ummu_mcmdq *mcmdq, u32 sprod, + u32 eprod) +{ + ummu_mcmdq_poll_set_valid_map(mcmdq, sprod, eprod, true); +} + +/* Wait for all entries in the range [sprod, eprod) to become valid */ +static void ummu_mcmdq_poll_valid_map(struct ummu_mcmdq *mcmdq, u32 sprod, + u32 eprod) +{ + ummu_mcmdq_poll_set_valid_map(mcmdq, sprod, eprod, false); +} + +/* + * Wait until the UMMU signals a CMD_SYNC completion MSI. + */ +static int ummu_mcmdq_poll_until_msi(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + u32 *cmd = (u32 *)(Q_ENT(&mcmdq->q, llq->prod)); + struct ummu_queue_poll qp; + int ret = 0; + + ummu_queue_poll_init(ummu, &qp); + + /* + * The MSI won't generate an event, since it's being written back + * into the command queue. + */ + qp.wfe = false; + smp_cond_load_relaxed(cmd, !VAL || (ret = ummu_queue_poll(&qp))); + llq->cons = ret ? readl(mcmdq->q.cons_reg) : ummu_queue_inc_prod_n(llq, 1); + + return ret; +} + +/* + * Wait until the UMMU cons index passes llq->prod. 
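The valid-map bookkeeping in ummu_mcmdq_poll_set_valid_map() above reuses one bitmap across queue wraps: writers XOR the bits for their slots, and the polling owner derives the expected polarity from the wrap bit via (ULONG_MAX + !!wrap) & mask, so the map never needs explicit clearing. A minimal userspace model of that computation (the driver does this with atomic_long_* operations on a per-queue bitmap):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t map = 0;			/* zero-initialised valid_map word */
	uint64_t mask = 0x0fULL << 2;		/* slots 2..5 written in this pass */
	unsigned long wrap = 0;			/* wrap bit of the producing prod  */
	uint64_t valid;

	/* producer path (set == true): flip the slots it wrote */
	map ^= mask;

	/* owner path (poll): expected pattern is (ULONG_MAX + !!wrap) & mask */
	valid = ((uint64_t)-1 + !!wrap) & mask;
	printf("pass 0 ready=%d\n", (map & mask) == valid);	/* 1 */

	/* after the ring wraps, the same slots are reused: producers flip the
	 * bits back and the owner's expected polarity flips with the wrap bit */
	wrap = 1;
	map ^= mask;
	valid = ((uint64_t)-1 + !!wrap) & mask;
	printf("pass 1 ready=%d\n", (map & mask) == valid);	/* 1 */
	return 0;
}
```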
+ */ +static int ummu_mcmdq_poll_until_consumed(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + struct ummu_queue_poll qp; + u32 prod = llq->prod; + int ret = 0; + + ummu_queue_poll_init(ummu, &qp); + llq->val = READ_ONCE(mcmdq->q.llq.val); + do { + if (ummu_queue_consumed(llq, prod)) + break; + + ret = ummu_queue_poll(&qp); + + /* + * This needs to be a readl() so that our subsequent call + * to ummu_mcmdq_shared_tryunlock() can fail accurately. + * + * Specifically, we need to ensure that we observe all + * shared_lock()s by other CMD_SYNCs that share our owner, + * so that a failing call to tryunlock() means that we're + * the last one out and therefore we can safely advance + * mcmdq->q.llq.cons. Roughly speaking: + */ + llq->cons = readl(mcmdq->q.cons_reg); + } while (!ret); + + return ret; +} + +static int ummu_mcmdq_poll_until_sync(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + struct ummu_ll_queue *llq) +{ + if (ummu->cap.options & UMMU_OPT_MSIPOLL) + return ummu_mcmdq_poll_until_msi(ummu, mcmdq, llq); + + return ummu_mcmdq_poll_until_consumed(ummu, mcmdq, llq); +} + +static void ummu_mcmdq_write_entries(struct ummu_mcmdq *mcmdq, u64 *cmds, + u32 prod, int n) +{ + struct ummu_ll_queue llq; + u64 *cmd; + int i; + + llq.prod = prod; + llq.log2size = mcmdq->q.llq.log2size; + + for (i = 0; i < n; ++i) { + cmd = &cmds[i * MCMDQ_ENT_DWORDS]; + prod = ummu_queue_inc_prod_n(&llq, i); + ummu_queue_write(Q_ENT(&mcmdq->q, prod), cmd, MCMDQ_ENT_DWORDS); + } +} + +static int check_pa_continuity_nop_exec(struct ummu_queue *q, u32 prod) +{ + u64 cmd = (u64)le64_to_cpu(Q_ENT(q, prod)); + + if (FIELD_GET(CMD_0_OP, cmd) == CMD_NULL_OP && + FIELD_GET(CMD_NULL_OP_SUB_OP, cmd) == + SUB_CMD_NULL_CHECK_PA_CONTINUITY) { + if (FIELD_GET(SUB_OP_CHECK_PA_CONTI_0_RESULT, cmd)) + return -ENOSPC; + if (FIELD_GET(SUB_OP_CHECK_PA_CONTI_0_FLAG, cmd) == 1 && + FIELD_GET(SUB_OP_CHECK_PA_CONTI_0_ID, cmd) != 0) + return -ERANGE; + } + return 0; +} + +static struct ummu_mcmdq *ummu_device_get_mcmdq(struct ummu_device *ummu, + u64 *cmd) +{ + return *this_cpu_ptr(ummu->mcmdq); +} + +static int ummu_mcmdq_exclusive_issue_cmdlist(struct ummu_device *ummu, + struct ummu_mcmdq *mcmdq, + u64 *cmds, int n, bool sync) +{ + u64 cmd_sync[MCMDQ_ENT_DWORDS], old; + struct ummu_ll_queue llq, head; + unsigned long flags; + u32 prod, sprod; + int ret = 0; + + llq.log2size = mcmdq->q.llq.log2size; + /* 1. Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(mcmdq->q.llq.val); + do { + while (!ummu_queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (ummu_mcmdq_poll_until_not_full(ummu, mcmdq, &llq)) { + dev_err_ratelimited(ummu->dev, "wait MCMDQ not full timeout.\n"); + return -ETIMEDOUT; + } + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = ummu_queue_inc_prod_n(&llq, n + (sync ? 1 : 0)); + + old = cmpxchg_relaxed(&mcmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + sprod = llq.prod; + + /* 2. Write our commands into the queue */ + ummu_mcmdq_write_entries(mcmdq, cmds, llq.prod, n); + if (sync) { + prod = ummu_queue_inc_prod_n(&llq, n); + ummu_mcmdq_build_sync_cmd(cmd_sync, ummu, &mcmdq->q, prod); + ummu_queue_write(Q_ENT(&mcmdq->q, prod), cmd_sync, MCMDQ_ENT_DWORDS); + } + + /* 3. Ensuring commands are visible first */ + dma_wmb(); + + /* 4. 
Advance the hardware prod pointer */ + read_lock(&mcmdq->mcmdq_lock); + writel_relaxed(head.prod | mcmdq->mcmdq_prod, mcmdq->q.prod_reg); + read_unlock(&mcmdq->mcmdq_lock); + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = ummu_queue_inc_prod_n(&llq, n); + ret = ummu_mcmdq_poll_until_sync(ummu, mcmdq, &llq); + if (ret) { + /* + * When sync times out, error handling cannot be performed more + * effectively and CIs need to be maintained. Therefore, continue. + */ + dev_err_ratelimited(ummu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, + readl_relaxed(mcmdq->q.prod_reg), + readl_relaxed(mcmdq->q.cons_reg)); + } + + ret = check_pa_continuity_nop_exec(&mcmdq->q, sprod); + /* + * Update mcmdq->q.llq.cons, to improve the success rate of + * ummu_queue_has_space() when some new commands are inserted next + * time. + */ + WRITE_ONCE(mcmdq->q.llq.cons, llq.cons); + } + + local_irq_restore(flags); + return ret; +} + +/* + * The actual insert function provides the following functionality for + * sorting guarantees to callers: + * - Prioritizing write ordering of data structures in memory + * by ensuring a dma_wmb() before publishing any + * command to the queue. + * - Sorting subsequent writes to memory + * (e.g., releasing the IOVA after CMD_SYNC is complete) through a + * control dependency when CMD_SYNC is finished. + * - Ensuring fully ordered command insertion, where if two CPUs + * compete with each other to insert their own command lists, one CPU's + * commands will always appear before any commands from another CPU. + */ +int ummu_mcmdq_issue_cmdlist(struct ummu_device *ummu, u64 *cmds, + int n, bool sync) +{ + struct ummu_mcmdq *mcmdq = ummu_device_get_mcmdq(ummu, cmds); + u64 cmd_sync[MCMDQ_ENT_DWORDS]; + struct ummu_ll_queue llq, head; + unsigned long flags; + u32 prod, sprod; + int ret = 0; + bool owner; + u64 old; + + if (unlikely(!mcmdq->shared)) + return ummu_mcmdq_exclusive_issue_cmdlist(ummu, mcmdq, cmds, + n, sync); + + llq.log2size = mcmdq->q.llq.log2size; + + /* 1. Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(mcmdq->q.llq.val); + do { + while (!ummu_queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (ummu_mcmdq_poll_until_not_full(ummu, mcmdq, &llq)) { + dev_err_ratelimited(ummu->dev, "wait MCMDQ not full timeout.\n"); + return -ETIMEDOUT; + } + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = ummu_queue_inc_prod_n(&llq, n + sync) | + MCMDQ_PROD_OWNED_FLAG; + + old = cmpxchg_relaxed(&mcmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + owner = !(llq.prod & MCMDQ_PROD_OWNED_FLAG); + head.prod &= ~MCMDQ_PROD_OWNED_FLAG; + llq.prod &= ~MCMDQ_PROD_OWNED_FLAG; + sprod = llq.prod; + /* + * 2. Write our commands into the queue + * Dependency ordering from the cmpxchg() loop above. + */ + ummu_mcmdq_write_entries(mcmdq, cmds, llq.prod, n); + if (sync) { + prod = ummu_queue_inc_prod_n(&llq, n); + ummu_mcmdq_build_sync_cmd(cmd_sync, ummu, &mcmdq->q, prod); + ummu_queue_write(Q_ENT(&mcmdq->q, prod), cmd_sync, MCMDQ_ENT_DWORDS); + + /* + * In order to determine completion of our CMD_SYNC, we must + * ensure that the queue can't wrap twice without us noticing. + * We achieve that by taking the mcmdq lock as shared before + * marking our slot as valid. + */ + ummu_mcmdq_shared_lock(mcmdq); + } + + /* 3. 
Mark our slots as valid, ensuring commands are visible first */ + dma_wmb(); + ummu_mcmdq_set_valid_map(mcmdq, llq.prod, head.prod); + + /* 4. If we are the owner, take control of the UMMU hardware */ + if (owner) { + /* a. Wait for previous owner to finish */ + atomic_cond_read_relaxed(&mcmdq->owner_prod, VAL == llq.prod); + + /* b. Stop gathering work by clearing the owned flag */ + prod = atomic_fetch_andnot_relaxed(MCMDQ_PROD_OWNED_FLAG, + &mcmdq->q.llq.atomic.prod); + prod &= ~MCMDQ_PROD_OWNED_FLAG; + + /* + * c. Wait for any gathered work to be written to the queue. + * Note that we read our own entries so that we have the control + * dependency required by (d). + */ + ummu_mcmdq_poll_valid_map(mcmdq, llq.prod, prod); + + /* + * d. Advance the hardware prod pointer + * Control dependency ordering from the entries becoming valid. + */ + read_lock(&mcmdq->mcmdq_lock); + writel_relaxed(prod | mcmdq->mcmdq_prod, mcmdq->q.prod_reg); + read_unlock(&mcmdq->mcmdq_lock); + + /* + * e. Tell the next owner we're done + * Make sure we've updated the hardware first, so that we don't + * race to update prod and potentially move it backwards. + */ + atomic_set_release(&mcmdq->owner_prod, prod); + } + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = ummu_queue_inc_prod_n(&llq, n); + + ret = ummu_mcmdq_poll_until_sync(ummu, mcmdq, &llq); + if (ret) + dev_err_ratelimited( + ummu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, readl_relaxed(mcmdq->q.prod_reg), + readl_relaxed(mcmdq->q.cons_reg)); + + ret = check_pa_continuity_nop_exec(&mcmdq->q, sprod); + /* + * Try to unlock the mcmdq lock. This will fail if we're the last + * reader, in which case we can safely update mcmdq->q.llq.cons + */ + if (!ummu_mcmdq_shared_tryunlock(mcmdq)) { + WRITE_ONCE(mcmdq->q.llq.cons, llq.cons); + ummu_mcmdq_shared_unlock(mcmdq); + } + } + + local_irq_restore(flags); + return ret; +} + +static int __ummu_mcmdq_issue_cmd(struct ummu_device *ummu, + struct ummu_mcmdq_ent *ent, bool sync) +{ + u64 cmd[MCMDQ_ENT_DWORDS]; + + if (unlikely(ummu_mcmdq_build_cmd(ummu, cmd, ent))) { + dev_warn(ummu->dev, "ignoring unknown MCMDQ opcode = 0x%x\n", + ent->opcode); + return -EINVAL; + } + + return ummu_mcmdq_issue_cmdlist(ummu, cmd, 1, sync); +} + +int ummu_mcmdq_issue_cmd(struct ummu_device *ummu, struct ummu_mcmdq_ent *ent) +{ + return __ummu_mcmdq_issue_cmd(ummu, ent, false); +} + +int ummu_mcmdq_issue_cmd_with_sync(struct ummu_device *ummu, + struct ummu_mcmdq_ent *ent) +{ + return __ummu_mcmdq_issue_cmd(ummu, ent, true); +} + +void ummu_mcmdq_batch_add(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds, + struct ummu_mcmdq_ent *cmd) +{ + int index; + + if (cmds->num == MCMDQ_BATCH_ENTRIES) { + (void)ummu_mcmdq_issue_cmdlist(ummu, cmds->cmds, cmds->num, false); + cmds->num = 0; + } + + index = cmds->num * MCMDQ_ENT_DWORDS; + if (unlikely(ummu_mcmdq_build_cmd(ummu, &cmds->cmds[index], cmd))) { + dev_warn(ummu->dev, "ignoring unknown MCMDQ opcode = 0x%x\n", + cmd->opcode); + return; + } + + cmds->num++; +} + +int ummu_mcmdq_batch_submit(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds) +{ + return ummu_mcmdq_issue_cmdlist(ummu, cmds->cmds, cmds->num, true); +} diff --git a/drivers/iommu/hisilicon/queue.h b/drivers/iommu/hisilicon/queue.h new file mode 100644 index 0000000000000000000000000000000000000000..2f60fd52ce8342d9af5dd24dd24109e303f46268 --- /dev/null +++ b/drivers/iommu/hisilicon/queue.h @@ -0,0 +1,324 @@ +/* 
SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: mcmdq/evtq/permq header file + */ + +#ifndef __UMMU_QUEUE_H__ +#define __UMMU_QUEUE_H__ + +#include "ummu.h" + +#define Q_IDX(llq, p) ((p) & ((1 << (llq)->log2size) - 1)) +#define Q_WRP(llq, p) ((p) & (1 << (llq)->log2size)) +#define Q_OVERFLOW_FLAG (1UL << 31) +#define Q_OVF(p) ((p)&Q_OVERFLOW_FLAG) +#define Q_ENT(q, p) ((q)->base + Q_IDX(&((q)->llq), p) * (q)->ent_dwords) + +/* + * Ensure DMA allocations are naturally aligned + * Hardware requirements base address by address length align + */ +#if IS_ENABLED(CONFIG_CMA_ALIGNMENT) +#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT) +#else +#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER) +#endif + +#define Q_BASE_RWA (1ULL << 63) +#define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5) +#define Q_BASE_LOG2SIZE GENMASK(4, 0) + +/* multiple command queue */ +#define MCMDQ_ENT_SZ_SHIFT 5 +#define MCMDQ_ENT_DWORDS ((1UL << MCMDQ_ENT_SZ_SHIFT) / sizeof(u64)) +#define MCMDQ_ENT_SIZE 16 +#define MCMDQ_MAX_SZ_SHIFT 8 + +#define UMMU_MCMDQ_OFFSET 0x100 +#define MCMDQ_PROD_OFFSET 0x8 +#define MCMDQ_CONS_OFFSET 0xC +#define MCMDQ_CONS_ERR (1UL << 23) +#define MCMDQ_PROD_ERRACK (1UL << 23) +#define MCMDQ_PROD_EN (1UL << 31) +#define MCMDQ_EN_RESP (1UL << 31) + +#define MCMDQ_CONS_ERR_REASON GENMASK(26, 24) +#define MCMDQ_CERROR_NONE_IDX 0 +#define MCMDQ_CERROR_ILL_IDX 1 +#define MCMDQ_CERROR_ABT_IDX 2 + +#define MCMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG + +#define MCMDQ_BATCH_ENTRIES 32 +#define CMD_0_OP GENMASK_ULL(7, 0) +#define CMD_0_SSV (1UL << 11) + +#define CMD_SYNC_0_CM GENMASK_ULL(13, 12) +#define CMD_SYNC_0_CM_NONE 0 +#define CMD_SYNC_0_CM_IRQ 1 +#define CMD_SYNC_0_CM_SEV 2 +#define CMD_SYNC_0_MSISH GENMASK_ULL(15, 14) +#define CMD_SYNC_0_MSIATTR GENMASK_ULL(19, 16) +#define CMD_SYNC_0_MSIDATA GENMASK_ULL(63, 32) +#define CMD_SYNC_1_MSIADDR GENMASK_ULL(51, 2) + +#define CMD_STALL_0_DSEC (1UL << 10) +#define CMD_STALL_0_RETRY (1UL << 12) +#define CMD_STALL_0_ABORT (1UL << 13) +#define CMD_STALL_1_TAG GENMASK_ULL(15, 0) +#define CMD_STALL_2_TECT_TAG GENMASK_ULL(15, 0) + +#define CMD_PREFET_0_TKV (1UL << 11) +#define CMD_PREFET_0_TID GENMASK_ULL(31, 12) +#define CMD_PREFET_0_SIZE GENMASK_ULL(56, 52) +#define CMD_PREFET_0_STRIDE GENMASK_ULL(61, 57) +#define CMD_PREFET_1_ADDR_MASK GENMASK_ULL(63, 12) +#define CMD_PREFET_2_TECTE_TAG GENMASK_ULL(15, 0) +#define CMD_PREFET_2_DEID_0 GENMASK_ULL(31, 0) +#define CMD_PREFET_2_DEID_1 GENMASK_ULL(63, 32) +#define CMD_PREFET_3_DEID_0 GENMASK_ULL(31, 0) +#define CMD_PREFET_3_DEID_1 GENMASK_ULL(63, 32) + +#define CMD_CFGI_0_LEAF (1UL << 8) +#define CMD_CFGI_0_TID GENMASK_ULL(31, 12) +#define CMD_CFGI_0_VMID GENMASK_ULL(47, 32) +#define CMD_CFGI_0_RANGE GENMASK_ULL(56, 52) +#define CMD_CFGI_2_TECTE_TAG GENMASK_ULL(15, 0) +#define CMD_CFGI_2_DEID_0 GENMASK_ULL(31, 0) +#define CMD_CFGI_2_DEID_1 GENMASK_ULL(63, 32) +#define CMD_CFGI_3_DEID_0 GENMASK_ULL(31, 0) +#define CMD_CFGI_3_DEID_1 GENMASK_ULL(63, 32) + +#define CMD_TLBI_0_LEAF (1UL << 8) +#define CMD_TLBI_0_ASID GENMASK_ULL(27, 12) +#define CMD_TLBI_0_TOKEN_ID GENMASK_ULL(31, 12) +#define CMD_TLBI_0_VMID GENMASK_ULL(47, 32) +#define CMD_TLBI_0_NUM GENMASK_ULL(56, 52) +#define CMD_TLBI_0_SCALE GENMASK_ULL(61, 57) +#define CMD_TLBI_0_TL GENMASK_ULL(63, 62) +#define CMD_TLBI_1_GS GENMASK_ULL(1, 0) +#define CMD_TLBI_1_VA_MASK GENMASK_ULL(63, 12) +#define CMD_TLBI_1_IPA_MASK GENMASK_ULL(51, 12) +#define CMD_TLBI_2_TECTE_TAG 
GENMASK_ULL(15, 0) +#define CMD_TLBI_RANGE_NUM_MAX 31 + +#define CMD_PLBI_0_TID GENMASK_ULL(31, 12) +#define CMD_PLBI_0_RANGE GENMASK_ULL(37, 32) +#define CMD_PLBI_1_ADDR_MASK GENMASK_ULL(63, 0) +#define CMD_PLBI_2_TECTE_TAG GENMASK_ULL(15, 0) + +#define CMD_CREATE_KVTBL0_EVT_EN BIT(8) +#define CMD_CREATE_KVTBL0_TAG_MASK GENMASK_ULL(31, 16) +#define CMD_CREATE_KVTBL0_KV_INDEX_MASK GENMASK_ULL(63, 32) +#define CMD_CREATE_KVTBL1_ADDR_MASK GENMASK_ULL(51, 6) +#define CMD_CREATE_KVTBL2_EID_LOW GENMASK_ULL(63, 0) +#define CMD_CREATE_KVTBL3_EID_HIGH GENMASK_ULL(63, 0) + +#define CMD_DELETE_KVTBL0_EVT_EN BIT(8) +#define CMD_DELETE_KVTBL0_TAG_MASK GENMASK_ULL(31, 16) +#define CMD_DELETE_KVTBL0_KV_INDEX_MASK GENMASK_ULL(63, 32) +#define CMD_DELETE_KVTBL2_EID_LOW GENMASK_ULL(63, 0) +#define CMD_DELETE_KVTBL3_EID_HIGH GENMASK_ULL(63, 0) + +#define CMD_NULL_OP_SUB_OP GENMASK(15, 8) +#define SUB_OP_CHECK_PA_CONTI_0_RESULT GENMASK(19, 16) +#define SUB_OP_CHECK_PA_CONTI_0_FLAG BIT(20) +#define SUB_OP_CHECK_PA_CONTI_0_SIZE GENMASK(29, 24) +#define SUB_OP_CHECK_PA_CONTI_0_ID GENMASK_ULL(40, 32) +#define SUB_OP_CHECK_PA_CONTI_1_ADDR GENMASK_ULL(63, 12) + +/* event queue */ +#define EVTQ_ENT_SZ_SHIFT 6 +#define EVTQ_ENT_DWORDS (1UL << EVTQ_ENT_SZ_SHIFT >> 3) +#define EVTQ_MAX_SZ_SHIFT (Q_MAX_SZ_SHIFT - EVTQ_ENT_SZ_SHIFT) + +#define UMMU_EVTQ_OFFSET 0x1100 +#define UMMU_EVTQ_PROD_OFFSET 0x1108 +#define UMMU_EVTQ_CONS_OFFSET 0x110C + +#define EVTQ_ENT0_CODE GENMASK(7, 0) +#define EVTQ_ENT0_RNW (1U << 11) +#define EVTQ_ENT0_IND (1U << 12) +#define EVTQ_ENT0_PNU (1U << 13) +#define EVTQ_ENT0_CLS GENMASK(15, 14) +#define EVTQ_ENT0_NSIPA (1U << 16) +#define EVTQ_ENT0_S2 (1U << 17) +#define EVTQ_ENT0_STALL (1U << 18) +#define EVTQ_ENT0_TTRNW (1U << 19) +#define EVTQ_ENT0_TID GENMASK_ULL(51, 32) + +#define EVTQ_ENT1_STAG GENMASK(15, 0) +#define EVTQ_ENT1_IMPL_DEF GENMASK(31, 16) +#define EVTQ_ENT1_REASON GENMASK_ULL(63, 32) + +#define EVTQ_ENT2_IPA GENMASK_ULL(51, 12) +#define EVTQ_ENT3_IADDR GENMASK_ULL(63, 0) +#define EVTQ_ENT4_TECTE_TAG GENMASK(15, 0) +#define EVTQ_ENT4_EID_LOW GENMASK_ULL(63, 0) +#define EVTQ_ENT5_EID_HIGH GENMASK_ULL(63, 0) +#define EVTQ_ENT6_FTADDR GENMASK_ULL(51, 3) + +struct ummu_mcmdq_batch { + u64 cmds[MCMDQ_BATCH_ENTRIES * MCMDQ_ENT_DWORDS]; + int num; +}; + +struct ummu_mcmdq_ent { + /* Common fields */ + u8 opcode; + + /* Command-specific fields */ + union { +#define CMD_SYNC 0x1 + struct { + u64 msi_addr; + bool support_sev; + } sync; + +#define CMD_STALL_RESUME 0x02 + struct { + bool dsec; + bool retry; + bool abort; + u16 tect_tag; + u16 tag; + } stall_resume; + +#define CMD_STALL_TERM 0x03 + struct { + u16 tect_tag; + } stall_term; + +#define CMD_PREFET_CFG 0x04 + struct { + bool tkv; + u32 tid; + u32 deid_0; + u32 deid_1; + u32 deid_2; + u32 deid_3; + } prefet; + +#define CMD_CFGI_TECT 0x08 +#define CMD_CFGI_TECT_RANGE 0x09 +#define CMD_CFGI_TCT 0x0A +#define CMD_CFGI_TCT_ALL 0x0B +#define CMD_CFGI_TECTS_PIDM 0x0C + struct { + bool leaf; + u32 tid; + u16 vmid; + u8 range; + u32 deid_0; + u32 deid_1; + u32 deid_2; + u32 deid_3; + } cfgi; + +#define CMD_PLBI_OS_EID 0x14 +#define CMD_PLBI_OS_EIDTID 0x15 +#define CMD_PLBI_OS_VA 0x16 + struct { + u32 tid; + u16 tecte_tag; + u8 range; + u64 addr; + } plbi; + +#define CMD_TLBI_OS_ALL 0x10 +#define CMD_TLBI_OS_TID 0x11 +#define CMD_TLBI_OS_VA 0x12 +#define CMD_TLBI_OS_VAA 0x13 +#define CMD_TLBI_HYP_ALL 0x18 +#define CMD_TLBI_HYP_TID 0x19 +#define CMD_TLBI_HYP_VA 0x1A +#define CMD_TLBI_HYP_VAA 0x1B +#define CMD_TLBI_S1S2_VMALL 0x28 +#define 
CMD_TLBI_S2_IPA 0x2A +#define CMD_TLBI_NS_OS_ALL 0x2C +#define CMD_TLBI_OS_ALL_U 0x90 +#define CMD_TLBI_OS_ASID_U 0x91 +#define CMD_TLBI_OS_VA_U 0x92 +#define CMD_TLBI_OS_VAA_U 0x93 +#define CMD_TLBI_HYP_ASID_U 0x99 +#define CMD_TLBI_HYP_VA_U 0x9A +#define CMD_TLBI_S1S2_VMALL_U 0xA8 +#define CMD_TLBI_S2_IPA_U 0xAA + struct { + bool leaf; + u16 asid; + u16 vmid; + u32 tid; + u16 tect_tag; + u8 num; + u8 scale; + u8 tl; + u8 gs; + u64 addr; + } tlbi; + +#define CMD_RESUME 0x44 + struct { + u32 deid; + u16 stag; + u8 resp; + } resume; + +#define CMD_CREATE_KVTBL 0x60 + struct { + bool evt_en; + u16 tecte_tag; + u32 kv_index; + u64 tect_base_addr; + u64 eid_low; + u64 eid_high; + } create_kvtbl; + +#define CMD_DELETE_KVTBL 0x61 + struct { + bool evt_en; + u16 tecte_tag; + u32 kv_index; + u64 eid_low; + u64 eid_high; + } delete_kvtbl; + +#define CMD_NULL_OP 0x62 + struct { + u8 sub_op; + union { +#define SUB_CMD_NULL_CHECK_PA_CONTINUITY 0x1 + struct { + u16 result; + u16 flag; + u32 size_order; + u32 id; + u64 addr; + } check_pa_conti; + }; + } null_op; + }; +}; + +void ummu_queue_write(__le64 *dst, u64 *src, size_t n_dwords); +void ummu_queue_read(u64 *dst, __le64 *src, size_t n_dwords); +int ummu_queue_remove_raw(struct ummu_queue *q, u64 *ent); +int ummu_queue_sync_prod_in(struct ummu_queue *q); +bool ummu_queue_empty(struct ummu_ll_queue *q); +int ummu_write_evtq_regs(struct ummu_device *ummu); +int ummu_init_queues(struct ummu_device *ummu); +int ummu_device_mcmdq_init_cfg(struct ummu_device *ummu); +int ummu_mcmdq_issue_cmd(struct ummu_device *ummu, struct ummu_mcmdq_ent *ent); +int ummu_mcmdq_build_cmd(struct ummu_device *ummu, u64 *cmd, + struct ummu_mcmdq_ent *ent); +int ummu_mcmdq_issue_cmdlist(struct ummu_device *ummu, u64 *cmds, + int n, bool sync); +int ummu_mcmdq_issue_cmd_with_sync(struct ummu_device *ummu, + struct ummu_mcmdq_ent *ent); +void ummu_mcmdq_batch_add(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds, + struct ummu_mcmdq_ent *cmd); +int ummu_mcmdq_batch_submit(struct ummu_device *ummu, + struct ummu_mcmdq_batch *cmds); +#endif /* __UMMU_QUEUE_H__ */ diff --git a/drivers/iommu/hisilicon/regs.h b/drivers/iommu/hisilicon/regs.h new file mode 100644 index 0000000000000000000000000000000000000000..23d3b033c60958605f6330f9566fcf7ca16ae601 --- /dev/null +++ b/drivers/iommu/hisilicon/regs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
*/ + +#ifndef __UMMU_REGS_H__ +#define __UMMU_REGS_H__ + +#define UMMU_REG_SZ 0x5000 + +/* MMIO registers */ +#define UMMU_IIDR 0x0 +#define IIDR_PROD_ID GENMASK(19, 8) +#define IIDR_PROD_VARIANT GENMASK(7, 4) +#define UMMU_AIDR 0x4 + +#define UMMU_CAP0 0x10 +#define CAP0_TECT_LVL_BIT (1UL << 19) +#define CAP0_TECT_MODE_MASK GENMASK(18, 17) +#define CAP0_TCT_LVL_BIT (1UL << 16) +#define CAP0_S2_ATTR_TYPE (1UL << 15) +#define CAP0_ATTR_TYPES_OVR (1UL << 14) +#define CAP0_ATTR_PERMS_OVR (1UL << 13) +#define CAP0_TIDSIZE_MASK GENMASK(12, 8) +#define CAP0_DEIDSIZE_MASK GENMASK(7, 0) + +#define UMMU_CAP1 0x14 +#define CAP1_STALL_MAX GENMASK(31, 20) +#define CAP1_EVENT_GEN (1UL << 19) +#define CAP1_MCMDQ_SUPPORT (1UL << 18) +#define CAP1_MCMDQ_LOG2NUM GENMASK(17, 14) +#define CAP1_MCMDQ_LOG2SIZE GENMASK(13, 10) +#define MCMDQ_MAX_LOG2SIZE 15 +#define CAP1_EVENTQ_SUPPORT (1UL << 9) +#define CAP1_EVENTQ_LOG2NUM GENMASK(8, 5) +#define CAP1_EVENTQ_LOG2SIZE GENMASK(4, 0) +#define EVTQ_MAX_LOG2SIZE 19 + +#define UMMU_CAP2 0x18 +#define CAP2_TTF_MASK GENMASK(15, 14) +#define CAP2_TTF_IAS_40 40 +#define CAP2_TTF_AARCH64 2 +#define CAP2_TTF_AARCH32_64 3 +#define CAP2_TRANS_SMALL_BIT (1UL << 13) +#define CAP2_S1P_BIT (1UL << 12) +#define CAP2_S2P_BIT (1UL << 11) +#define CAP2_VA_EXT_MASK GENMASK(10, 9) +#define CAP2_VA_EXT_52 1 +#define CAP2_GRAN64K_BIT (1UL << 8) +#define CAP2_GRAN16K_BIT (1UL << 7) +#define CAP2_GRAN4K_BIT (1UL << 6) +#define CAP2_OAS_MASK GENMASK(5, 3) +#define CAP2_OAS_32_BIT 0 +#define CAP2_OAS_36_BIT 1 +#define CAP2_OAS_40_BIT 2 +#define CAP2_OAS_42_BIT 3 +#define CAP2_OAS_44_BIT 4 +#define CAP2_OAS_48_BIT 5 +#define CAP2_TTF_OAS_32 32 +#define CAP2_TTF_OAS_36 36 +#define CAP2_TTF_OAS_40 40 +#define CAP2_TTF_OAS_42 42 +#define CAP2_TTF_OAS_44 44 +#define CAP2_TTF_OAS_48 48 +#define CAP2_RTLBI_BIT (1UL << 2) +#define CAP2_BTLBI_BIT (1UL << 1) +#define CAP2_VMIDTLBI_BIT (1UL << 0) + +#define UMMU_CAP3 0x1C +#define CAP3_SATIMAX_MASK GENMASK(20, 15) +#define CAP3_TERM_MODEL_BIT (1UL << 14) +#define CAP3_STALL_MODEL_MASK GENMASK(13, 12) +#define CAP3_STALL_MODE 0 +#define CAP3_STALL_MODE_FORCE 2 +#define CAP3_MSI_SUPPORT_BIT (1UL << 11) +#define CAP3_HYP_S1CTX_BIT (1UL << 10) +#define CAP3_HTTU_MASK GENMASK(9, 8) +#define CAP3_HTTU_ACCESS_DIRTY 2 +#define CAP3_HTTU_ACCESS 1 +#define CAP3_MTM_BIT (1UL << 7) +#define CAP3_TTENDIAN_MASK GENMASK(6, 5) +#define CAP3_TTENDIAN_MIXED 0 +#define CAP3_TTENDIAN_LE 2 +#define CAP3_TTENDIAN_BE 3 +#define CAP3_COHACC_BIT (1UL << 4) +#define CAP3_BBML_MASK GENMASK(3, 2) +#define CAP3_BBML0 0 +#define CAP3_BBML1 1 +#define CAP3_BBML2 2 +#define CAP3_S2_EXE_NEVER_CTRL_BIT (1UL << 1) +#define CAP3_HIER_ARRT_DISABLE_BIT (1UL << 0) + +#define UMMU_CAP4 0x20 +#define CAP4_UEQ_SUPPORT (1UL << 24) +#define CAP4_UEQ_LOG2NUM GENMASK(23, 20) +#define CAP4_UEQ_LOG2SIZE GENMASK(19, 16) +#define CAP4_UCPLQ_LOG2SIZE GENMASK(15, 12) +#define CAP4_UCMDQ_LOG2SIZE GENMASK(11, 8) +#define CAP4_UCMDQ_CPLQ_LOG2NUM GENMASK(7, 0) + +#define UMMU_CAP5 0x24 +#define CAP5_BRDCAST_PLBI_BIT (1UL << 9) +#define CAP5_RANGE_PLBI_BIT (1UL << 8) +#define CAP5_TKVALCHK_MOD GENMASK(7, 6) +#define CAP5_TKVALCHK_BIT (1UL << 5) +#define CAP5_PT_GRAN4K_BIT (1UL << 4) +#define CAP5_PT_GRAN2M_BIT (1UL << 3) +#define CAP5_MAPT_MODE_MASK GENMASK(2, 1) +#define CAP5_MAPT_SUPPORT (1UL << 0) + +#define UMMU_CAP6 0x28 +#define CAP6_MTM_GP_MAX GENMASK(23, 16) +#define CAP6_MTM_ID_MAX GENMASK(15, 0) + +#define UMMU_CR0 0x30 +#define UMMU_CR0ACK 0x34 +#define CR0_MAPT_EN (1UL << 5) +#define 
CR0_VMID_WILDCARD_MASK GENMASK(4, 2) +#define CR0_EVENTQ_EN (1UL << 1) +#define CR0_UMMU_EN (1UL << 0) + +#define UMMU_CR1 0x38 +#define CR1_TECT_MODE_SEL (1UL << 15) +#define CR1_PRIVATE_TLB (1UL << 14) +#define CR1_BAD_EID_RECORD (1UL << 13) +#define CR1_E2H (1UL << 12) +#define CR1_TABLE_SH GENMASK(11, 10) +#define CR1_TABLE_OC GENMASK(9, 8) +#define CR1_TABLE_IC GENMASK(7, 6) +#define CR1_QUEUE_SH GENMASK(5, 4) +#define CR1_QUEUE_OC GENMASK(3, 2) +#define CR1_QUEUE_IC GENMASK(1, 0) + +#define UMMU_CR2 0x3C +#define CR2_PRIVATE_PLB (1UL << 6) +#define CR2_UE_QUEUE_SH GENMASK(5, 4) +#define CR2_UE_QUEUE_OC GENMASK(3, 2) +#define CR2_UE_QUEUE_IC GENMASK(1, 0) + +#define UMMU_CR3 0x40 +#define CR3_UPDATE_FLAG (1UL << 31) +#define CR3_TRANS_MTM_GP GENMASK(23, 16) +#define CR3_TRANS_MTM_ID GENMASK(15, 0) + +#define UMMU_GBPA 0x50 +#define GBPA_UPDATE_BIT (1UL << 31) +#define GBPA_ABORT_BIT (1UL << 15) + +#define UMMU_EVENT_QUE_MSI_ADDR0 0x1110 +#define UMMU_MSI_ADDR1_OFFSET 0x04 +#define UMMU_MSI_ADDR_MASK GENMASK_ULL(51, 2) +#define UMMU_EVENT_QUE_MSI_DATA 0x1118 +#define UMMU_EVENT_QUE_MSI_ATTR 0x111C + +#define UMMU_GLB_IRQ_EN 0x1130 +#define IRQ_CTRL_EVTQ_IRQEN (1UL << 1) +#define IRQ_CTRL_GERROR_IRQEN (1UL << 0) + +#define UMMU_GERROR 0x1134 +#define GERROR_MSI_GERR_ABT_ERR (1UL << 7) +#define GERROR_MSI_UIEQ_ABT_ERR (1UL << 4) +#define GERROR_MSI_EVTQ_ABT_ERR (1UL << 3) +#define GERROR_MSI_MCMDQ_ABT_ERR (1UL << 2) +#define GERROR_EVTQ_ABT_ERR (1UL << 1) +#define GERROR_MCMDQ_ERR (1UL << 0) +#define GERROR_ERR_MASK GENMASK(6, 0) +#define UMMU_GERRORN 0x1138 + +#define UMMU_GLB_ERR_INT_MSI_ADDR0 0x1140 +#define UMMU_GLB_ERR_INT_MSI_DATA 0x1148 +#define UMMU_GLB_ERR_INT_MSI_ATTR 0x114C + +/* Common memory attribute values */ +#define UMMU_SH_NSH 0 +#define UMMU_CACHE_WB 1 +#define UMMU_SH_OSH 2 +#define UMMU_SH_ISH 3 +#define UMMU_MEMATTR_DEVICE_nGnRE 0x1 +#define UMMU_MEMATTR_OIWB 0xf + +#define UMMU_REG_POLL_TIMEOUT_US 5 +#define UMMU_QUE_POLL_TIMEOUT_US 100000 +#define UMMU_POLL_SPIN_COUNT 10 + +#define PERMQ_RELEASE_TIMEOUT_US 100 +#define UMMU_CONS_POLL_TIMEOUT_US 5 + +#endif /* __UMMU_REGS_H__ */ diff --git a/drivers/iommu/hisilicon/ummu.h b/drivers/iommu/hisilicon/ummu.h new file mode 100644 index 0000000000000000000000000000000000000000..7eb8089eeb2db145b047f72878d368678df98574 --- /dev/null +++ b/drivers/iommu/hisilicon/ummu.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. 
+ * Description: UMMU Device's implementations + */ + +#ifndef __UMMU_H__ +#define __UMMU_H__ + +#include +#include +#include +#include +#include + +extern struct platform_driver ummu_driver; + +enum ummu_device_msi_index { + EVTQ_MSI_INDEX, + GERROR_MSI_INDEX, + UMMU_MAX_MSIS, +}; + +struct ummu_tct_desc { + u32 asid; +}; + +/* translation stage1 table config */ +struct ummu_s1_cfg { + struct ummu_tct_desc tct; +}; + +/* translation stage2 table config */ +struct ummu_s2_cfg { + u16 vmid; +}; + +enum ummu_domain_stage { + UMMU_DOMAIN_S1 = 0, + UMMU_DOMAIN_S2, +}; + +struct ummu_ll_queue { + union { + u64 val; + struct { + u32 prod; + u32 cons; + }; + struct { + atomic_t prod; + atomic_t cons; + } atomic; + u8 __pad[SMP_CACHE_BYTES]; + } ____cacheline_aligned_in_smp; + u32 log2size; +}; + +struct ummu_queue { + struct ummu_ll_queue llq; + __le64 *base; + phys_addr_t base_pa; + u64 q_base; + + size_t ent_dwords; + u32 __iomem *prod_reg; + u32 __iomem *cons_reg; +}; + +struct ummu_mcmdq { + struct ummu_queue q; + atomic_long_t *valid_map; + atomic_t owner_prod; + u32 mcmdq_prod; + rwlock_t mcmdq_lock; + atomic_t lock; + int configured; + int shared; + void __iomem *base; +}; + +struct ummu_evtq { + struct ummu_queue q; + u32 max_stalls; +}; + +struct ummu_capability { +#define UMMU_FEAT_2_LVL_TECT BIT(0) +#define UMMU_FEAT_2_LVL_TCT BIT(1) +#define UMMU_FEAT_MCMDQ BIT(2) +#define UMMU_FEAT_EVENTQ BIT(3) +#define UMMU_FEAT_SEV BIT(4) +#define UMMU_FEAT_TRANS_S1 BIT(5) +#define UMMU_FEAT_TRANS_S2 BIT(6) +#define UMMU_FEAT_RANGE_INV BIT(7) +#define UMMU_FEAT_STALLS BIT(8) +#define UMMU_FEAT_STALL_FORCE BIT(9) +#define UMMU_FEAT_MSI BIT(10) +#define UMMU_FEAT_HYP BIT(11) +#define UMMU_FEAT_HA BIT(12) +#define UMMU_FEAT_HD BIT(13) +#define UMMU_FEAT_MTM BIT(14) +#define UMMU_FEAT_TT_LE BIT(15) +#define UMMU_FEAT_TT_BE BIT(16) +#define UMMU_FEAT_COHERENCY BIT(17) +#define UMMU_FEAT_BBML1 BIT(18) +#define UMMU_FEAT_BBML2 BIT(19) +#define UMMU_FEAT_VAX BIT(20) +#define UMMU_FEAT_BTM BIT(21) +#define UMMU_FEAT_SVA BIT(22) +#define UMMU_FEAT_E2H BIT(23) +#define UMMU_FEAT_MAPT BIT(24) +#define UMMU_FEAT_RANGE_PLBI BIT(25) +#define UMMU_FEAT_TOKEN_CHK BIT(26) +#define UMMU_FEAT_PERMQ BIT(27) +#define UMMU_FEAT_NESTING BIT(28) + + u32 features; + u32 deid_bits; + u32 tid_bits; + u64 pgsize_bitmap; + u32 ias; + u32 oas; + u64 ptsize_bitmap; +#define UMMU_OPT_MSIPOLL (1UL << 0) +#define UMMU_OPT_DOUBLE_PLBI (1UL << 1) + u32 options; + +#define UMMU_MAX_ASIDS (1UL << 16) + unsigned int asid_bits; +#define UMMU_MAX_VMIDS (1UL << 16) + unsigned int vmid_bits; + + bool support_mapt; + u32 mcmdq_log2num; + u32 mcmdq_log2size; + u32 evtq_log2num; + u32 evtq_log2size; + u32 permq_num; + struct { + u32 cmdq_num; + u32 cplq_num; + } permq_ent_num; + u32 mtm_gp_max; + u32 mtm_id_max; + u16 prod_ver; +}; + +struct ummu_device { + struct device *dev; + void __iomem *base; + + struct ummu_capability cap; + + u32 nr_mcmdq; + struct ummu_mcmdq *__percpu *mcmdq; + struct ummu_evtq evtq; + + struct ummu_core_device core_dev; + const struct ummu_device_helper *helper_ops; + struct list_head list; +}; + +struct ummu_domain_cfgs { + enum ummu_domain_stage stage; + + u32 tecte_tag; + + union { + struct ummu_s1_cfg s1_cfg; + struct ummu_s2_cfg s2_cfg; + }; +}; + +struct ummu_domain { + struct ummu_base_domain base_domain; + struct ummu_domain_cfgs cfgs; +}; + +static inline +struct ummu_device *core_to_ummu_device(struct ummu_core_device *ummu_core_dev) +{ + return container_of(ummu_core_dev, struct ummu_device, core_dev); +} 
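+
+/*
+ * to_ummu_domain() below recovers the driver-private ummu_domain from a
+ * generic iommu_domain: the iommu_domain is embedded in ummu_base_domain,
+ * which is in turn embedded in ummu_domain, hence the two container_of()
+ * steps.
+ */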
+ +static inline struct ummu_domain *to_ummu_domain(struct iommu_domain *dom) +{ + struct ummu_base_domain *base_dom = + container_of(dom, struct ummu_base_domain, domain); + + return container_of(base_dom, struct ummu_domain, base_domain); +} + +int ummu_write_reg_sync(struct ummu_device *ummu, u32 val, + u32 reg_off, u32 ack_off); + +#endif /* __UMMU_H__ */ diff --git a/drivers/iommu/hisilicon/ummu_main.c b/drivers/iommu/hisilicon/ummu_main.c new file mode 100644 index 0000000000000000000000000000000000000000..3992e248dc5988d82336a87f420a6758d4e20f83 --- /dev/null +++ b/drivers/iommu/hisilicon/ummu_main.c @@ -0,0 +1,670 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. All rights reserved. + * Description: UMMU Device's implementations + */ + +#define pr_fmt(fmt) "UMMU: " fmt + +#include +#include +#include +#include + +#include "interrupt.h" +#include "queue.h" +#include "regs.h" +#include "flush.h" +#include "ummu.h" + +#define UMMU_DRV_NAME "ummu" + +int ummu_write_reg_sync(struct ummu_device *ummu, u32 val, + u32 reg_off, u32 ack_off) +{ + u32 reg; + + writel_relaxed(val, ummu->base + reg_off); + return readl_relaxed_poll_timeout(ummu->base + ack_off, reg, reg == val, + 1, UMMU_REG_POLL_TIMEOUT_US); +} + +static int ummu_update_gbpa(struct ummu_device *ummu, u32 set, + u32 clr) +{ + void __iomem *gbpa = ummu->base + UMMU_GBPA; + u32 reg; + int ret; + + ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE_BIT), 1, + UMMU_REG_POLL_TIMEOUT_US); + if (ret) + return ret; + + reg &= ~clr; + reg |= set; + writel_relaxed(reg | GBPA_UPDATE_BIT, gbpa); + ret = readl_relaxed_poll_timeout(gbpa, reg, !(reg & GBPA_UPDATE_BIT), 1, + UMMU_REG_POLL_TIMEOUT_US); + if (ret) + dev_err(ummu->dev, "GBPA not responding to update\n"); + return ret; +} + +static int ummu_ioremap(struct ummu_device *ummu, resource_size_t start, + resource_size_t size) +{ + struct resource res = DEFINE_RES_MEM(start, size); + + ummu->base = devm_ioremap_resource(ummu->dev, &res); + if (IS_ERR(ummu->base)) + return PTR_ERR(ummu->base); + + return 0; +} + +static int ummu_device_register(struct ummu_device *ummu) +{ + int ret; + + ret = iommu_device_sysfs_add(&ummu->core_dev.iommu, ummu->dev, NULL, + "%s", dev_name(ummu->dev)); + if (ret) + dev_err(ummu->dev, "add iommu sysfs failed, ret = %d.\n", ret); + + return ret; +} + +static void ummu_device_unregister(struct ummu_device *ummu) +{ + iommu_device_sysfs_remove(&ummu->core_dev.iommu); +} + +static int ummu_init_structures(struct ummu_device *ummu) +{ + int ret; + + ret = ummu_init_queues(ummu); + if (ret) { + dev_err(ummu->dev, "init queues failed\n"); + return ret; + } + + return 0; +} + +static void ummu_device_hw_probe_ver(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_IIDR); + + ummu->cap.prod_ver = (u16)FIELD_GET(IIDR_PROD_ID, reg); + /* + * On the hisi chip with IIDR_PROD_ID set to 0, + * ummu enables special_identify to perform some + * specialized operations. 
+ */ + if (!ummu->cap.prod_ver) { + ummu->cap.options |= UMMU_OPT_DOUBLE_PLBI; + ummu->cap.features &= ~UMMU_FEAT_STALLS; + } +} + +static void ummu_device_hw_probe_cap0(struct ummu_device *ummu) +{ + u32 reg, pasids, ubrt_pasids, cap_pasids; + + reg = readl_relaxed(ummu->base + UMMU_CAP0); + + /* 2-level tect structures */ + if (reg & CAP0_TECT_LVL_BIT) + ummu->cap.features |= UMMU_FEAT_2_LVL_TECT; + + /* 2-level tct structures */ + if (reg & CAP0_TCT_LVL_BIT) + ummu->cap.features |= UMMU_FEAT_2_LVL_TCT; + + /* TID size */ + ummu->cap.tid_bits = FIELD_GET(CAP0_TIDSIZE_MASK, reg); + /* The tid cap should follow the UB protocol */ + ubrt_pasids = ummu->core_dev.iommu.max_pasids; + cap_pasids = 1 << ummu->cap.tid_bits; + if (ubrt_pasids > cap_pasids) + pr_warn("ubrt max_pasids[%u] beyond capacity.\n", ubrt_pasids); + pasids = min(cap_pasids, (1UL << UB_MAX_TID_BITS)); + ummu->core_dev.iommu.max_pasids = min(ubrt_pasids, pasids); + /* TECTE_TAG size */ + ummu->cap.deid_bits = FIELD_GET(CAP0_DEIDSIZE_MASK, reg); +} + +static void ummu_device_hw_probe_cap1(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP1); + + /* Maximum number of outstanding stalls */ + ummu->evtq.max_stalls = FIELD_GET(CAP1_STALL_MAX, reg); + + /* Support generation of WFE wake-up events to PE */ + if (reg & CAP1_EVENT_GEN) + ummu->cap.features |= UMMU_FEAT_SEV; + + /* MCMDQ's support, numbers and depth */ + if (reg & CAP1_MCMDQ_SUPPORT) { + ummu->cap.features |= UMMU_FEAT_MCMDQ; + ummu->cap.mcmdq_log2num = FIELD_GET(CAP1_MCMDQ_LOG2NUM, reg); + ummu->cap.mcmdq_log2size = min(FIELD_GET(CAP1_MCMDQ_LOG2SIZE, reg), + MCMDQ_MAX_LOG2SIZE); + } + + /* EVENTQ's support, numbers and depth */ + if (reg & CAP1_EVENTQ_SUPPORT) { + ummu->cap.features |= UMMU_FEAT_EVENTQ; + ummu->cap.evtq_log2num = FIELD_GET(CAP1_EVENTQ_LOG2NUM, reg); + ummu->cap.evtq_log2size = min(FIELD_GET(CAP1_EVENTQ_LOG2SIZE, reg), + EVTQ_MAX_LOG2SIZE); + + } +} + +static int ummu_device_get_ttf(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP2_TTF_MASK, reg)) { + case CAP2_TTF_AARCH32_64: + ummu->cap.ias = CAP2_TTF_IAS_40; + break; + case CAP2_TTF_AARCH64: + break; + default: + dev_err(ummu->dev, "page table format not supported!\n"); + return -ENXIO; + } + return 0; +} + +static int ummu_device_get_trans_stage(struct ummu_device *ummu, u32 reg) +{ + if (!(reg & (CAP2_S1P_BIT | CAP2_S2P_BIT))) { + dev_err(ummu->dev, "no translation stage support!\n"); + return -ENXIO; + } + + if (reg & CAP2_S1P_BIT) + ummu->cap.features |= UMMU_FEAT_TRANS_S1; + + if (reg & CAP2_S2P_BIT) + ummu->cap.features |= UMMU_FEAT_TRANS_S2; + + if ((ummu->cap.features & UMMU_FEAT_TRANS_S1) && + (ummu->cap.features & UMMU_FEAT_TRANS_S2)) + ummu->cap.features |= UMMU_FEAT_NESTING; + + return 0; +} + +static void ummu_device_get_pgsize(struct ummu_device *ummu, u32 reg) +{ + /* page sizes */ + if (reg & CAP2_GRAN64K_BIT) + ummu->cap.pgsize_bitmap |= SZ_64K | SZ_512M; + if (reg & CAP2_GRAN16K_BIT) + ummu->cap.pgsize_bitmap |= SZ_16K | SZ_32M; + if (reg & CAP2_GRAN4K_BIT) + ummu->cap.pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G; +} + +static void ummu_device_get_oas(struct ummu_device *ummu, u32 reg) +{ + /* output address size */ + switch (FIELD_GET(CAP2_OAS_MASK, reg)) { + case CAP2_OAS_32_BIT: + ummu->cap.oas = CAP2_TTF_OAS_32; + break; + case CAP2_OAS_36_BIT: + ummu->cap.oas = CAP2_TTF_OAS_36; + break; + case CAP2_OAS_40_BIT: + ummu->cap.oas = CAP2_TTF_OAS_40; + break; + case CAP2_OAS_42_BIT: + ummu->cap.oas = CAP2_TTF_OAS_42; + break; + case CAP2_OAS_44_BIT: 
+ ummu->cap.oas = CAP2_TTF_OAS_44; + break; + default: + dev_warn(ummu->dev, + "unknown output address size. truncating to 48-bit\n"); + fallthrough; + case CAP2_OAS_48_BIT: + ummu->cap.oas = CAP2_TTF_OAS_48; + break; + } +} + +static int ummu_device_hw_probe_cap2(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP2); + int ret = ummu_device_get_ttf(ummu, reg); + + if (ret) + return ret; + + ret = ummu_device_get_trans_stage(ummu, reg); + if (ret) + return ret; + + /* input address size */ + if (FIELD_GET(CAP2_VA_EXT_MASK, reg) == CAP2_VA_EXT_52) + ummu->cap.features |= UMMU_FEAT_VAX; + + ummu_device_get_pgsize(ummu, reg); + + ummu_device_get_oas(ummu, reg); + + ummu->cap.ias = max(ummu->cap.ias, ummu->cap.oas); + + if (FIELD_GET(CAP2_RTLBI_BIT, reg)) + ummu->cap.features |= UMMU_FEAT_RANGE_INV; + + if (FIELD_GET(CAP2_BTLBI_BIT, reg)) + ummu->cap.features |= UMMU_FEAT_BTM; + else + dev_warn(ummu->dev, "don't support BTM!\n"); + return 0; +} + +static void ummu_device_get_stall_model(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_STALL_MODEL_MASK, reg)) { + case CAP3_STALL_MODE_FORCE: + ummu->cap.features |= UMMU_FEAT_STALL_FORCE; + fallthrough; + case CAP3_STALL_MODE: + ummu->cap.features |= UMMU_FEAT_STALLS; + default: + break; + } +} + +static void ummu_device_get_httu(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_HTTU_MASK, reg)) { + case CAP3_HTTU_ACCESS_DIRTY: + ummu->cap.features |= UMMU_FEAT_HD; + fallthrough; + case CAP3_HTTU_ACCESS: + ummu->cap.features |= UMMU_FEAT_HA; + default: + break; + } +} + +static int ummu_device_get_ttendian(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_TTENDIAN_MASK, reg)) { + case CAP3_TTENDIAN_MIXED: + ummu->cap.features |= UMMU_FEAT_TT_LE | UMMU_FEAT_TT_BE; + break; +#ifdef __BIG_ENDIAN + case CAP3_TTENDIAN_BE: + break; +#else + case CAP3_TTENDIAN_LE: + break; +#endif + default: + dev_err(ummu->dev, "unknown/unsupported TT endianness!\n"); + return -ENXIO; + } + return 0; +} + +static void ummu_device_get_bbm_level(struct ummu_device *ummu, u32 reg) +{ + switch (FIELD_GET(CAP3_BBML_MASK, reg)) { + case CAP3_BBML0: + break; + case CAP3_BBML1: + ummu->cap.features |= UMMU_FEAT_BBML1; + break; + case CAP3_BBML2: + ummu->cap.features |= UMMU_FEAT_BBML2; + break; + default: + dev_warn(ummu->dev, "unknown/unsupported BBM behavior level\n"); + } +} + +static int ummu_device_hw_probe_cap3(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP3); + int ret; + + ummu_device_get_stall_model(ummu, reg); + + if (reg & CAP3_MSI_SUPPORT_BIT) + ummu->cap.features |= UMMU_FEAT_MSI; + + if (reg & CAP3_HYP_S1CTX_BIT) { + ummu->cap.features |= UMMU_FEAT_HYP; + if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) { + ummu->cap.features |= UMMU_FEAT_E2H; + pr_debug("support hypervisor and E2H\n"); + } + } + + ummu_device_get_httu(ummu, reg); + + if (reg & CAP3_MTM_BIT) + ummu->cap.features |= UMMU_FEAT_MTM; + + ret = ummu_device_get_ttendian(ummu, reg); + if (ret) + return ret; + + if (reg & CAP3_COHACC_BIT) { + ummu->cap.features |= UMMU_FEAT_COHERENCY; + if (ummu->cap.features & UMMU_FEAT_MSI) + ummu->cap.options |= UMMU_OPT_MSIPOLL; + } + + ummu_device_get_bbm_level(ummu, reg); + + return 0; +} + +static int ummu_device_hw_probe_cap4(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP4); + int hw_permq_ent; + + hw_permq_ent = 1 << FIELD_GET(CAP4_UCMDQ_LOG2SIZE, reg); + ummu->cap.permq_ent_num.cmdq_num = hw_permq_ent; + + hw_permq_ent = 1 << 
FIELD_GET(CAP4_UCPLQ_LOG2SIZE, reg); + ummu->cap.permq_ent_num.cplq_num = hw_permq_ent; + + return 0; +} + +static void ummu_device_hw_probe_cap5(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CAP5); + + if (reg & CAP5_RANGE_PLBI_BIT) + ummu->cap.features |= UMMU_FEAT_RANGE_PLBI; + + if (reg & CAP5_MAPT_SUPPORT) + ummu->cap.support_mapt = true; + + if (reg & CAP5_PT_GRAN4K_BIT) + ummu->cap.ptsize_bitmap |= SZ_4K; + + if (reg & CAP5_PT_GRAN2M_BIT) + ummu->cap.ptsize_bitmap |= SZ_2M; + + if (reg & CAP5_TKVALCHK_BIT) + ummu->cap.features |= UMMU_FEAT_TOKEN_CHK; + + /* + * the ASID and VMID capabilities are determined based on + * the bit widths of the ASID and VMID in the configuration table. + */ + ummu->cap.asid_bits = ilog2(UMMU_MAX_ASIDS); + ummu->cap.vmid_bits = ilog2(UMMU_MAX_VMIDS); + + dev_info(ummu->dev, "ias = %u-bit, oas = %u-bit, features = 0x%08x.\n", + ummu->cap.ias, ummu->cap.oas, ummu->cap.features); +} + +static void ummu_device_hw_probe_cap6(struct ummu_device *ummu) +{ + u32 reg; + + if (ummu->cap.features & UMMU_FEAT_MTM) { + reg = readl_relaxed(ummu->base + UMMU_CAP6); + ummu->cap.mtm_id_max = FIELD_GET(CAP6_MTM_ID_MAX, reg); + ummu->cap.mtm_gp_max = FIELD_GET(CAP6_MTM_GP_MAX, reg); + } + dev_dbg(ummu->dev, "partid_max %u, pmg_max %u.\n", ummu->cap.mtm_id_max, + ummu->cap.mtm_gp_max); +} + +static int ummu_device_hw_init(struct ummu_device *ummu) +{ + int ret; + + ummu_device_hw_probe_cap0(ummu); + ummu_device_hw_probe_cap1(ummu); + + ret = ummu_device_hw_probe_cap2(ummu); + if (ret) + return ret; + + ret = ummu_device_hw_probe_cap3(ummu); + if (ret) + return ret; + + ret = ummu_device_hw_probe_cap4(ummu); + if (ret) + return ret; + + ummu_device_hw_probe_cap5(ummu); + ummu_device_hw_probe_cap6(ummu); + ummu_device_hw_probe_ver(ummu); + + return 0; +} + +static void ummu_device_sync(struct ummu_device *ummu) +{ + u32 reg = readl_relaxed(ummu->base + UMMU_CR0); + + if (reg & CR0_UMMU_EN) { + dev_warn(ummu->dev, "ummu currently enabled! 
Resetting...\n"); + ummu_update_gbpa(ummu, GBPA_ABORT_BIT, 0); + } +} + +static int ummu_device_disable(struct ummu_device *ummu) +{ + int ret; + + ret = ummu_write_reg_sync(ummu, 0, UMMU_CR0, UMMU_CR0ACK); + if (ret) + dev_err(ummu->dev, "disable ummu interface failed, ret = %d.\n", ret); + + return ret; +} + +static int ummu_device_enable(struct ummu_device *ummu) +{ + int ret; + u32 cr0; + + cr0 = readl_relaxed(ummu->base + UMMU_CR0); + cr0 |= CR0_UMMU_EN; + ret = ummu_write_reg_sync(ummu, cr0, UMMU_CR0, UMMU_CR0ACK); + if (ret) + dev_err(ummu->dev, "enable ummu interface failed.\n"); + + return ret; +} + +static void ummu_device_set_mem_attr(struct ummu_device *ummu) +{ + u32 reg; + + reg = CR1_TECT_MODE_SEL | CR1_E2H | + FIELD_PREP(CR1_TABLE_SH, UMMU_SH_ISH) | + FIELD_PREP(CR1_TABLE_OC, UMMU_CACHE_WB) | + FIELD_PREP(CR1_TABLE_IC, UMMU_CACHE_WB) | + FIELD_PREP(CR1_QUEUE_SH, UMMU_SH_ISH) | + FIELD_PREP(CR1_QUEUE_OC, UMMU_CACHE_WB) | + FIELD_PREP(CR1_QUEUE_IC, UMMU_CACHE_WB); + + writel_relaxed(reg, ummu->base + UMMU_CR1); +} + +static int ummu_device_reset(struct ummu_device *ummu) +{ + int ret; + + ummu_device_sync(ummu); + + ret = ummu_device_disable(ummu); + if (ret) + return ret; + + /* set configuration table and queue memory attributes */ + ummu_device_set_mem_attr(ummu); + + ret = ummu_device_mcmdq_init_cfg(ummu); + if (ret) + return ret; + + ret = ummu_write_evtq_regs(ummu); + if (ret) + return ret; + + ummu_setup_irqs(ummu); + ummu_sync_tect_all(ummu); + ummu_init_flush_iotlb(ummu); + + return ummu_device_enable(ummu); +} + +static int ummu_device_ubrt_probe(struct ummu_device *ummu) +{ + struct fwnode_handle *fwnode = dev_fwnode(ummu->dev); + struct ubrt_fwnode *fw; + struct ummu_node *node; + + if (!fwnode) + return -EINVAL; + + fw = ubrt_fwnode_get(fwnode); + if (!fw) { + dev_err(ummu->dev, "get ubrt fwnode failed!\n"); + return -ENXIO; + } + + if (fw->type != UBRT_UMMU) { + dev_err(ummu->dev, "get invalid ubct type!\n"); + return -ESPIPE; + } + + node = (struct ummu_node *)fw->ubrt_node; + + ummu->core_dev.iommu.min_pasids = node->min_tid; + ummu->core_dev.iommu.max_pasids = node->max_tid; + + return 0; +} + +static int ummu_device_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct ummu_device *ummu; + struct resource *res; + int ret; + + ummu = devm_kzalloc(dev, sizeof(*ummu), GFP_KERNEL); + if (!ummu) + return -ENOMEM; + + ummu->dev = dev; + + ret = ummu_device_ubrt_probe(ummu); + if (ret) { + dev_err(dev, "failed to probe ummu_node: %d\n", ret); + return ret; + } + + /* Base address */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "IO resource is null\n"); + return -EINVAL; + } + + /* + * Don't map the IMPLEMENTATION DEFINED regions, since they may contain + * the root registers which are reserved by the bios. 
+ */ + ret = ummu_ioremap(ummu, res->start, UMMU_REG_SZ); + if (ret) + return ret; + + /* hardware init */ + ret = ummu_device_hw_init(ummu); + if (ret) + return ret; + + /* Initialise in-memory data structures */ + ret = ummu_init_structures(ummu); + if (ret) + return ret; + + /* record ummu device */ + platform_set_drvdata(pdev, ummu); + + ret = ummu_device_reset(ummu); + if (ret) + return ret; + + ret = ummu_device_register(ummu); + if (ret) + dev_err(dev, "probe ummu device failed, ret = %d.\n", ret); + + return ret; +} + +static int ummu_device_remove(struct platform_device *pdev) +{ + struct ummu_device *ummu = platform_get_drvdata(pdev); + + ummu_device_disable(ummu); + ummu_device_unregister(ummu); + + dev_dbg(&pdev->dev, "Remove ummu successful!\n"); + return 0; +} + +static void ummu_device_shutdown(struct platform_device *pdev) +{ + struct ummu_device *ummu = platform_get_drvdata(pdev); + + ummu_device_disable(ummu); +} + +static const struct of_device_id hisi_ummu_of_match[] = { + { .compatible = "ub,ummu", }, + { } +}; +MODULE_DEVICE_TABLE(of, hisi_ummu_of_match); + +static const struct acpi_device_id hisi_ummu_acpi_match[] = { + { "HISI0551", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_ummu_acpi_match); + +struct platform_driver ummu_driver = { + .driver = { + .name = UMMU_DRV_NAME, + .suppress_bind_attrs = true, + .of_match_table = hisi_ummu_of_match, + .acpi_match_table = hisi_ummu_acpi_match, + }, + .probe = ummu_device_probe, + .remove = ummu_device_remove, + .shutdown = ummu_device_shutdown, +}; + +module_driver(ummu_driver, platform_driver_register, platform_driver_unregister); + +MODULE_IMPORT_NS(UMMU_CORE_DRIVER); +MODULE_DESCRIPTION("Hisilicon ummu driver"); +MODULE_AUTHOR("HiSilicon Tech. Co., Ltd."); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:" UMMU_DRV_NAME);
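
For reference, a minimal sketch of how a caller inside this driver might use the MCMDQ batch helpers introduced above (ummu_mcmdq_batch_add()/ummu_mcmdq_batch_submit()) to push a run of per-page CMD_TLBI_OS_VA invalidations followed by a CMD_SYNC. The wrapper function name and the specific command fields chosen here are illustrative assumptions, not part of the patch:

/*
 * Hypothetical illustration only: batch CMD_TLBI_OS_VA commands covering
 * [iova, iova + size) and submit them with a trailing CMD_SYNC.
 */
static void ummu_example_flush_range(struct ummu_device *ummu, u32 tid,
				     unsigned long iova, size_t size,
				     size_t granule)
{
	struct ummu_mcmdq_batch cmds = {};
	struct ummu_mcmdq_ent cmd = {
		.opcode = CMD_TLBI_OS_VA,
		.tlbi = {
			.tid = tid,	/* assumed: DMA-scene flush keyed by TID */
			.leaf = true,
		},
	};
	unsigned long end = iova + size;

	while (iova < end) {
		cmd.tlbi.addr = iova;
		/* Issues the batch automatically once MCMDQ_BATCH_ENTRIES are queued. */
		ummu_mcmdq_batch_add(ummu, &cmds, &cmd);
		iova += granule;
	}

	/* Issue whatever remains and wait for the CMD_SYNC to complete. */
	ummu_mcmdq_batch_submit(ummu, &cmds);
}

The batch helpers keep the hot path cheap: commands are only copied into the local buffer until it fills or the caller submits, so the lock-free MCMDQ insertion in ummu_mcmdq_issue_cmdlist() is entered at most once per MCMDQ_BATCH_ENTRIES commands.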