diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index a4408004963c44a7c71c6b1be3c7c72733bb2f85..64573ae8f31a25778f6f44efdef7b61126910037 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -53,6 +53,9 @@ #define IDR0_ASID16 (1 << 12) #define IDR0_ATS (1 << 10) #define IDR0_HYP (1 << 9) +#define IDR0_HTTU GENMASK(7, 6) +#define IDR0_HTTU_ACCESS 1 +#define IDR0_HTTU_ACCESS_DIRTY 2 #define IDR0_COHACC (1 << 4) #define IDR0_TTF GENMASK(3, 2) #define IDR0_TTF_AARCH64 2 @@ -70,6 +73,12 @@ #define IDR1_SSIDSIZE GENMASK(10, 6) #define IDR1_SIDSIZE GENMASK(5, 0) +#define ARM_SMMU_IDR3 0xc +#define IDR3_BBML GENMASK(12, 11) +#define IDR3_BBML0 0 +#define IDR3_BBML1 1 +#define IDR3_BBML2 2 + #define ARM_SMMU_IDR5 0x14 #define IDR5_STALL_MAX GENMASK(31, 16) #define IDR5_GRAN64K (1 << 6) @@ -285,6 +294,9 @@ #define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38) #define ARM64_TCR_TBI0 (1ULL << 37) +#define CTXDESC_CD_0_TCR_HA (1UL << 43) +#define CTXDESC_CD_0_TCR_HD (1UL << 42) + #define CTXDESC_CD_0_AA64 (1UL << 41) #define CTXDESC_CD_0_S (1UL << 44) #define CTXDESC_CD_0_R (1UL << 45) @@ -659,6 +671,10 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_HYP (1 << 12) #define ARM_SMMU_FEAT_STALL_FORCE (1 << 13) #define ARM_SMMU_FEAT_VAX (1 << 14) +#define ARM_SMMU_FEAT_HA (1 << 19) +#define ARM_SMMU_FEAT_HD (1 << 20) +#define ARM_SMMU_FEAT_BBML1 (1 << 21) +#define ARM_SMMU_FEAT_BBML2 (1 << 22) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) @@ -1541,6 +1557,11 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu, CTXDESC_CD_0_AA64 | FIELD_PREP(CTXDESC_CD_0_ASID, cfg->cd.asid) | CTXDESC_CD_0_V; + if (smmu->features & ARM_SMMU_FEAT_HA) + val |= CTXDESC_CD_0_TCR_HA; + if (smmu->features & ARM_SMMU_FEAT_HD) + val |= CTXDESC_CD_0_TCR_HD; + /* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */ if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE) val |= CTXDESC_CD_0_S; @@ -2328,6 +2349,13 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) if (smmu_domain->non_strict) pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + if (smmu->features & ARM_SMMU_FEAT_HD) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_HD; + + if (smmu->features & ARM_SMMU_FEAT_BBML1) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_BBML1; + else if (smmu->features & ARM_SMMU_FEAT_BBML2) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_BBML2; pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) @@ -2784,6 +2812,207 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, return ret; } +static int arm_smmu_split_block(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + size_t handled_size; + + if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) { + dev_err(smmu->dev, "don't support BBML1/2, can't split block\n"); + return -ENODEV; + } + if (!ops || !ops->split_block) { + pr_err("io-pgtable don't realize split block\n"); + return -ENODEV; + } + + handled_size = ops->split_block(ops, iova, size); + if (handled_size != size) { + pr_err("split block failed\n"); + return -EFAULT; + } + + return 0; +} + +static int __arm_smmu_merge_page(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + size_t 
handled_size; + + if (!ops || !ops->merge_page) { + pr_err("io-pgtable doesn't implement the merge_page op\n"); + return -ENODEV; + } + + while (size) { + size_t pgsize = iommu_pgsize(domain, iova | paddr, size); + + handled_size = ops->merge_page(ops, iova, paddr, pgsize, prot); + if (handled_size != pgsize) { + pr_err("merge page failed\n"); + return -EFAULT; + } + + pr_debug("merge handled: iova 0x%lx pa %pa size 0x%zx\n", + iova, &paddr, pgsize); + + iova += pgsize; + paddr += pgsize; + size -= pgsize; + } + + return 0; +} + +static int arm_smmu_merge_page(struct iommu_domain *domain, unsigned long iova, + size_t size, int prot) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + phys_addr_t phys; + dma_addr_t p, i; + size_t cont_size; + int ret = 0; + + if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) { + dev_err(smmu->dev, "BBML1/2 not supported, can't merge page\n"); + return -ENODEV; + } + + if (!ops || !ops->iova_to_phys) + return -ENODEV; + + while (size) { + phys = ops->iova_to_phys(ops, iova); + cont_size = PAGE_SIZE; + p = phys + cont_size; + i = iova + cont_size; + + while (cont_size < size && p == ops->iova_to_phys(ops, i)) { + p += PAGE_SIZE; + i += PAGE_SIZE; + cont_size += PAGE_SIZE; + } + + if (cont_size != PAGE_SIZE) { + ret = __arm_smmu_merge_page(domain, iova, phys, + cont_size, prot); + if (ret) + break; + } + + iova += cont_size; + size -= cont_size; + } + + return ret; +} + +static bool arm_smmu_support_dirty_log(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + + return !!(smmu_domain->smmu->features & ARM_SMMU_FEAT_HD); +} + +static int arm_smmu_switch_dirty_log(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size, int prot) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_device *smmu = smmu_domain->smmu; + + if (!(smmu->features & ARM_SMMU_FEAT_HD)) + return -ENODEV; + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -EINVAL; + + if (enable) { + /* + * For SMMU, the hardware dirty management is always enabled if + * hardware supports HTTU HD. The action to start dirty log is + * splitting block mappings. + * + * We don't return an error even if the split operation fails, as + * we can still track dirty at block granularity, which is still a + * much better choice than the full-dirty policy. + */ + arm_smmu_split_block(domain, iova, size); + } else { + /* + * For SMMU, the hardware dirty management is always enabled if + * hardware supports HTTU HD. The action to stop dirty log is + * merging page mappings. + * + * We don't return an error even if the merge operation fails, as + * it only affects the performance of DMA transactions.
+ */ + arm_smmu_merge_page(domain, iova, size, prot); + } + + return 0; +} + +static int arm_smmu_sync_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + struct arm_smmu_device *smmu = smmu_domain->smmu; + + if (!(smmu->features & ARM_SMMU_FEAT_HD)) + return -ENODEV; + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -EINVAL; + + if (!ops || !ops->sync_dirty_log) { + pr_err("io-pgtable don't realize sync dirty log\n"); + return -ENODEV; + } + + /* + * Flush iotlb to ensure all inflight transactions are completed. + * See doc IHI0070Da 3.13.4 "HTTU behavior summary". + */ + arm_smmu_flush_iotlb_all(domain); + return ops->sync_dirty_log(ops, iova, size, bitmap, base_iova, + bitmap_pgshift); +} + +static int arm_smmu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + struct arm_smmu_device *smmu = smmu_domain->smmu; + + if (!(smmu->features & ARM_SMMU_FEAT_HD)) + return -ENODEV; + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -EINVAL; + + if (!ops || !ops->clear_dirty_log) { + pr_err("io-pgtable don't realize clear dirty log\n"); + return -ENODEV; + } + + return ops->clear_dirty_log(ops, iova, size, bitmap, base_iova, + bitmap_pgshift); +} + static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args) { return iommu_fwspec_add_ids(dev, args->args, 1); @@ -2885,6 +3114,10 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .domain_get_attr = arm_smmu_domain_get_attr, .domain_set_attr = arm_smmu_domain_set_attr, + .support_dirty_log = arm_smmu_support_dirty_log, + .switch_dirty_log = arm_smmu_switch_dirty_log, + .sync_dirty_log = arm_smmu_sync_dirty_log, + .clear_dirty_log = arm_smmu_clear_dirty_log, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = arm_smmu_put_resv_regions, @@ -3514,6 +3747,29 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) return 0; } +static void arm_smmu_get_httu(struct arm_smmu_device *smmu, u32 reg) +{ + u32 fw_features = smmu->features & + (ARM_SMMU_FEAT_HA | ARM_SMMU_FEAT_HD); + u32 hw_features = 0; + + switch (FIELD_GET(IDR0_HTTU, reg)) { + case IDR0_HTTU_ACCESS_DIRTY: + hw_features |= ARM_SMMU_FEAT_HD; + fallthrough; + case IDR0_HTTU_ACCESS: + hw_features |= ARM_SMMU_FEAT_HA; + } + + if (smmu->dev->of_node) + smmu->features |= hw_features; + else if (hw_features != fw_features) + /* ACPI IORT sets the HTTU bits */ + dev_warn(smmu->dev, + "IDR0.HTTU features(0x%x) overridden by FW configuration (0x%x)\n", + hw_features, fw_features); +} + static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) { u32 reg; @@ -3568,6 +3824,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) if (reg & IDR0_HYP) smmu->features |= ARM_SMMU_FEAT_HYP; + arm_smmu_get_httu(smmu, reg); + /* * The coherency feature as set by FW is used in preference to the ID * register, but warn on mismatch. 
@@ -3649,6 +3907,22 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) if (smmu->sid_bits <= STRTAB_SPLIT) smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB; + /* IDR3 */ + reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3); + switch (FIELD_GET(IDR3_BBML, reg)) { + case IDR3_BBML0: + break; + case IDR3_BBML1: + smmu->features |= ARM_SMMU_FEAT_BBML1; + break; + case IDR3_BBML2: + smmu->features |= ARM_SMMU_FEAT_BBML2; + break; + default: + dev_err(smmu->dev, "unknown/unsupported BBM behavior level\n"); + return -ENXIO; + } + /* IDR5 */ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5); @@ -3745,6 +4019,14 @@ static int arm_smmu_device_acpi_probe(struct platform_device *pdev, if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) smmu->features |= ARM_SMMU_FEAT_COHERENCY; + switch (FIELD_GET(ACPI_IORT_SMMU_V3_HTTU_OVERRIDE, iort_smmu->flags)) { + case IDR0_HTTU_ACCESS_DIRTY: + smmu->features |= ARM_SMMU_FEAT_HD; + fallthrough; + case IDR0_HTTU_ACCESS: + smmu->features |= ARM_SMMU_FEAT_HA; + } + return 0; } #else diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 651054aa871034f3973d44584c0bc11c803aa9f7..8e72b6561edd822de569c1b4988fb9217811675a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -458,7 +458,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, WARN_ON(unmapped != size); if (!cookie->fq_domain) - iommu_tlb_sync(domain, &iotlb_gather); + iommu_iotlb_sync(domain, &iotlb_gather); iommu_dma_free_iova(cookie, dma_addr, size); } diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 975237ca032670da6f7b0b422f68eecace0f4e60..318049539fc30a59a6fd5525a16989bf375d1d1e 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -78,16 +78,19 @@ #define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63) #define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53) +#define ARM_LPAE_PTE_DBM (((arm_lpae_iopte)1) << 51) #define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10) #define ARM_LPAE_PTE_SH_NS (((arm_lpae_iopte)0) << 8) #define ARM_LPAE_PTE_SH_OS (((arm_lpae_iopte)2) << 8) #define ARM_LPAE_PTE_SH_IS (((arm_lpae_iopte)3) << 8) #define ARM_LPAE_PTE_NS (((arm_lpae_iopte)1) << 5) #define ARM_LPAE_PTE_VALID (((arm_lpae_iopte)1) << 0) +/* Block descriptor bits */ +#define ARM_LPAE_PTE_NT (((arm_lpae_iopte)1) << 16) #define ARM_LPAE_PTE_ATTR_LO_MASK (((arm_lpae_iopte)0x3ff) << 2) /* Ignore the contiguous bit for block splitting */ -#define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)6) << 52) +#define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)13) << 51) #define ARM_LPAE_PTE_ATTR_MASK (ARM_LPAE_PTE_ATTR_LO_MASK | \ ARM_LPAE_PTE_ATTR_HI_MASK) /* Software bit for solving coherency races */ @@ -429,6 +432,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, int prot) { + struct io_pgtable_cfg *cfg = &data->iop.cfg; arm_lpae_iopte pte; if (data->iop.fmt == ARM_64_LPAE_S1 || @@ -436,6 +440,10 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, pte = ARM_LPAE_PTE_nG; if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ)) pte |= ARM_LPAE_PTE_AP_RDONLY; + else if (data->iop.fmt == ARM_64_LPAE_S1 && + cfg->quirks & IO_PGTABLE_QUIRK_ARM_HD) + pte |= ARM_LPAE_PTE_DBM; + if (!(prot & IOMMU_PRIV)) pte |= ARM_LPAE_PTE_AP_UNPRIV; } else { @@ -698,6 +706,373 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, return iopte_to_paddr(pte, data) | 
iova; } +static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, int lvl, + arm_lpae_iopte *ptep); + +static size_t arm_lpae_do_split_blk(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, + arm_lpae_iopte blk_pte, int lvl, + arm_lpae_iopte *ptep) +{ + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte pte, *tablep; + phys_addr_t blk_paddr; + size_t tablesz = ARM_LPAE_GRANULE(data); + size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data); + int i; + + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return 0; + + tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg); + if (!tablep) + return 0; + + blk_paddr = iopte_to_paddr(blk_pte, data); + pte = iopte_prot(blk_pte); + for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) + __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]); + + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_BBML1) { + /* Race does not exist */ + blk_pte |= ARM_LPAE_PTE_NT; + __arm_lpae_set_pte(ptep, blk_pte, cfg); + io_pgtable_tlb_flush_walk(&data->iop, iova, size, size); + } + /* Race does not exist */ + pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); + + /* Have splited it into page? */ + if (lvl == (ARM_LPAE_MAX_LEVELS - 1)) + return size; + + /* Go back to lvl - 1 */ + ptep -= ARM_LPAE_LVL_IDX(iova, lvl - 1, data); + return __arm_lpae_split_block(data, iova, size, lvl - 1, ptep); +} + +static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, int lvl, + arm_lpae_iopte *ptep) +{ + arm_lpae_iopte pte; + struct io_pgtable *iop = &data->iop; + size_t base, next_size, total_size; + + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return 0; + + ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); + pte = READ_ONCE(*ptep); + if (WARN_ON(!pte)) + return 0; + + if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { + if (iopte_leaf(pte, lvl, iop->fmt)) { + if (lvl == (ARM_LPAE_MAX_LEVELS - 1) || + (pte & ARM_LPAE_PTE_AP_RDONLY)) + return size; + + /* We find a writable block, split it. 
*/ + return arm_lpae_do_split_blk(data, iova, size, pte, + lvl + 1, ptep); + } else { + /* If it is the last table level, then nothing to do */ + if (lvl == (ARM_LPAE_MAX_LEVELS - 2)) + return size; + + total_size = 0; + next_size = ARM_LPAE_BLOCK_SIZE(lvl + 1, data); + ptep = iopte_deref(pte, data); + for (base = 0; base < size; base += next_size) + total_size += __arm_lpae_split_block(data, + iova + base, next_size, lvl + 1, + ptep); + return total_size; + } + } else if (iopte_leaf(pte, lvl, iop->fmt)) { + WARN(1, "Can't split behind a block.\n"); + return 0; + } + + /* Keep on walkin */ + ptep = iopte_deref(pte, data); + return __arm_lpae_split_block(data, iova, size, lvl + 1, ptep); +} + +static size_t arm_lpae_split_block(struct io_pgtable_ops *ops, + unsigned long iova, size_t size) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte *ptep = data->pgd; + int lvl = ARM_LPAE_START_LVL(data); + + if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size)) + return 0; + + if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias))) + return 0; + + /* If it is smallest granule, then nothing to do */ + if (size == ARM_LPAE_BLOCK_SIZE(ARM_LPAE_MAX_LEVELS - 1, data)) + return size; + + return __arm_lpae_split_block(data, iova, size, lvl, ptep); +} + +static size_t __arm_lpae_merge_page(struct arm_lpae_io_pgtable *data, + unsigned long iova, phys_addr_t paddr, + size_t size, int lvl, arm_lpae_iopte *ptep, + arm_lpae_iopte prot) +{ + arm_lpae_iopte pte, *tablep; + struct io_pgtable *iop = &data->iop; + struct io_pgtable_cfg *cfg = &data->iop.cfg; + + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return 0; + + ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); + pte = READ_ONCE(*ptep); + if (WARN_ON(!pte)) + return 0; + + if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { + if (iopte_leaf(pte, lvl, iop->fmt)) + return size; + + /* Race does not exist */ + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_BBML1) { + prot |= ARM_LPAE_PTE_NT; + __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); + io_pgtable_tlb_flush_walk(iop, iova, size, + ARM_LPAE_GRANULE(data)); + + prot &= ~(ARM_LPAE_PTE_NT); + __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); + } else { + __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); + } + + tablep = iopte_deref(pte, data); + __arm_lpae_free_pgtable(data, lvl + 1, tablep); + return size; + } else if (iopte_leaf(pte, lvl, iop->fmt)) { + /* The size is too small, already merged */ + return size; + } + + /* Keep on walkin */ + ptep = iopte_deref(pte, data); + return __arm_lpae_merge_page(data, iova, paddr, size, lvl + 1, ptep, + prot); +} + +static size_t arm_lpae_merge_page(struct io_pgtable_ops *ops, + unsigned long iova, phys_addr_t paddr, + size_t size, int iommu_prot) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte *ptep = data->pgd; + int lvl = ARM_LPAE_START_LVL(data); + arm_lpae_iopte prot; + + if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size)) + return 0; + + if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias) || + paddr >= (1ULL << data->iop.cfg.oas))) + return -ERANGE; + + /* If no access, then nothing to do */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return size; + + /* If it is smallest granule, then nothing to do */ + if (size == ARM_LPAE_BLOCK_SIZE(ARM_LPAE_MAX_LEVELS - 1, data)) + return size; + + prot = arm_lpae_prot_to_pte(data, iommu_prot); + return __arm_lpae_merge_page(data, iova, paddr, size, lvl, ptep, prot); 
+} + +static int __arm_lpae_sync_dirty_log(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, + int lvl, arm_lpae_iopte *ptep, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + arm_lpae_iopte pte; + struct io_pgtable *iop = &data->iop; + size_t base, next_size; + unsigned long offset; + int nbits, ret; + + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return -EINVAL; + + ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); + pte = READ_ONCE(*ptep); + if (WARN_ON(!pte)) + return -EINVAL; + + if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { + if (iopte_leaf(pte, lvl, iop->fmt)) { + if (pte & ARM_LPAE_PTE_AP_RDONLY) + return 0; + + /* It is writable, set the bitmap */ + nbits = size >> bitmap_pgshift; + offset = (iova - base_iova) >> bitmap_pgshift; + bitmap_set(bitmap, offset, nbits); + return 0; + } + /* Current level is table, traverse next level */ + next_size = ARM_LPAE_BLOCK_SIZE(lvl + 1, data); + ptep = iopte_deref(pte, data); + for (base = 0; base < size; base += next_size) { + ret = __arm_lpae_sync_dirty_log(data, iova + base, + next_size, lvl + 1, ptep, bitmap, + base_iova, bitmap_pgshift); + if (ret) + return ret; + } + return 0; + } else if (iopte_leaf(pte, lvl, iop->fmt)) { + if (pte & ARM_LPAE_PTE_AP_RDONLY) + return 0; + + /* Though the size is too small, also set bitmap */ + nbits = size >> bitmap_pgshift; + offset = (iova - base_iova) >> bitmap_pgshift; + bitmap_set(bitmap, offset, nbits); + return 0; + } + + /* Keep on walkin */ + ptep = iopte_deref(pte, data); + return __arm_lpae_sync_dirty_log(data, iova, size, lvl + 1, ptep, + bitmap, base_iova, bitmap_pgshift); +} + +static int arm_lpae_sync_dirty_log(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte *ptep = data->pgd; + int lvl = ARM_LPAE_START_LVL(data); + + if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size)) + return -EINVAL; + + if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias))) + return -EINVAL; + + if (data->iop.fmt != ARM_64_LPAE_S1 && + data->iop.fmt != ARM_32_LPAE_S1) + return -EINVAL; + + return __arm_lpae_sync_dirty_log(data, iova, size, lvl, ptep, + bitmap, base_iova, bitmap_pgshift); +} + +static int __arm_lpae_clear_dirty_log(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, + int lvl, arm_lpae_iopte *ptep, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + arm_lpae_iopte pte; + struct io_pgtable *iop = &data->iop; + unsigned long offset; + size_t base, next_size; + int nbits, ret, i; + + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return -EINVAL; + + ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); + pte = READ_ONCE(*ptep); + if (WARN_ON(!pte)) + return -EINVAL; + + if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { + if (iopte_leaf(pte, lvl, iop->fmt)) { + if (pte & ARM_LPAE_PTE_AP_RDONLY) + return 0; + + /* Ensure all corresponding bits are set */ + nbits = size >> bitmap_pgshift; + offset = (iova - base_iova) >> bitmap_pgshift; + for (i = offset; i < offset + nbits; i++) { + if (!test_bit(i, bitmap)) + return 0; + } + + /* Race does not exist */ + pte |= ARM_LPAE_PTE_AP_RDONLY; + __arm_lpae_set_pte(ptep, pte, &iop->cfg); + return 0; + } + /* Current level is table, traverse next level */ + next_size = ARM_LPAE_BLOCK_SIZE(lvl + 1, data); + ptep = iopte_deref(pte, data); + for (base = 0; 
base < size; base += next_size) { + ret = __arm_lpae_clear_dirty_log(data, iova + base, + next_size, lvl + 1, ptep, bitmap, + base_iova, bitmap_pgshift); + if (ret) + return ret; + } + return 0; + } else if (iopte_leaf(pte, lvl, iop->fmt)) { + /* Though the size is too small, it is already clean */ + if (pte & ARM_LPAE_PTE_AP_RDONLY) + return 0; + + return -EINVAL; + } + + /* Keep on walkin */ + ptep = iopte_deref(pte, data); + return __arm_lpae_clear_dirty_log(data, iova, size, lvl + 1, ptep, + bitmap, base_iova, bitmap_pgshift); +} + +static int arm_lpae_clear_dirty_log(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte *ptep = data->pgd; + int lvl = ARM_LPAE_START_LVL(data); + + if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size)) + return -EINVAL; + + if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias))) + return -EINVAL; + + if (data->iop.fmt != ARM_64_LPAE_S1 && + data->iop.fmt != ARM_32_LPAE_S1) + return -EINVAL; + + return __arm_lpae_clear_dirty_log(data, iova, size, lvl, ptep, + bitmap, base_iova, bitmap_pgshift); +} + static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg) { unsigned long granule, page_sizes; @@ -781,6 +1156,10 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) .map = arm_lpae_map, .unmap = arm_lpae_unmap, .iova_to_phys = arm_lpae_iova_to_phys, + .split_block = arm_lpae_split_block, + .merge_page = arm_lpae_merge_page, + .sync_dirty_log = arm_lpae_sync_dirty_log, + .clear_dirty_log = arm_lpae_clear_dirty_log, }; return data; @@ -793,7 +1172,10 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) struct arm_lpae_io_pgtable *data; if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | - IO_PGTABLE_QUIRK_NON_STRICT)) + IO_PGTABLE_QUIRK_NON_STRICT | + IO_PGTABLE_QUIRK_ARM_HD | + IO_PGTABLE_QUIRK_ARM_BBML1 | + IO_PGTABLE_QUIRK_ARM_BBML2)) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 87227b1ee8cb9f93cc7b9aafd6388c953bae7153..3be56006375e2c727cd911386a5ee5007c161f32 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -667,7 +667,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group, } - iommu_flush_tlb_all(domain); + iommu_flush_iotlb_all(domain); out: iommu_put_resv_regions(dev, &mappings); @@ -1621,6 +1621,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, domain->type = type; /* Assume all sizes by default; the driver may override this later */ domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; + mutex_init(&domain->switch_log_lock); return domain; } @@ -1839,8 +1840,8 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); -static size_t iommu_pgsize(struct iommu_domain *domain, - unsigned long addr_merge, size_t size) +size_t iommu_pgsize(struct iommu_domain *domain, + unsigned long addr_merge, size_t size) { unsigned int pgsize_idx; size_t pgsize; @@ -1870,6 +1871,7 @@ static size_t iommu_pgsize(struct iommu_domain *domain, return pgsize; } +EXPORT_SYMBOL_GPL(iommu_pgsize); int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) @@ -2008,7 +2010,7 @@ size_t iommu_unmap(struct iommu_domain *domain, iommu_iotlb_gather_init(&iotlb_gather); ret = 
__iommu_unmap(domain, iova, size, &iotlb_gather); - iommu_tlb_sync(domain, &iotlb_gather); + iommu_iotlb_sync(domain, &iotlb_gather); return ret; } @@ -2200,6 +2202,188 @@ int iommu_domain_set_attr(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_domain_set_attr); +bool iommu_support_dirty_log(struct iommu_domain *domain) +{ + const struct iommu_ops *ops = domain->ops; + + return ops->support_dirty_log && ops->support_dirty_log(domain); +} +EXPORT_SYMBOL_GPL(iommu_support_dirty_log); + +int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size, int prot) +{ + const struct iommu_ops *ops = domain->ops; + unsigned long orig_iova = iova; + unsigned int min_pagesz; + size_t orig_size = size; + bool flush = false; + int ret = 0; + + if (unlikely(!ops->switch_dirty_log)) + return -ENODEV; + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", + iova, size, min_pagesz); + return -EINVAL; + } + + mutex_lock(&domain->switch_log_lock); + + pr_debug("switch_dirty_log %s for: iova 0x%lx size 0x%zx\n", + enable ? "enable" : "disable", iova, size); + + while (size) { + size_t pgsize = iommu_pgsize(domain, iova, size); + + flush = true; + ret = ops->switch_dirty_log(domain, enable, iova, pgsize, prot); + if (ret) + break; + + pr_debug("switch_dirty_log handled: iova 0x%lx size 0x%zx\n", + iova, pgsize); + + iova += pgsize; + size -= pgsize; + } + + if (flush) + iommu_flush_iotlb_all(domain); + + if (!ret) + trace_switch_dirty_log(orig_iova, orig_size, enable); + + mutex_unlock(&domain->switch_log_lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_switch_dirty_log); + +int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t size, unsigned long *bitmap, + unsigned long base_iova, unsigned long bitmap_pgshift) +{ + const struct iommu_ops *ops = domain->ops; + unsigned long orig_iova = iova; + unsigned int min_pagesz; + size_t orig_size = size; + int ret = 0; + + if (unlikely(!ops->sync_dirty_log)) + return -ENODEV; + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", + iova, size, min_pagesz); + return -EINVAL; + } + + mutex_lock(&domain->switch_log_lock); + + pr_debug("sync_dirty_log for: iova 0x%lx size 0x%zx\n", iova, size); + + while (size) { + size_t pgsize = iommu_pgsize(domain, iova, size); + + ret = ops->sync_dirty_log(domain, iova, pgsize, + bitmap, base_iova, bitmap_pgshift); + if (ret) + break; + + pr_debug("sync_dirty_log handled: iova 0x%lx size 0x%zx\n", + iova, pgsize); + + iova += pgsize; + size -= pgsize; + } + + if (!ret) + trace_sync_dirty_log(orig_iova, orig_size); + + mutex_unlock(&domain->switch_log_lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_sync_dirty_log); + +static int __iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + const struct iommu_ops *ops = domain->ops; + unsigned long orig_iova = iova; + size_t orig_size = size; + int ret = 0; + + if (unlikely(!ops->clear_dirty_log)) + return -ENODEV; + + pr_debug("clear_dirty_log for: iova 0x%lx size 0x%zx\n", iova, size); + + while (size) { + size_t pgsize = iommu_pgsize(domain, iova, size); + + ret = ops->clear_dirty_log(domain, iova, pgsize, bitmap, + base_iova, bitmap_pgshift); + if (ret) + break; + + 
pr_debug("clear_dirty_log handled: iova 0x%lx size 0x%zx\n", + iova, pgsize); + + iova += pgsize; + size -= pgsize; + } + + if (!ret) + trace_clear_dirty_log(orig_iova, orig_size); + + return ret; +} + +int iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + unsigned long riova, rsize; + unsigned int min_pagesz, rs, re, start, end; + bool flush = false; + int ret = 0; + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx min_pagesz 0x%x\n", + iova, min_pagesz); + return -EINVAL; + } + + mutex_lock(&domain->switch_log_lock); + + start = (iova - base_iova) >> bitmap_pgshift; + end = start + (size >> bitmap_pgshift); + bitmap_for_each_set_region(bitmap, rs, re, start, end) { + flush = true; + riova = base_iova + ((unsigned long)rs << bitmap_pgshift); + rsize = (unsigned long)(re - rs) << bitmap_pgshift; + ret = __iommu_clear_dirty_log(domain, riova, rsize, bitmap, + base_iova, bitmap_pgshift); + if (ret) + break; + } + + if (flush) + iommu_flush_iotlb_all(domain); + + mutex_unlock(&domain->switch_log_lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_clear_dirty_log); + void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev->bus->iommu_ops; diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index efd3782ead9743de6fb19bcfd81c094b8a5ca6dd..a899009721f0819808d801694b61f7b21a49e41d 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -85,6 +85,7 @@ struct vfio_group { atomic_t opened; wait_queue_head_t container_q; bool noiommu; + unsigned int dev_counter; struct kvm *kvm; struct blocking_notifier_head notifier; }; @@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_add(&device->group_next, &group->device_list); + group->dev_counter++; mutex_unlock(&group->device_lock); return device; @@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref) struct vfio_group *group = device->group; list_del(&device->group_next); + group->dev_counter--; mutex_unlock(&group->device_lock); dev_set_drvdata(device->dev, NULL); @@ -1929,6 +1932,9 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, if (!group) return -ENODEV; + if (group->dev_counter > 1) + return -EINVAL; + ret = vfio_group_add_container_user(group); if (ret) goto err_pin_pages; @@ -1936,7 +1942,8 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + ret = driver->ops->pin_pages(container->iommu_data, + group->iommu_group, user_pfn, npage, prot, phys_pfn); else ret = -ENOTTY; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6b1e8cba1798408ab02abebd66b66bb9fd9289ab..2a10b837035c37a2a4ccdd4f62c0b47e76126c25 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -69,8 +69,13 @@ struct vfio_iommu { struct rb_root dma_list; struct blocking_notifier_head notifier; unsigned int dma_avail; + uint64_t pgsize_bitmap; + uint64_t num_non_pinned_groups; + uint64_t num_non_hwdbm_domains; bool v2; bool nesting; + bool dirty_page_tracking; + bool dirty_log_get_no_clear; }; struct vfio_domain { @@ -79,6 +84,7 @@ struct vfio_domain { 
struct list_head group_list; int prot; /* IOMMU_CACHE */ bool fgsp; /* Fine-grained super pages */ + bool iommu_hwdbm; /* Hardware dirty management */ }; struct vfio_dma { @@ -91,12 +97,14 @@ struct vfio_dma { bool lock_cap; /* capable(CAP_IPC_LOCK) */ struct task_struct *task; struct rb_root pfn_list; /* Ex-user pinned pfn list */ + unsigned long *bitmap; }; struct vfio_group { struct iommu_group *iommu_group; struct list_head next; bool mdev_group; /* An mdev group */ + bool pinned_page_dirty_scope; }; struct vfio_iova { @@ -112,7 +120,7 @@ struct vfio_pfn { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long pfn; /* Host pfn */ - atomic_t ref_count; + unsigned int ref_count; }; struct vfio_regions { @@ -125,8 +133,24 @@ struct vfio_regions { #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ (!list_empty(&iommu->domain_list)) +#define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) + +/* + * Input argument of number of bits to bitmap_set() is unsigned integer, which + * further casts to signed integer for unaligned multi-bit operation, + * __bitmap_set(). + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte, + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page + * system. + */ +#define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) +#define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) + static int put_pfn(unsigned long pfn, int prot); +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group); + /* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU @@ -175,6 +199,93 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) rb_erase(&old->node, &iommu->dma_list); } + +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize) +{ + uint64_t npages = dma->size / pgsize; + + if (npages > DIRTY_BITMAP_PAGES_MAX) + return -EINVAL; + + /* + * Allocate extra 64 bits that are used to calculate shift required for + * bitmap_shift_left() to manipulate and club unaligned number of pages + * in adjacent vfio_dma ranges. 
+ */ + dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64), + GFP_KERNEL); + if (!dma->bitmap) + return -ENOMEM; + + return 0; +} + +static void vfio_dma_bitmap_free(struct vfio_dma *dma) +{ + kfree(dma->bitmap); + dma->bitmap = NULL; +} + +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize) +{ + struct rb_node *p; + unsigned long pgshift = __ffs(pgsize); + + for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) { + struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node); + + bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1); + } +} + +static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu) +{ + struct rb_node *n; + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + bitmap_set(dma->bitmap, 0, dma->size >> pgshift); + } +} + +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + int ret; + + ret = vfio_dma_bitmap_alloc(dma, pgsize); + if (ret) { + struct rb_node *p; + + for (p = rb_prev(n); p; p = rb_prev(p)) { + struct vfio_dma *dma = rb_entry(n, + struct vfio_dma, node); + + vfio_dma_bitmap_free(dma); + } + return ret; + } + vfio_dma_populate_bitmap(dma, pgsize); + } + return 0; +} + +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + vfio_dma_bitmap_free(dma); + } +} + /* * Helper Functions for host iova-pfn list */ @@ -233,7 +344,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova, vpfn->iova = iova; vpfn->pfn = pfn; - atomic_set(&vpfn->ref_count, 1); + vpfn->ref_count = 1; vfio_link_pfn(dma, vpfn); return 0; } @@ -251,7 +362,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); if (vpfn) - atomic_inc(&vpfn->ref_count); + vpfn->ref_count++; return vpfn; } @@ -259,7 +370,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) { int ret = 0; - if (atomic_dec_and_test(&vpfn->ref_count)) { + vpfn->ref_count--; + if (!vpfn->ref_count) { ret = put_pfn(vpfn->pfn, dma->prot); vfio_remove_from_pfn_list(dma, vpfn); } @@ -580,11 +692,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, } static int vfio_iommu_type1_pin_pages(void *iommu_data, + struct iommu_group *iommu_group, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn) { struct vfio_iommu *iommu = iommu_data; + struct vfio_group *group; int i, j, ret; unsigned long remote_vaddr; struct vfio_dma *dma; @@ -646,9 +760,26 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, vfio_lock_acct(dma, -1, true); goto pin_unwind; } - } + if (iommu->dirty_page_tracking) { + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + /* + * Bitmap populated with the smallest supported page + * size + */ + bitmap_set(dma->bitmap, + (iova - dma->iova) >> pgshift, 1); + } + } ret = i; + + group = vfio_iommu_find_iommu_group(iommu, iommu_group); + if (!group->pinned_page_dirty_scope) { + group->pinned_page_dirty_scope = true; + iommu->num_non_pinned_groups--; + } + goto pin_done; pin_unwind: @@ -707,7 +838,7 @@ static long vfio_sync_unpin(struct vfio_dma *dma, struct 
vfio_domain *domain, long unlocked = 0; struct vfio_regions *entry, *next; - iommu_tlb_sync(domain->domain, iotlb_gather); + iommu_iotlb_sync(domain->domain, iotlb_gather); list_for_each_entry_safe(entry, next, regions, list) { unlocked += vfio_unpin_pages_remote(dma, @@ -881,19 +1012,19 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) vfio_unmap_unpin(iommu, dma, true); vfio_unlink_dma(iommu, dma); put_task_struct(dma->task); + vfio_dma_bitmap_free(dma); kfree(dma); iommu->dma_avail++; } -static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) +static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) { struct vfio_domain *domain; - unsigned long bitmap = ULONG_MAX; - mutex_lock(&iommu->lock); + iommu->pgsize_bitmap = ULONG_MAX; + list_for_each_entry(domain, &iommu->domain_list, next) - bitmap &= domain->domain->pgsize_bitmap; - mutex_unlock(&iommu->lock); + iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap; /* * In case the IOMMU supports page sizes smaller than PAGE_SIZE @@ -903,36 +1034,304 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) * granularity while iommu driver can use the sub-PAGE_SIZE size * to map the buffer. */ - if (bitmap & ~PAGE_MASK) { - bitmap &= PAGE_MASK; - bitmap |= PAGE_SIZE; + if (iommu->pgsize_bitmap & ~PAGE_MASK) { + iommu->pgsize_bitmap &= PAGE_MASK; + iommu->pgsize_bitmap |= PAGE_SIZE; } +} + +static int vfio_iommu_dirty_log_clear(struct vfio_iommu *iommu, + dma_addr_t start_iova, size_t size, + unsigned long *bitmap_buffer, + dma_addr_t base_iova, + unsigned long pgshift) +{ + struct vfio_domain *d; + int ret = 0; - return bitmap; + list_for_each_entry(d, &iommu->domain_list, next) { + ret = iommu_clear_dirty_log(d->domain, start_iova, size, + bitmap_buffer, base_iova, pgshift); + if (ret) { + pr_warn("vfio_iommu dirty log clear failed!\n"); + break; + } + } + + return ret; } -static int vfio_dma_do_unmap(struct vfio_iommu *iommu, - struct vfio_iommu_type1_dma_unmap *unmap) +static int vfio_iommu_dirty_log_sync(struct vfio_iommu *iommu, + struct vfio_dma *dma, + unsigned long pgshift) { - uint64_t mask; - struct vfio_dma *dma, *dma_last = NULL; - size_t unmapped = 0; - int ret = 0, retries = 0; + struct vfio_domain *d; + int ret = 0; + + list_for_each_entry(d, &iommu->domain_list, next) { + ret = iommu_sync_dirty_log(d->domain, dma->iova, dma->size, + dma->bitmap, dma->iova, pgshift); + if (ret) { + pr_warn("vfio_iommu dirty log sync failed!\n"); + break; + } + } + + return ret; +} + +static int vfio_iova_dirty_log_clear(u64 __user *bitmap, + struct vfio_iommu *iommu, + dma_addr_t iova, size_t size, + size_t pgsize) +{ + struct vfio_dma *dma; + struct rb_node *n; + dma_addr_t start_iova, end_iova, riova; + unsigned long pgshift = __ffs(pgsize); + unsigned long bitmap_size; + unsigned long *bitmap_buffer = NULL; + bool clear_valid; + unsigned int rs, re, start, end, dma_offset; + int ret = 0; + + bitmap_size = DIRTY_BITMAP_BYTES(size >> pgshift); + bitmap_buffer = kvmalloc(bitmap_size, GFP_KERNEL); + if (!bitmap_buffer) { + ret = -ENOMEM; + goto out; + } - mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; + if (copy_from_user(bitmap_buffer, bitmap, bitmap_size)) { + ret = -EFAULT; + goto out; + } - if (unmap->iova & mask) + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + dma = rb_entry(n, struct vfio_dma, node); + if ((dma->iova + dma->size - 1) < iova) + continue; + if (dma->iova > iova + size - 1) + break; + + start_iova = max(iova, dma->iova); + end_iova = 
min(iova + size, dma->iova + dma->size); + + /* Similar logic as the tail of vfio_iova_dirty_bitmap */ + + clear_valid = false; + start = (start_iova - iova) >> pgshift; + end = (end_iova - iova) >> pgshift; + bitmap_for_each_set_region(bitmap_buffer, rs, re, start, end) { + clear_valid = true; + riova = iova + ((unsigned long)rs << pgshift); + dma_offset = (riova - dma->iova) >> pgshift; + bitmap_clear(dma->bitmap, dma_offset, re - rs); + } + + if (clear_valid) + vfio_dma_populate_bitmap(dma, pgsize); + + if (clear_valid && iommu->num_non_pinned_groups && + dma->iommu_mapped && !iommu->num_non_hwdbm_domains) { + ret = vfio_iommu_dirty_log_clear(iommu, start_iova, + end_iova - start_iova, bitmap_buffer, + iova, pgshift); + if (ret) { + pr_warn("dma dirty log clear failed!\n"); + goto out; + } + } + + } + +out: + kfree(bitmap_buffer); + return ret; +} + +static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, + struct vfio_dma *dma, dma_addr_t base_iova, + size_t pgsize) +{ + unsigned long pgshift = __ffs(pgsize); + unsigned long nbits = dma->size >> pgshift; + unsigned long bit_offset = (dma->iova - base_iova) >> pgshift; + unsigned long copy_offset = bit_offset / BITS_PER_LONG; + unsigned long shift = bit_offset % BITS_PER_LONG; + unsigned long leftover; + bool iommu_hwdbm_dirty = false; + int ret; + + if (iommu->num_non_pinned_groups && dma->iommu_mapped) { + if (!iommu->num_non_hwdbm_domains) { + /* try to get dirty log from IOMMU */ + iommu_hwdbm_dirty = true; + ret = vfio_iommu_dirty_log_sync(iommu, dma, pgshift); + if (ret) + return ret; + } else { + /* + * mark all pages dirty if any IOMMU capable device is + * not able to report dirty pages and all pages are + * pinned and mapped. + */ + bitmap_set(dma->bitmap, 0, nbits); + } + } + + if (shift) { + bitmap_shift_left(dma->bitmap, dma->bitmap, shift, + nbits + shift); + + if (copy_from_user(&leftover, + (void __user *)(bitmap + copy_offset), + sizeof(leftover))) + return -EFAULT; + + bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift); + } + + if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap, + DIRTY_BITMAP_BYTES(nbits + shift))) + return -EFAULT; + + /* + * Recover the bitmap if it'll be used to clear hardware dirty log, or + * user wants to clear the dirty bitmap manually. + */ + if (shift && (iommu_hwdbm_dirty || iommu->dirty_log_get_no_clear)) + bitmap_shift_right(dma->bitmap, dma->bitmap, shift, + nbits + shift); + + return 0; +} + +static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, + dma_addr_t iova, size_t size, size_t pgsize) +{ + struct vfio_dma *dma; + struct rb_node *n; + unsigned long pgshift = __ffs(pgsize); + int ret; + + /* + * GET_BITMAP request must fully cover vfio_dma mappings. Multiple + * vfio_dma mappings may be clubbed by specifying large ranges, but + * there must not be any previous mappings bisected by the range. + * An error will be returned if these conditions are not met. 
+ */ + dma = vfio_find_dma(iommu, iova, 1); + if (dma && dma->iova != iova) return -EINVAL; - if (!unmap->size || unmap->size & mask) + + dma = vfio_find_dma(iommu, iova + size - 1, 0); + if (dma && dma->iova + dma->size != iova + size) return -EINVAL; - if (unmap->iova + unmap->size - 1 < unmap->iova || - unmap->size > SIZE_MAX) + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + if (dma->iova < iova) + continue; + + if (dma->iova > iova + size - 1) + break; + + ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize); + if (ret) + return ret; + + /* Do not clear dirty automatically when require no clear */ + if (iommu->dirty_log_get_no_clear) + continue; + + /* Clear iommu dirty log to re-enable dirty log tracking */ + if (iommu->num_non_pinned_groups && dma->iommu_mapped && + !iommu->num_non_hwdbm_domains) { + ret = vfio_iommu_dirty_log_clear(iommu, dma->iova, + dma->size, dma->bitmap, dma->iova, + pgshift); + if (ret) + return ret; + } + + /* + * Re-populate bitmap to include all pinned pages which are + * considered as dirty but exclude pages which are unpinned and + * pages which are marked dirty by vfio_dma_rw() + */ + bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); + vfio_dma_populate_bitmap(dma, pgsize); + } + return 0; +} + +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) +{ + if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) || + (bitmap_size < DIRTY_BITMAP_BYTES(npages))) return -EINVAL; - WARN_ON(mask & PAGE_MASK); -again: + return 0; +} + +static void vfio_dma_dirty_log_switch(struct vfio_iommu *iommu, + struct vfio_dma *dma, bool enable) +{ + struct vfio_domain *d; + + if (!dma->iommu_mapped) + return; + + list_for_each_entry(d, &iommu->domain_list, next) { + if (!d->iommu_hwdbm) + continue; + WARN_ON(iommu_switch_dirty_log(d->domain, enable, dma->iova, + dma->size, + IOMMU_CACHE | dma->prot)); + } +} + +static int vfio_dma_do_unmap(struct vfio_iommu *iommu, + struct vfio_iommu_type1_dma_unmap *unmap, + struct vfio_bitmap *bitmap) +{ + struct vfio_dma *dma, *dma_last = NULL; + size_t unmapped = 0, pgsize; + int ret = 0, retries = 0; + unsigned long pgshift; + mutex_lock(&iommu->lock); + pgshift = __ffs(iommu->pgsize_bitmap); + pgsize = (size_t)1 << pgshift; + + if (unmap->iova & (pgsize - 1)) { + ret = -EINVAL; + goto unlock; + } + + if (!unmap->size || unmap->size & (pgsize - 1)) { + ret = -EINVAL; + goto unlock; + } + + if (unmap->iova + unmap->size - 1 < unmap->iova || + unmap->size > SIZE_MAX) { + ret = -EINVAL; + goto unlock; + } + + /* When dirty tracking is enabled, allow only min supported pgsize */ + if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && + (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { + ret = -EINVAL; + goto unlock; + } + + WARN_ON((pgsize - 1) & PAGE_MASK); +again: /* * vfio-iommu-type1 (v1) - User mappings were coalesced together to * avoid tracking individual mappings. 
This means that the granularity @@ -1010,8 +1409,21 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, blocking_notifier_call_chain(&iommu->notifier, VFIO_IOMMU_NOTIFY_DMA_UNMAP, &nb_unmap); + mutex_lock(&iommu->lock); goto again; } + + if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + ret = update_user_bitmap(bitmap->data, iommu, dma, + unmap->iova, pgsize); + if (ret) + break; + } + + /* Stop log for removed dma */ + if (iommu->dirty_page_tracking) + vfio_dma_dirty_log_switch(iommu, dma, false); + unmapped += dma->size; vfio_remove_dma(iommu, dma); } @@ -1118,31 +1530,35 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, unsigned long vaddr = map->vaddr; size_t size = map->size; int ret = 0, prot = 0; - uint64_t mask; + size_t pgsize; struct vfio_dma *dma; /* Verify that none of our __u64 fields overflow */ if (map->size != size || map->vaddr != vaddr || map->iova != iova) return -EINVAL; - mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; - - WARN_ON(mask & PAGE_MASK); - /* READ/WRITE from device perspective */ if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) prot |= IOMMU_WRITE; if (map->flags & VFIO_DMA_MAP_FLAG_READ) prot |= IOMMU_READ; - if (!prot || !size || (size | iova | vaddr) & mask) - return -EINVAL; + mutex_lock(&iommu->lock); - /* Don't allow IOVA or virtual address wrap */ - if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) - return -EINVAL; + pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); - mutex_lock(&iommu->lock); + WARN_ON((pgsize - 1) & PAGE_MASK); + + if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + + /* Don't allow IOVA or virtual address wrap */ + if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) { + ret = -EINVAL; + goto out_unlock; + } if (vfio_find_dma(iommu, iova, size)) { ret = -EEXIST; @@ -1210,6 +1626,17 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, else ret = vfio_pin_map_dma(iommu, dma, size); + if (!ret && iommu->dirty_page_tracking) { + ret = vfio_dma_bitmap_alloc(dma, pgsize); + if (ret) { + vfio_remove_dma(iommu, dma); + goto out_unlock; + } + + /* Start dirty log for newly added dma */ + vfio_dma_dirty_log_switch(iommu, dma, true); + } + out_unlock: mutex_unlock(&iommu->lock); return ret; @@ -1409,6 +1836,24 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain, return NULL; } +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group) +{ + struct vfio_domain *domain; + struct vfio_group *group = NULL; + + list_for_each_entry(domain, &iommu->domain_list, next) { + group = find_iommu_group(domain, iommu_group); + if (group) + return group; + } + + if (iommu->external_domain) + group = find_iommu_group(iommu->external_domain, iommu_group); + + return group; +} + static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, phys_addr_t *base) { @@ -1753,6 +2198,78 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, list_splice_tail(iova_copy, iova); } + +static void vfio_domain_dirty_log_switch(struct vfio_iommu *iommu, + struct vfio_domain *d, bool enable) +{ + struct rb_node *n; + struct vfio_dma *dma; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + dma = rb_entry(n, struct vfio_dma, node); + if (!dma->iommu_mapped) + continue; + WARN_ON(iommu_switch_dirty_log(d->domain, enable, dma->iova, + dma->size, + IOMMU_CACHE | dma->prot)); + } +} + +/* + * Called after a new group is added to the iommu_domain, or an old group is + * 
removed from the iommu_domain. Update the HWDBM status of vfio_domain and + * vfio_iommu. + */ +static void vfio_iommu_update_hwdbm(struct vfio_iommu *iommu, + struct vfio_domain *domain, + bool attach) +{ + bool old_hwdbm = domain->iommu_hwdbm; + bool new_hwdbm = iommu_support_dirty_log(domain->domain); + bool singular = list_is_singular(&domain->group_list); + bool num_non_hwdbm_zeroed = false; + bool log_enabled, should_enable; + + if ((old_hwdbm || singular) && !new_hwdbm && attach) { + iommu->num_non_hwdbm_domains++; + } else if (!old_hwdbm && (new_hwdbm || singular) && !attach) { + iommu->num_non_hwdbm_domains--; + if (!iommu->num_non_hwdbm_domains) + num_non_hwdbm_zeroed = true; + } + domain->iommu_hwdbm = new_hwdbm; + + if (!iommu->dirty_page_tracking) + return; + + /* + * When switch the dirty policy from full dirty to iommu hwdbm, we must + * populate full dirty now to avoid losing dirty. + */ + if (iommu->num_non_pinned_groups && num_non_hwdbm_zeroed) + vfio_iommu_populate_bitmap_full(iommu); + + /* + * The vfio_domain can switch dirty log tracking dynamically due to + * group attach/detach. The basic idea is to convert current dirty log + * status to desired dirty log status. + * + * If old_hwdbm is true then dirty log has been enabled. One exception + * is that this is the first group attached to a domain. + * + * If new_hwdbm is true then dirty log should be enabled. One exception + * is that this is the last group detached from a domain. + */ + log_enabled = old_hwdbm && !(attach && singular); + should_enable = new_hwdbm && !(!attach && singular); + + /* Switch dirty log tracking when status changed */ + if (should_enable && !log_enabled) + vfio_domain_dirty_log_switch(iommu, domain, true); + else if (!should_enable && log_enabled) + vfio_domain_dirty_log_switch(iommu, domain, false); +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -1809,12 +2326,21 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (!iommu->external_domain) { INIT_LIST_HEAD(&domain->group_list); iommu->external_domain = domain; + vfio_update_pgsize_bitmap(iommu); } else { kfree(domain); } list_add(&group->next, &iommu->external_domain->group_list); + /* + * Non-iommu backed group cannot dirty memory directly, + * it can only use interfaces that provide dirty + * tracking. + * The iommu scope can only be promoted with the + * addition of a dirty tracking group. + */ + group->pinned_page_dirty_scope = true; mutex_unlock(&iommu->lock); return 0; @@ -1909,6 +2435,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, vfio_iommu_detach_group(domain, group); if (!vfio_iommu_attach_group(d, group)) { list_add(&group->next, &d->group_list); + vfio_iommu_update_hwdbm(iommu, d, true); iommu_domain_free(domain->domain); kfree(domain); goto done; @@ -1934,9 +2461,18 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } list_add(&domain->next, &iommu->domain_list); + vfio_update_pgsize_bitmap(iommu); + vfio_iommu_update_hwdbm(iommu, domain, true); done: /* Delete the old one and insert new iova list */ vfio_iommu_iova_insert_copy(iommu, &iova_copy); + + /* + * An iommu backed group can dirty memory directly and therefore + * demotes the iommu scope until it declares itself dirty tracking + * capable via the page pinning interface. 
+ */ + iommu->num_non_pinned_groups++; mutex_unlock(&iommu->lock); vfio_iommu_resv_free(&group_resv_regions); @@ -2072,6 +2608,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain; struct vfio_group *group; + bool update_dirty_scope = false; LIST_HEAD(iova_copy); mutex_lock(&iommu->lock); @@ -2079,6 +2616,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (iommu->external_domain) { group = find_iommu_group(iommu->external_domain, iommu_group); if (group) { + update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); @@ -2108,6 +2646,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, continue; vfio_iommu_detach_group(domain, group); + vfio_iommu_update_hwdbm(iommu, domain, false); + update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); /* @@ -2130,6 +2670,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, list_del(&domain->next); kfree(domain); vfio_iommu_aper_expand(iommu, &iova_copy); + vfio_update_pgsize_bitmap(iommu); } break; } @@ -2140,6 +2681,19 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, vfio_iommu_iova_free(&iova_copy); detach_group_done: + /* + * Removal of a group without dirty tracking may allow the iommu scope + * to be promoted. + */ + if (update_dirty_scope) { + iommu->num_non_pinned_groups--; + /* + * When switch the dirty policy from full dirty to pinned scope, + * we must populate full dirty now to avoid losing dirty. + */ + if (iommu->dirty_page_tracking && iommu->num_non_hwdbm_domains) + vfio_iommu_populate_bitmap_full(iommu); + } mutex_unlock(&iommu->lock); } @@ -2261,8 +2815,6 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, size_t size; int iovas = 0, i = 0, ret; - mutex_lock(&iommu->lock); - list_for_each_entry(iova, &iommu->iova_list, list) iovas++; @@ -2271,17 +2823,14 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, * Return 0 as a container with a single mdev device * will have an empty list */ - ret = 0; - goto out_unlock; + return 0; } size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges)); cap_iovas = kzalloc(size, GFP_KERNEL); - if (!cap_iovas) { - ret = -ENOMEM; - goto out_unlock; - } + if (!cap_iovas) + return -ENOMEM; cap_iovas->nr_iovas = iovas; @@ -2294,8 +2843,6 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size); kfree(cap_iovas); -out_unlock: - mutex_unlock(&iommu->lock); return ret; } @@ -2305,7 +2852,6 @@ static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, struct vfio_iommu_type1_info_dma_avail cap_dma_avail; int ret; - mutex_lock(&iommu->lock); cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL; cap_dma_avail.header.version = 1; @@ -2313,10 +2859,41 @@ static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, ret = vfio_info_add_capability(caps, &cap_dma_avail.header, sizeof(cap_dma_avail)); - mutex_unlock(&iommu->lock); return ret; } +static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) +{ + struct vfio_iommu_type1_info_cap_migration cap_mig = {}; + + cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; + cap_mig.header.version = 1; + + cap_mig.flags = 0; + /* support minimum pgsize */ + cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap); + cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX; + + return 
vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); +} + +static void vfio_iommu_dirty_log_switch(struct vfio_iommu *iommu, bool enable) +{ + struct vfio_domain *d; + + /* + * We enable dirty log tracking for these vfio_domains that support + * HWDBM. Even if all iommu domains don't support HWDBM for now. They + * may support it after detach some groups. + */ + list_for_each_entry(d, &iommu->domain_list, next) { + if (!d->iommu_hwdbm) + continue; + vfio_domain_dirty_log_switch(iommu, d, enable); + } +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -2333,6 +2910,10 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (!iommu) return 0; return vfio_domains_have_iommu_cache(iommu); + case VFIO_DIRTY_LOG_MANUAL_CLEAR: + if (!iommu) + return 0; + return 1; default: return 0; } @@ -2358,15 +2939,21 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, info.cap_offset = 0; /* output, no-recopy necessary */ } + mutex_lock(&iommu->lock); info.flags = VFIO_IOMMU_INFO_PGSIZES; - info.iova_pgsizes = vfio_pgsize_bitmap(iommu); + info.iova_pgsizes = iommu->pgsize_bitmap; - ret = vfio_iommu_iova_build_caps(iommu, &caps); + ret = vfio_iommu_migration_build_caps(iommu, &caps); if (!ret) ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_iova_build_caps(iommu, &caps); + + mutex_unlock(&iommu->lock); + if (ret) return ret; @@ -2409,22 +2996,165 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; - long ret; + struct vfio_bitmap bitmap = { 0 }; + int ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); if (copy_from_user(&unmap, (void __user *)arg, minsz)) return -EFAULT; - if (unmap.argsz < minsz || unmap.flags) + if (unmap.argsz < minsz || + unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) return -EINVAL; - ret = vfio_dma_do_unmap(iommu, &unmap); + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + unsigned long pgshift; + + if (unmap.argsz < (minsz + sizeof(bitmap))) + return -EINVAL; + + if (copy_from_user(&bitmap, + (void __user *)(arg + minsz), + sizeof(bitmap))) + return -EFAULT; + + if (!access_ok((void __user *)bitmap.data, bitmap.size)) + return -EINVAL; + + pgshift = __ffs(bitmap.pgsize); + ret = verify_bitmap_size(unmap.size >> pgshift, + bitmap.size); + if (ret) + return ret; + } + + ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); if (ret) return ret; return copy_to_user((void __user *)arg, &unmap, minsz) ? 
-EFAULT : 0; + } else if (cmd == VFIO_IOMMU_DIRTY_PAGES) { + struct vfio_iommu_type1_dirty_bitmap dirty; + uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | + VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP | + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR | + VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; + int ret = 0; + + if (!iommu->v2) + return -EACCES; + + minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, + flags); + + if (copy_from_user(&dirty, (void __user *)arg, minsz)) + return -EFAULT; + + if (dirty.argsz < minsz || dirty.flags & ~mask) + return -EINVAL; + + /* only one flag should be set at a time */ + if (__ffs(dirty.flags) != __fls(dirty.flags)) + return -EINVAL; + + if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + size_t pgsize; + + mutex_lock(&iommu->lock); + pgsize = 1 << __ffs(iommu->pgsize_bitmap); + if (!iommu->dirty_page_tracking) { + ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); + if (!ret) { + iommu->dirty_page_tracking = true; + vfio_iommu_dirty_log_switch(iommu, + true); + } + } + mutex_unlock(&iommu->lock); + return ret; + } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + mutex_lock(&iommu->lock); + if (iommu->dirty_page_tracking) { + iommu->dirty_page_tracking = false; + vfio_dma_bitmap_free_all(iommu); + vfio_iommu_dirty_log_switch(iommu, false); + } + mutex_unlock(&iommu->lock); + return 0; + } else if (dirty.flags & + (VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP | + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR | + VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP)) { + struct vfio_iommu_type1_dirty_bitmap_get range; + unsigned long pgshift; + size_t data_size = dirty.argsz - minsz; + size_t iommu_pgsize; + + if (!data_size || data_size < sizeof(range)) + return -EINVAL; + + if (copy_from_user(&range, (void __user *)(arg + minsz), + sizeof(range))) + return -EFAULT; + + if (range.iova + range.size < range.iova) + return -EINVAL; + if (!access_ok((void __user *)range.bitmap.data, + range.bitmap.size)) + return -EINVAL; + + pgshift = __ffs(range.bitmap.pgsize); + ret = verify_bitmap_size(range.size >> pgshift, + range.bitmap.size); + if (ret) + return ret; + + mutex_lock(&iommu->lock); + + iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); + + /* allow only smallest supported pgsize */ + if (range.bitmap.pgsize != iommu_pgsize) { + ret = -EINVAL; + goto out_unlock; + } + if (range.iova & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + if (!range.size || range.size & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + + if (!iommu->dirty_page_tracking) { + ret = -EINVAL; + goto out_unlock; + } + + if (dirty.flags & + (VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP | + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR)) { + iommu->dirty_log_get_no_clear = !!(dirty.flags & + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR); + + ret = vfio_iova_dirty_bitmap(range.bitmap.data, + iommu, range.iova, range.size, + range.bitmap.pgsize); + } else { + ret = vfio_iova_dirty_log_clear( + range.bitmap.data, iommu, + range.iova, range.size, + range.bitmap.pgsize); + } +out_unlock: + mutex_unlock(&iommu->lock); + + return ret; + } } return -ENOTTY; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index adbc8cbbdb423334b72b476286ab07bcded02ac9..a3f6a3356fa6554b3872ece421ebbaa17ad04b9a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -51,6 +51,12 @@ * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * 
bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above + * bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region + * bitmap_next_set_region(map, &start, &end, nbits) Find next set region + * bitmap_for_each_clear_region(map, rs, re, start, end) + * Iterate over all clear regions + * bitmap_for_each_set_region(map, rs, re, start, end) + * Iterate over all set regions * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) @@ -451,6 +457,41 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } +static inline void bitmap_next_clear_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) +{ + *rs = find_next_zero_bit(bitmap, end, *rs); + *re = find_next_bit(bitmap, end, *rs + 1); +} + +static inline void bitmap_next_set_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) +{ + *rs = find_next_bit(bitmap, end, *rs); + *re = find_next_zero_bit(bitmap, end, *rs + 1); +} + +/* + * Bitmap region iterators. Iterates over the bitmap between [@start, @end). + * @rs and @re should be integer variables and will be set to start and end + * index of the current clear or set region. + */ +#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), \ + bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, \ + bitmap_next_clear_region((bitmap), &(rs), &(re), (end))) + +#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), \ + bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, \ + bitmap_next_set_region((bitmap), &(rs), &(re), (end))) + /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ec7a13405f10bbece1606f51f7f6b27df1dcfe48..e97f082cd14730413648f1032b3796715ff2ec84 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -31,7 +31,7 @@ enum io_pgtable_fmt { * single page. IOMMUs that cannot batch TLB invalidation * operations efficiently will typically issue them here, but * others may decide to update the iommu_iotlb_gather structure - * and defer the invalidation until iommu_tlb_sync() instead. + * and defer the invalidation until iommu_iotlb_sync() instead. * * Note that these can all be called in atomic context and must therefore * not block. @@ -83,12 +83,23 @@ struct io_pgtable_cfg { * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs * on unmap, for DMA domains using the flush queue mechanism for * delayed invalidation. + * + * io_pgtable_quirk_arm_hd: support hardware management of dirty status. + * + * io_pgtable_quirk_arm_bbml1: arm smmu supports bbm level 1 behavior + * when changing block size. + * + * io_pgtable_quirk_arm_bbml2: arm smmu supports bbm level 2 behavior + * when changing block size. 
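A brief usage sketch for the region iterators documented above; the 128-bit map size and the pr_info() reporting are illustrative only:

#include <linux/bitmap.h>
#include <linux/printk.h>

/* Illustrative only: report every set region of a 128-bit map. */
static void dump_set_regions(unsigned long *map)
{
        unsigned int rs, re;    /* region start / end, as [rs, re) */

        bitmap_for_each_set_region(map, rs, re, 0, 128)
                pr_info("set region: [%u, %u)\n", rs, re);
}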
*/ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) #define IO_PGTABLE_QUIRK_TLBI_ON_MAP BIT(2) #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) + #define IO_PGTABLE_QUIRK_ARM_HD BIT(6) + #define IO_PGTABLE_QUIRK_ARM_BBML1 BIT(7) + #define IO_PGTABLE_QUIRK_ARM_BBML2 BIT(8) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; @@ -141,6 +152,18 @@ struct io_pgtable_ops { size_t size, struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); + size_t (*split_block)(struct io_pgtable_ops *ops, unsigned long iova, + size_t size); + size_t (*merge_page)(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t phys, size_t size, int prot); + int (*sync_dirty_log)(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); + int (*clear_dirty_log)(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); }; /** diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b2c148032001e85a963e6c44b2be8ee8d52d519b..4593a193236839e12489e1a373ec6c0659c11923 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -94,6 +94,7 @@ struct iommu_domain { void *handler_token; struct iommu_domain_geometry geometry; void *iova_cookie; + struct mutex switch_log_lock; }; enum iommu_cap { @@ -227,6 +228,10 @@ struct iommu_iotlb_gather { * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes + * @support_dirty_log: Check whether domain supports dirty log tracking + * @switch_dirty_log: Perform actions to start|stop dirty log tracking + * @sync_dirty_log: Sync dirty log from IOMMU into a dirty bitmap + * @clear_dirty_log: Clear dirty log of IOMMU by a mask bitmap * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges @@ -272,6 +277,22 @@ struct iommu_ops { int (*domain_set_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); + /* + * Track dirty log. Note: Don't concurrently call these interfaces with + * other ops that access underlying page table. 
+ */ + bool (*support_dirty_log)(struct iommu_domain *domain); + int (*switch_dirty_log)(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size, int prot); + int (*sync_dirty_log)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); + int (*clear_dirty_log)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); + /* Request/Free a list of reserved regions for a device */ void (*get_resv_regions)(struct device *dev, struct list_head *list); void (*put_resv_regions)(struct device *dev, struct list_head *list); @@ -426,6 +447,8 @@ extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); +extern size_t iommu_pgsize(struct iommu_domain *domain, + unsigned long addr_merge, size_t size); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); extern int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, @@ -498,6 +521,17 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, void *data); extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); +extern bool iommu_support_dirty_log(struct iommu_domain *domain); +extern int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size, int prot); +extern int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t size, unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift); +extern int iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t dma_size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); /* Window handling function prototypes */ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, @@ -508,13 +542,13 @@ extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr) extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) { if (domain->ops->flush_iotlb_all) domain->ops->flush_iotlb_all(domain); } -static inline void iommu_tlb_sync(struct iommu_domain *domain, +static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { if (domain->ops->iotlb_sync) @@ -537,7 +571,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, if (gather->pgsize != size || end < gather->start || start > gather->end) { if (gather->pgsize) - iommu_tlb_sync(domain, gather); + iommu_iotlb_sync(domain, gather); gather->pgsize = size; } @@ -708,11 +742,11 @@ static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain, return 0; } -static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) { } -static inline void iommu_tlb_sync(struct iommu_domain *domain, +static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { } @@ -891,7 +925,37 @@ static inline int iommu_domain_set_attr(struct 
iommu_domain *domain, return -EINVAL; } -static inline int iommu_device_register(struct iommu_device *iommu) +static inline bool iommu_support_dirty_log(struct iommu_domain *domain) +{ + return false; +} + +static inline int iommu_switch_dirty_log(struct iommu_domain *domain, + bool enable, unsigned long iova, + size_t size, int prot) +{ + return -EINVAL; +} + +static inline int iommu_sync_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long pgshift) +{ + return -EINVAL; +} + +static inline int iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long pgshift) +{ + return -EINVAL; +} + +static inline int iommu_device_register(struct iommu_device *iommu) { return -ENODEV; } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e42a711a2800c0f6cf02535468ba52d2c8f2d74a..da29802d62767f4f46c5b67ef27124f837486c08 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -72,7 +72,9 @@ struct vfio_iommu_driver_ops { struct iommu_group *group); void (*detach_group)(void *iommu_data, struct iommu_group *group); - int (*pin_pages)(void *iommu_data, unsigned long *user_pfn, + int (*pin_pages)(void *iommu_data, + struct iommu_group *group, + unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); int (*unpin_pages)(void *iommu_data, diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 72b4582322ff501d75a580b36ce3e2528ad06f32..6436d693d3573dc65b6077aad436502c41770250 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -129,6 +129,69 @@ TRACE_EVENT(unmap, ) ); +TRACE_EVENT(switch_dirty_log, + + TP_PROTO(unsigned long iova, size_t size, bool enable), + + TP_ARGS(iova, size, enable), + + TP_STRUCT__entry( + __field(u64, iova) + __field(size_t, size) + __field(bool, enable) + ), + + TP_fast_assign( + __entry->iova = iova; + __entry->size = size; + __entry->enable = enable; + ), + + TP_printk("IOMMU: iova=0x%016llx size=%zu enable=%u", + __entry->iova, __entry->size, __entry->enable + ) +); + +TRACE_EVENT(sync_dirty_log, + + TP_PROTO(unsigned long iova, size_t size), + + TP_ARGS(iova, size), + + TP_STRUCT__entry( + __field(u64, iova) + __field(size_t, size) + ), + + TP_fast_assign( + __entry->iova = iova; + __entry->size = size; + ), + + TP_printk("IOMMU: iova=0x%016llx size=%zu", __entry->iova, + __entry->size) +); + +TRACE_EVENT(clear_dirty_log, + + TP_PROTO(unsigned long iova, size_t size), + + TP_ARGS(iova, size), + + TP_STRUCT__entry( + __field(u64, iova) + __field(size_t, size) + ), + + TP_fast_assign( + __entry->iova = iova; + __entry->size = size; + ), + + TP_printk("IOMMU: iova=0x%016llx size=%zu", __entry->iova, + __entry->size) +); + DECLARE_EVENT_CLASS(iommu_error, TP_PROTO(struct device *dev, unsigned long iova, int flags), diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index cabc93118f9c8faadb96c31caf292c8a57af7df3..28cf3f0274fe32b7b4782f570e5ebd601f0dd38e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -46,6 +46,16 @@ */ #define VFIO_NOIOMMU_IOMMU 8 +/* + * The vfio_iommu driver may support user clears dirty log manually, which means + * dirty log can be requested to not cleared automatically after dirty log is + * copied to userspace, it's user's duty to clear dirty log. 
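A minimal sketch of how an in-kernel consumer other than VFIO might drive the dirty-log API declared above. The track_range() helper, the IOMMU_READ|IOMMU_WRITE prot choice and the PAGE_SHIFT bitmap granularity are assumptions for illustration; the caller is assumed to supply a zeroed bitmap large enough for size >> PAGE_SHIFT bits:

#include <linux/iommu.h>

static int track_range(struct iommu_domain *domain, unsigned long iova,
                       size_t size, unsigned long *bitmap)
{
        int ret;

        if (!iommu_support_dirty_log(domain))
                return -ENODEV;

        /* Start dirty tracking for [iova, iova + size). */
        ret = iommu_switch_dirty_log(domain, true, iova, size,
                                     IOMMU_READ | IOMMU_WRITE);
        if (ret)
                return ret;

        /* ... DMA runs here ... */

        /* Pull dirty state into @bitmap, one bit per PAGE_SHIFT page. */
        ret = iommu_sync_dirty_log(domain, iova, size, bitmap, iova,
                                   PAGE_SHIFT);
        if (ret)
                return ret;

        /* Re-arm tracking for the pages just harvested. */
        ret = iommu_clear_dirty_log(domain, iova, size, bitmap, iova,
                                    PAGE_SHIFT);
        if (ret)
                return ret;

        /* Stop tracking when done. */
        return iommu_switch_dirty_log(domain, false, iova, size,
                                      IOMMU_READ | IOMMU_WRITE);
}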
+ * + * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and + * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. + */ +#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -305,6 +315,7 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) #define VFIO_REGION_TYPE_GFX (1) #define VFIO_REGION_TYPE_CCW (2) +#define VFIO_REGION_TYPE_MIGRATION (3) /* sub-types for VFIO_REGION_TYPE_PCI_* */ @@ -379,6 +390,233 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +/* + * The structure vfio_device_migration_info is placed at the 0th offset of + * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related + * migration information. Field accesses from this structure are only supported + * at their native width and alignment. Otherwise, the result is undefined and + * vendor drivers should return an error. + * + * device_state: (read/write) + * - The user application writes to this field to inform the vendor driver + * about the device state to be transitioned to. + * - The vendor driver should take the necessary actions to change the + * device state. After successful transition to a given state, the + * vendor driver should return success on write(device_state, state) + * system call. If the device state transition fails, the vendor driver + * should return an appropriate -errno for the fault condition. + * - On the user application side, if the device state transition fails, + * that is, if write(device_state, state) returns an error, read + * device_state again to determine the current state of the device from + * the vendor driver. + * - The vendor driver should return previous state of the device unless + * the vendor driver has encountered an internal error, in which case + * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR. + * - The user application must use the device reset ioctl to recover the + * device from VFIO_DEVICE_STATE_ERROR state. If the device is + * indicated to be in a valid device state by reading device_state, the + * user application may attempt to transition the device to any valid + * state reachable from the current state or terminate itself. + * + * device_state consists of 3 bits: + * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear, + * it indicates the _STOP state. When the device state is changed to + * _STOP, driver should stop the device before write() returns. + * - If bit 1 is set, it indicates the _SAVING state, which means that the + * driver should start gathering device state information that will be + * provided to the VFIO user application to save the device's state. + * - If bit 2 is set, it indicates the _RESUMING state, which means that + * the driver should prepare to resume the device. Data provided through + * the migration region should be used to resume the device. + * Bits 3 - 31 are reserved for future use. To preserve them, the user + * application should perform a read-modify-write operation on this + * field when modifying the specified bits. 
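A userspace sketch of the read-modify-write rule described above. region_offset is assumed to have been obtained from VFIO_DEVICE_GET_REGION_INFO for the migration region, and the state macros are the ones added further down in this header:

#include <stdint.h>
#include <stddef.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Enter pre-copy: set _SAVING while preserving _RUNNING and reserved bits. */
static int enter_precopy(int device_fd, uint64_t region_offset)
{
        uint32_t state;
        off_t pos = region_offset +
                    offsetof(struct vfio_device_migration_info, device_state);

        if (pread(device_fd, &state, sizeof(state), pos) != sizeof(state))
                return -1;

        state |= VFIO_DEVICE_STATE_SAVING;      /* _RUNNING stays set */

        if (pwrite(device_fd, &state, sizeof(state), pos) != sizeof(state))
                return -1;

        return 0;
}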
+ * + * +------- _RESUMING + * |+------ _SAVING + * ||+----- _RUNNING + * ||| + * 000b => Device Stopped, not saving or resuming + * 001b => Device running, which is the default state + * 010b => Stop the device & save the device state, stop-and-copy state + * 011b => Device running and save the device state, pre-copy state + * 100b => Device stopped and the device state is resuming + * 101b => Invalid state + * 110b => Error state + * 111b => Invalid state + * + * State transitions: + * + * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP + * (100b) (001b) (011b) (010b) (000b) + * 0. Running or default state + * | + * + * 1. Normal Shutdown (optional) + * |------------------------------------->| + * + * 2. Save the state or suspend + * |------------------------->|---------->| + * + * 3. Save the state during live migration + * |----------->|------------>|---------->| + * + * 4. Resuming + * |<---------| + * + * 5. Resumed + * |--------->| + * + * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 1. During normal shutdown of the user application, the user application may + * optionally change the VFIO device state from _RUNNING to _STOP. This + * transition is optional. The vendor driver must support this transition but + * must not require it. + * 2. When the user application saves state or suspends the application, the + * device state transitions from _RUNNING to stop-and-copy and then to _STOP. + * On state transition from _RUNNING to stop-and-copy, driver must stop the + * device, save the device state and send it to the application through the + * migration region. The sequence to be followed for such transition is given + * below. + * 3. In live migration of user application, the state transitions from _RUNNING + * to pre-copy, to stop-and-copy, and to _STOP. + * On state transition from _RUNNING to pre-copy, the driver should start + * gathering the device state while the application is still running and send + * the device state data to application through the migration region. + * On state transition from pre-copy to stop-and-copy, the driver must stop + * the device, save the device state and send it to the user application + * through the migration region. + * Vendor drivers must support the pre-copy state even for implementations + * where no data is provided to the user before the stop-and-copy state. The + * user must not be required to consume all migration data before the device + * transitions to a new state, including the stop-and-copy state. + * The sequence to be followed for above two transitions is given below. + * 4. To start the resuming phase, the device state should be transitioned from + * the _RUNNING to the _RESUMING state. + * In the _RESUMING state, the driver should use the device state data + * received through the migration region to resume the device. + * 5. After providing saved device data to the driver, the application should + * change the state from _RESUMING to _RUNNING. + * + * reserved: + * Reads on this field return zero and writes are ignored. + * + * pending_bytes: (read only) + * The number of pending bytes still to be migrated from the vendor driver. + * + * data_offset: (read only) + * The user application should read data_offset field from the migration + * region. The user application should read the device data from this + * offset within the migration region during the _SAVING state or write + * the device data during the _RESUMING state. See below for details of + * sequence to be followed. 
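For concreteness, the device_state values written during transition 3 (live migration) above, summarised as a short sketch that follows the bit diagram:

/*
 * Live migration, source side (transition 3), as successive device_state
 * writes by the user application:
 *
 *   001b  _RUNNING              normal operation
 *   011b  _SAVING | _RUNNING    pre-copy: iterate over the data section
 *   010b  _SAVING               stop-and-copy: drain the remaining state
 *   000b  stopped               source side done
 *
 * The destination side instead goes 100b (_RESUMING) -> 001b (_RUNNING)
 * once all saved data has been written back (transitions 4 and 5).
 */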
+ * + * data_size: (read/write) + * The user application should read data_size to get the size in bytes of + * the data copied in the migration region during the _SAVING state and + * write the size in bytes of the data copied in the migration region + * during the _RESUMING state. + * + * The format of the migration region is as follows: + * ------------------------------------------------------------------ + * |vfio_device_migration_info| data section | + * | | /////////////////////////////// | + * ------------------------------------------------------------------ + * ^ ^ + * offset 0-trapped part data_offset + * + * The structure vfio_device_migration_info is always followed by the data + * section in the region, so data_offset will always be nonzero. The offset + * from where the data is copied is decided by the kernel driver. The data + * section can be trapped, mmapped, or partitioned, depending on how the kernel + * driver defines the data section. The data section partition can be defined + * as mapped by the sparse mmap capability. If mmapped, data_offset must be + * page aligned, whereas initial section which contains the + * vfio_device_migration_info structure, might not end at the offset, which is + * page aligned. The user is not required to access through mmap regardless + * of the capabilities of the region mmap. + * The vendor driver should determine whether and how to partition the data + * section. The vendor driver should return data_offset accordingly. + * + * The sequence to be followed while in pre-copy state and stop-and-copy state + * is as follows: + * a. Read pending_bytes, indicating the start of a new iteration to get device + * data. Repeated read on pending_bytes at this stage should have no side + * effects. + * If pending_bytes == 0, the user application should not iterate to get data + * for that device. + * If pending_bytes > 0, perform the following steps. + * b. Read data_offset, indicating that the vendor driver should make data + * available through the data section. The vendor driver should return this + * read operation only after data is available from (region + data_offset) + * to (region + data_offset + data_size). + * c. Read data_size, which is the amount of data in bytes available through + * the migration region. + * Read on data_offset and data_size should return the offset and size of + * the current buffer if the user application reads data_offset and + * data_size more than once here. + * d. Read data_size bytes of data from (region + data_offset) from the + * migration region. + * e. Process the data. + * f. Read pending_bytes, which indicates that the data from the previous + * iteration has been read. If pending_bytes > 0, go to step b. + * + * The user application can transition from the _SAVING|_RUNNING + * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the + * number of pending bytes. The user application should iterate in _SAVING + * (stop-and-copy) until pending_bytes is 0. + * + * The sequence to be followed while _RESUMING device state is as follows: + * While data for this device is available, repeat the following steps: + * a. Read data_offset from where the user application should write data. + * b. Write migration data starting at the migration region + data_offset for + * the length determined by data_size from the migration source. + * c. Write data_size, which indicates to the vendor driver that data is + * written in the migration region. 
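A condensed userspace sketch of the _SAVING loop in steps a.-f. above. device_fd, region_offset, the MIG_OFF() shorthand and the process() callback are assumptions for illustration; the data section is read with pread(), which the region documentation permits even when mmap is available:

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Shorthand (hypothetical) for a field offset inside the migration region. */
#define MIG_OFF(field) \
        (region_offset + offsetof(struct vfio_device_migration_info, field))

static int save_iterate(int device_fd, uint64_t region_offset,
                        int (*process)(const void *buf, uint64_t len))
{
        uint64_t pending, data_offset, data_size;
        void *buf;

        for (;;) {
                /* a. pending_bytes == 0 means nothing (more) to fetch */
                if (pread(device_fd, &pending, sizeof(pending),
                          MIG_OFF(pending_bytes)) != sizeof(pending))
                        return -1;
                if (!pending)
                        return 0;

                /* b. + c. learn where the chunk lives and how big it is */
                if (pread(device_fd, &data_offset, sizeof(data_offset),
                          MIG_OFF(data_offset)) != sizeof(data_offset))
                        return -1;
                if (pread(device_fd, &data_size, sizeof(data_size),
                          MIG_OFF(data_size)) != sizeof(data_size))
                        return -1;

                /* d. copy the chunk out of the data section */
                buf = malloc(data_size);
                if (!buf)
                        return -1;
                if (pread(device_fd, buf, data_size,
                          region_offset + data_offset) != (ssize_t)data_size) {
                        free(buf);
                        return -1;
                }

                /* e. hand the opaque data to the migration stream */
                process(buf, data_size);
                free(buf);
                /* f. looping back re-reads pending_bytes for the next chunk */
        }
}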
Vendor driver must return this write + * operations on consuming data. Vendor driver should apply the + * user-provided migration region data to the device resume state. + * + * If an error occurs during the above sequences, the vendor driver can return + * an error code for next read() or write() operation, which will terminate the + * loop. The user application should then take the next necessary action, for + * example, failing migration or terminating the user application. + * + * For the user application, data is opaque. The user application should write + * data in the same order as the data is received and the data should be of + * same transaction size at the source. + */ + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? \ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + +#define VFIO_DEVICE_STATE_IS_ERROR(state) \ + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING)) + +#define VFIO_DEVICE_STATE_SET_ERROR(state) \ + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + + __u32 reserved; + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped * which allows direct access to non-MSIX registers which happened to be within @@ -763,6 +1001,29 @@ struct vfio_iommu_type1_info_dma_avail { __u32 avail; }; +/* + * The migration capability allows to report supported features for migration. + * + * The structures below define version 1 of this capability. + * + * The existence of this capability indicates that IOMMU kernel driver supports + * dirty page logging. + * + * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty + * page logging. + * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap + * size in bytes that can be used by user applications when getting the dirty + * bitmap. + */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 2 + +struct vfio_iommu_type1_info_cap_migration { + struct vfio_info_cap_header header; + __u32 flags; + __u64 pgsize_bitmap; + __u64 max_dirty_bitmap_size; /* in bytes */ +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** @@ -783,6 +1044,12 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) +struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ + __u64 __user *data; /* one bit per page */ +}; + /** * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, * struct vfio_dma_unmap) @@ -792,12 +1059,23 @@ struct vfio_iommu_type1_dma_map { * field. No guarantee is made to the user that arbitrary unmaps of iova * or size different from those used in the original mapping call will * succeed. + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap + * before unmapping IO virtual addresses. When this flag is set, the user must + * provide a struct vfio_bitmap in data[]. User must provide zero-allocated + * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field. 
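A userspace sketch of the unmap-with-bitmap flag described above. container_fd, iova and len are illustrative; pgsize should be a size advertised by the container, dirty tracking is assumed to have been started already via VFIO_IOMMU_DIRTY_PAGES, and the bitmap is sized in whole u64 words as the type1 driver expects:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int unmap_and_get_dirty(int container_fd, uint64_t iova, uint64_t len,
                               uint64_t pgsize, uint64_t **dirty_out)
{
        struct vfio_iommu_type1_dma_unmap *unmap;
        struct vfio_bitmap *bitmap;
        uint64_t npages = len / pgsize;
        int ret;

        unmap = calloc(1, sizeof(*unmap) + sizeof(*bitmap));
        if (!unmap)
                return -1;

        unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
        unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
        unmap->iova = iova;
        unmap->size = len;

        bitmap = (struct vfio_bitmap *)unmap->data;
        bitmap->pgsize = pgsize;
        /* one bit per page, rounded up to whole u64 words, zero-filled */
        bitmap->size = ((npages + 63) / 64) * 8;
        bitmap->data = calloc(1, bitmap->size);
        if (!bitmap->data) {
                free(unmap);
                return -1;
        }

        ret = ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
        if (!ret)
                *dirty_out = (uint64_t *)bitmap->data;  /* caller frees */
        else
                free(bitmap->data);
        free(unmap);
        return ret;
}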
+ * A bit in the bitmap represents one page, of user provided page size in + * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set + * indicates that the page at that offset from iova is dirty. A Bitmap of the + * pages in the range of unmapped size is returned in the user-provided + * vfio_bitmap.data. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ + __u8 data[]; }; #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) @@ -809,6 +1087,81 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_type1_dirty_bitmap) + * IOCTL is used for dirty pages logging. + * Caller should set flag depending on which operation to perform, details as + * below: + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs + * the IOMMU driver to log pages that are dirtied or potentially dirtied by + * the device; designed to be used when a migration is in progress. Dirty pages + * are logged until logging is disabled by user application by calling the IOCTL + * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs + * the IOMMU driver to stop logging dirtied pages. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set + * returns the dirty pages bitmap for IOMMU container for a given IOVA range. + * The user must specify the IOVA range and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports getting a bitmap of the smallest supported pgsize only and can be + * modified in future to get a bitmap of any specified supported pgsize. The + * user must provide a zeroed memory area for the bitmap memory and specify its + * size in bitmap.size. One bit is used to represent one page consecutively + * starting from iova offset. The user should provide page size in bitmap.pgsize + * field. A bit set in the bitmap indicates that the page at that offset from + * iova is dirty. The caller must set argsz to a value including the size of + * structure vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the + * actual bitmap. If dirty pages logging is not enabled, an error will be + * returned. + * + * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost same as + * VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that it requires underlying + * dirty bitmap is not cleared automatically. The user can clear it manually by + * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, + * instructs the IOMMU driver to clear the dirty status of pages in a bitmap + * for IOMMU container for a given IOVA range. The user must specify the IOVA + * range, the bitmap and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports clearing a bitmap of the smallest supported pgsize only and can be + * modified in future to clear a bitmap of any specified supported pgsize. The + * user must provide a memory area for the bitmap memory and specify its size + * in bitmap.size. 
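Tying the GET_BITMAP_NOCLEAR and CLEAR_BITMAP flags together, a userspace sketch of the manual-clear flow. container_fd is illustrative, the structures and flag values are the ones added just below, @pgsize must be the smallest page size reported by the container, @iova/@len must be aligned to it, and @bits must be zero-filled and @bits_size bytes long:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int get_then_clear_dirty(int container_fd, uint64_t iova, uint64_t len,
                                uint64_t pgsize, uint64_t *bits,
                                uint64_t bits_size)
{
        struct vfio_iommu_type1_dirty_bitmap *dbitmap;
        struct vfio_iommu_type1_dirty_bitmap_get *range;
        size_t argsz = sizeof(*dbitmap) + sizeof(*range);
        int ret;

        /* The container advertises manual clearing as an extension. */
        if (ioctl(container_fd, VFIO_CHECK_EXTENSION,
                  VFIO_DIRTY_LOG_MANUAL_CLEAR) <= 0)
                return -1;

        dbitmap = calloc(1, argsz);
        if (!dbitmap)
                return -1;

        dbitmap->argsz = argsz;
        dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR;
        range = (struct vfio_iommu_type1_dirty_bitmap_get *)dbitmap->data;
        range->iova = iova;
        range->size = len;
        range->bitmap.pgsize = pgsize;
        range->bitmap.size = bits_size;
        range->bitmap.data = bits;

        ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
        if (ret)
                goto out;

        /* Clear (re-arm) the reported pages first, then handle them. */
        dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP;
        ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
out:
        free(dbitmap);
        return ret;
}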
One bit is used to represent one page consecutively starting + * from iova offset. The user should provide page size in bitmap.pgsize field. + * A bit set in the bitmap indicates that the page at that offset from iova is + * cleared the dirty status, and dirty tracking is re-enabled for that page. The + * caller must set argsz to a value including the size of structure + * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If + * dirty pages logging is not enabled, an error will be returned. Note: user + * should clear dirty log before handle corresponding dirty pages. + * + * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be + * specified at a time. + */ +struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) + __u8 data[]; +}; + +struct vfio_iommu_type1_dirty_bitmap_get { + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of iova range */ + struct vfio_bitmap bitmap; +}; + +#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /* diff --git a/kernel/dma/phytium/pswiotlb-iommu.c b/kernel/dma/phytium/pswiotlb-iommu.c index ca729cea9826a81ca9423aba8afd4332563f4095..1173d5c068f1c9a77015003bcfe95a1ab1f86088 100644 --- a/kernel/dma/phytium/pswiotlb-iommu.c +++ b/kernel/dma/phytium/pswiotlb-iommu.c @@ -78,8 +78,6 @@ struct iommu_dma_cookie { * The following functions are ported from * ./drivers/iommu/dma-iommu.c * ./drivers/iommu/iommu.c - * static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, - * phys_addr_t paddr, size_t size, size_t *count); * static int __iommu_map(struct iommu_domain *domain, unsigned long iova, * phys_addr_t paddr, size_t size, int prot, gfp_t gfp); * static int dma_info_to_prot(enum dma_data_direction dir, bool coherent, @@ -96,40 +94,6 @@ struct iommu_dma_cookie { * int nents, dma_addr_t dma_addr); * static void __invalidate_sg(struct scatterlist *sg, int nents); */ -static size_t iommu_pgsize(struct iommu_domain *domain, - unsigned long addr_merge, size_t size) -{ - unsigned int pgsize_idx; - size_t pgsize; - - /* Max page size that still fits into 'size' */ - pgsize_idx = __fls(size); - - /* need to consider alignment requirements ? 
*/ - if (likely(addr_merge)) { - /* Max page size allowed by address */ - unsigned int align_pgsize_idx = __ffs(addr_merge); - - pgsize_idx = min(pgsize_idx, align_pgsize_idx); - } - - /* build a mask of acceptable page sizes */ - pgsize = (1UL << (pgsize_idx + 1)) - 1; - - /* throw away page sizes not supported by the hardware */ - pgsize &= domain->pgsize_bitmap; - - /* make sure we're still sane */ - BUG_ON(!pgsize); - - /* pick the biggest page */ - pgsize_idx = __fls(pgsize); - pgsize = 1UL << pgsize_idx; - - return pgsize; -} - - static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { @@ -367,7 +331,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, WARN_ON(unmapped != size); if (!cookie->fq_domain) - iommu_tlb_sync(domain, &iotlb_gather); + iommu_iotlb_sync(domain, &iotlb_gather); iommu_dma_free_iova(cookie, dma_addr, size); } diff --git a/mm/percpu.c b/mm/percpu.c index 806bc16f88eb82529615d529d3e8bfe657790972..63a3f6f0a42d712a704eb9d698e01c0803c98d40 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -270,33 +270,6 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, pcpu_unit_page_offset(cpu, page_idx); } -static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - -static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_bit(bitmap, end, *rs); - *re = find_next_zero_bit(bitmap, end, *rs + 1); -} - -/* - * Bitmap region iterators. Iterates over the bitmap between - * [@start, @end) in @chunk. @rs and @re should be integer variables - * and will be set to start and end index of the current free region. - */ -#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end))) - -#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) - /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. 
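A worked example for the page-size helper removed above (callers now use the iommu_pgsize() export from the iommu core instead); the numbers are illustrative:

/*
 * Illustrative values:
 *   domain->pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G
 *   addr_merge = iova | paddr = 0x40200000
 *   size = 0x300000 (3 MiB)
 *
 *   __fls(size)       = 21   largest size that fits is 2 MiB
 *   __ffs(addr_merge) = 21   alignment also permits 2 MiB
 *   acceptable mask   = (1UL << 22) - 1; & pgsize_bitmap = SZ_4K | SZ_2M
 *
 * The helper therefore returns SZ_2M: one 2 MiB mapping is made and the
 * caller loops again for the remaining 1 MiB, which falls back to 4 KiB
 * pages.
 */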
@@ -732,9 +705,8 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) } bits = 0; - pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); - } } /** @@ -749,7 +721,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - int rs, re, start; /* region start, region end */ + unsigned int rs, re, start; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -765,10 +737,9 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - pcpu_for_each_unpop_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) { + bitmap_for_each_clear_region(alloc_map, rs, re, start, + PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, rs, re); - } } /** @@ -1041,13 +1012,13 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); rs = page_start; - pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); if (rs >= page_end) return true; @@ -1702,13 +1673,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, /* populate if not all pages are already there */ if (!is_atomic) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - pcpu_for_each_unpop_region(chunk->populated, rs, re, - page_start, page_end) { + bitmap_for_each_clear_region(chunk->populated, rs, re, + page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -1858,10 +1829,10 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { - int rs, re; + unsigned int rs, re; - pcpu_for_each_pop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + bitmap_for_each_set_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -1893,7 +1864,7 @@ static void pcpu_balance_workfn(struct work_struct *work) } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { - int nr_unpop = 0, rs, re; + unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; @@ -1910,9 +1881,9 @@ static void pcpu_balance_workfn(struct work_struct *work) continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { - int nr = min(re - rs, nr_to_pop); + bitmap_for_each_clear_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { + int nr = min_t(int, re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) {