From 5a3b9528d70d74e3c261d664ef57c1e1f5bcea63 Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Thu, 1 Dec 2022 12:01:24 +0800
Subject: [PATCH 1/6] iommu/vt-d: Add a fix for devices need extra dtlb flush

commit e65a6897be5e4939d477c4969a05e12d90b08409 upstream.

QAT devices on Intel Sapphire Rapids and Emerald Rapids have a defect in
address translation service (ATS). These devices may inadvertently issue
ATS invalidation completion before posted writes initiated with
translated address that utilized translations matching the invalidation
address range, violating the invalidation completion ordering.

This patch adds an extra device TLB invalidation for the affected
devices. It is needed to ensure that no more posted writes with
translated address follow the invalidation completion, so the ordering
is preserved and data corruption is prevented.

Device TLBs are invalidated under the following six conditions:
1. Device driver does DMA API unmap IOVA
2. Device driver unbind a PASID from a process, sva_unbind_device()
3. PASID is torn down, after PASID cache is flushed. e.g. process
   exit_mmap() due to crash
4. Under SVA usage, called by mmu_notifier.invalidate_range() where VM
   has to free pages that were unmapped
5. userspace driver unmaps a DMA buffer
6. Cache invalidation in vSVA usage (upcoming)

For #1 and #2, device drivers are responsible for stopping DMA traffic
before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
invalidate the TLB the same way as a normal user unmap, which will do an
extra invalidation. The dTLB invalidation after PASID cache flush does
not need an extra invalidation.

Therefore, we only need to deal with #4 and #5 in this patch. #1 is also
covered by this patch due to the common code path with #5.

Intel-SIG: commit e65a6897be5e iommu/vt-d: Add a fix for devices need
extra dtlb flush
Backport for SPR/EMR vt-d critical fix.

Tested-by: Yuzhang Luo
Reviewed-by: Ashok Raj
Reviewed-by: Kevin Tian
Signed-off-by: Jacob Pan
Link: https://lore.kernel.org/r/20221130062449.1360063-1-jacob.jun.pan@linux.intel.com
Signed-off-by: Lu Baolu
Signed-off-by: Joerg Roedel
(cherry picked from commit e65a6897be5e4939d477c4969a05e12d90b08409)
[ Ethan Zhao: amend commit log ]
Signed-off-by: Ethan Zhao
---
 drivers/iommu/intel/iommu.c | 68 ++++++++++++++++++++++++++++++++++++-
 drivers/iommu/intel/svm.c   |  5 ++-
 include/linux/intel-iommu.h |  4 +++
 3 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 5d06cc2b740c..707d8656b5fe 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1686,6 +1686,24 @@ static void domain_update_iotlb(struct dmar_domain *domain)
 	domain->has_iotlb_device = has_iotlb_device;
 }
 
+/*
+ * The extra devTLB flush quirk impacts those QAT devices with PCI device
+ * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
+ * check because it applies only to the built-in QAT devices and it doesn't
+ * grant additional privileges.
+ */
+#define BUGGY_QAT_DEVID_MASK 0x494c
+static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
+{
+	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
+		return false;
+
+	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
+		return false;
+
+	return true;
+}
+
 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
 {
 	struct pci_dev *pdev;
@@ -1773,6 +1791,7 @@ static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
 	qdep = info->ats_qdep;
 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
 			   qdep, addr, mask);
+	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
 }
 
 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
@@ -2806,8 +2825,10 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
 		}
 
 		if (info->ats_supported && ecap_prs(iommu->ecap) &&
-		    pci_pri_supported(pdev))
+		    pci_pri_supported(pdev)) {
 			info->pri_supported = 1;
+			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
+		}
 	}
 }
 
@@ -6773,3 +6794,48 @@ static void __init check_tylersburg_isoch(void)
 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
 		vtisochctrl);
 }
+
+/*
+ * Here we deal with a device TLB defect where device may inadvertently issue ATS
+ * invalidation completion before posted writes initiated with translated address
+ * that utilized translations matching the invalidation address range, violating
+ * the invalidation completion ordering.
+ * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
+ * vulnerable to this defect. In other words, any dTLB invalidation initiated not
+ * under the control of the trusted/privileged host device driver must use this
+ * quirk.
+ * Device TLBs are invalidated under the following six conditions:
+ * 1. Device driver does DMA API unmap IOVA
+ * 2. Device driver unbind a PASID from a process, sva_unbind_device()
+ * 3. PASID is torn down, after PASID cache is flushed. e.g. process
+ *    exit_mmap() due to crash
+ * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
+ *    VM has to free pages that were unmapped
+ * 5. Userspace driver unmaps a DMA buffer
+ * 6. Cache invalidation in vSVA usage (upcoming)
+ *
+ * For #1 and #2, device drivers are responsible for stopping DMA traffic
+ * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
+ * invalidate TLB the same way as normal user unmap which will use this
+ * quirk. The dTLB invalidation after PASID cache flush does not need
+ * this quirk.
+ *
+ * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
+ */
+void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
+			       unsigned long address, unsigned long mask,
+			       u32 pasid, u16 qdep)
+{
+	u16 sid;
+
+	if (likely(!info->dtlb_extra_inval))
+		return;
+
+	sid = PCI_DEVID(info->bus, info->devfn);
+	if (pasid == PASID_RID2PASID) {
+		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+				   qdep, address, mask);
+	} else {
+		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
+					 pasid, qdep, address, mask);
+	}
+}
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index a932e0a8cc0f..fb8eb4ae7673 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -105,10 +105,13 @@ static void __flush_svm_range_dev(struct intel_svm *svm,
 		return;
 
 	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
-	if (info->ats_enabled)
+	if (info->ats_enabled) {
 		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
 					 svm->pasid, sdev->qdep, address,
 					 order_base_2(pages));
+		quirk_extra_dev_tlb_flush(info, address, order_base_2(pages),
+					  svm->pasid, sdev->qdep);
+	}
 }
 
 static inline bool intel_svm_capable(struct intel_iommu *iommu)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 8af9abb73b6c..3320b18b6364 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -652,6 +652,7 @@ struct device_domain_info {
 	u8 pri_enabled:1;
 	u8 ats_supported:1;
 	u8 ats_enabled:1;
+	u8 dtlb_extra_inval:1;	/* Quirk for devices need extra flush */
 	u8 auxd_enabled:1;	/* Multiple domains per device */
 	u8 ats_qdep;
 	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
@@ -747,6 +748,9 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
 void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
 			  u32 pasid);
 
+void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
+			       unsigned long address, unsigned long mask,
+			       u32 pasid, u16 qdep);
 int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
 		   unsigned int count, unsigned long options);
 
 /*
--
Gitee

From a0ce466756a75422dd5a33f9781b3a35dacca066 Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Mon, 5 Dec 2022 21:14:12 +0800
Subject: [PATCH 2/6] iommu/vt-d: Fix buggy QAT device mask

commit 81c95fbaebfa5990c3c786c8c3e87426a33106fe upstream.

The impacted QAT device IDs that need the extra dtlb flush quirk range
from 0x4940 to 0x4943. After a bitwise AND of the device ID with the
mask 0xfffc, the result should be compared against 0x4940, not 0x494c,
to identify these devices.

Intel-SIG: commit 81c95fbaebfa iommu/vt-d: Fix buggy QAT device mask
Backport for SPR/EMR vt-d critical fix.

Fixes: e65a6897be5e ("iommu/vt-d: Add a fix for devices need extra dtlb flush")
Reported-by: Raghunathan Srinivasan
Signed-off-by: Ashok Raj
Signed-off-by: Jacob Pan
Link: https://lore.kernel.org/r/20221203005610.2927487-1-jacob.jun.pan@linux.intel.com
Signed-off-by: Lu Baolu
Signed-off-by: Joerg Roedel
(cherry picked from commit 81c95fbaebfa5990c3c786c8c3e87426a33106fe)
[ Ethan Zhao: amend commit log ]
Signed-off-by: Ethan Zhao
---
 drivers/iommu/intel/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 707d8656b5fe..77c56f6bce8d 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1692,7 +1692,7 @@ static void domain_update_iotlb(struct dmar_domain *domain)
  * check because it applies only to the built-in QAT devices and it doesn't
  * grant additional privileges.
  */
-#define BUGGY_QAT_DEVID_MASK 0x494c
+#define BUGGY_QAT_DEVID_MASK 0x4940
 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
 {
 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
--
Gitee

From d2d18e56a0ee912efd8b1edf4bc32cbea442c0c5 Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Tue, 23 Aug 2022 14:15:54 +0800
Subject: [PATCH 3/6] iommu/vt-d: Fix kdump kernels boot failure with scalable
 mode

commit 0c5f6c0d8201a809a6585b07b6263e9db2c874a3 upstream.

The translation table copying code for kdump kernels is currently based
on the extended root/context entry formats of ECS mode defined in older
VT-d v2.5, and doesn't handle the scalable mode formats. This causes the
kexec capture kernel to fail to boot with DMAR faults if the IOMMU was
enabled in scalable mode by the previous kernel.

The ECS mode has already been deprecated by the VT-d spec since v3.0 and
the Intel IOMMU driver doesn't support this mode as there's no real
hardware implementation. Hence this converts the ECS checking in the
copying table code into scalable mode checking.

The existing copying code consumes a bit in the context entry as a mark
of copied entry. It needs to work for the old format as well as for the
extended context entries. As it's hard to find such a common bit for
both legacy and scalable mode context entries, this replaces it with a
per-IOMMU bitmap.

Intel-SIG: commit 0c5f6c0d8201 iommu/vt-d: Fix kdump kernels boot failure
with scalable mode
Backport for SPR/EMR vt-d critical fix.

Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support")
Cc: stable@vger.kernel.org
Reported-by: Jerry Snitselaar
Tested-by: Wen Jin
Signed-off-by: Lu Baolu
Link: https://lore.kernel.org/r/20220817011035.3250131-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel
(cherry picked from commit 0c5f6c0d8201a809a6585b07b6263e9db2c874a3)
[ Ethan Zhao: amend commit log ]
Signed-off-by: Ethan Zhao
---
 drivers/iommu/intel/iommu.c | 99 ++++++++++++++++---------------------
 include/linux/intel-iommu.h |  9 ++--
 2 files changed, 49 insertions(+), 59 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 77c56f6bce8d..3f2c44e5b46b 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -230,38 +230,6 @@ static phys_addr_t root_entry_uctp(struct root_entry *re)
 	return re->hi & VTD_PAGE_MASK;
 }
 
-static inline void context_clear_pasid_enable(struct context_entry *context)
-{
-	context->lo &= ~(1ULL << 11);
-}
-
-static inline bool context_pasid_enabled(struct context_entry *context)
-{
-	return !!(context->lo & (1ULL << 11));
-}
-
-static inline void context_set_copied(struct context_entry *context)
-{
-	context->hi |= (1ull << 3);
-}
-
-static inline bool context_copied(struct context_entry *context)
-{
-	return !!(context->hi & (1ULL << 3));
-}
-
-static inline bool __context_present(struct context_entry *context)
-{
-	return (context->lo & 1);
-}
-
-bool context_present(struct context_entry *context)
-{
-	return context_pasid_enabled(context) ?
-	       __context_present(context) :
-	       __context_present(context) && !context_copied(context);
-}
-
 static inline void context_set_present(struct context_entry *context)
 {
 	context->lo |= 1;
@@ -309,6 +277,26 @@ static inline void context_clear_entry(struct context_entry *context)
 	context->hi = 0;
 }
 
+static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	if (!iommu->copied_tables)
+		return false;
+
+	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
+static inline void
+set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
+static inline void
+clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
 /*
  * This domain is a statically identity mapping domain.
  *	1. This domain creats a static 1:1 mapping to all usable memory.
@@ -831,6 +819,13 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 	struct context_entry *context;
 	u64 *entry;
 
+	/*
+	 * Except that the caller requested to allocate a new entry,
+	 * returning a copied context entry makes no sense.
+	 */
+	if (!alloc && context_copied(iommu, bus, devfn))
+		return NULL;
+
 	entry = &root->lo;
 	if (sm_supported(iommu)) {
 		if (devfn >= 0x80) {
@@ -2061,6 +2056,10 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
 	}
 
 	g_iommus[iommu->seq_id] = NULL;
+	if (iommu->copied_tables) {
+		bitmap_free(iommu->copied_tables);
+		iommu->copied_tables = NULL;
+	}
 
 	/* free context mapping */
 	free_context_table(iommu);
@@ -2280,7 +2279,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 		goto out_unlock;
 
 	ret = 0;
-	if (context_present(context))
+	if (context_present(context) && !context_copied(iommu, bus, devfn))
 		goto out_unlock;
 
 	/*
@@ -2292,7 +2291,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 	 * in-flight DMA will exist, and we don't need to worry anymore
 	 * hereafter.
 	 */
-	if (context_copied(context)) {
+	if (context_copied(iommu, bus, devfn)) {
 		u16 did_old = context_domain_id(context);
 
 		if (did_old < cap_ndoms(iommu->cap)) {
@@ -2303,6 +2302,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
 						 DMA_TLB_DSI_FLUSH);
 		}
+
+		clear_context_copied(iommu, bus, devfn);
 	}
 
 	context_clear_entry(context);
@@ -3212,32 +3213,14 @@ static int copy_context_table(struct intel_iommu *iommu,
 		/* Now copy the context entry */
 		memcpy(&ce, old_ce + idx, sizeof(ce));
 
-		if (!__context_present(&ce))
+		if (!context_present(&ce))
 			continue;
 
 		did = context_domain_id(&ce);
 		if (did >= 0 && did < cap_ndoms(iommu->cap))
 			set_bit(did, iommu->domain_ids);
 
-		/*
-		 * We need a marker for copied context entries. This
-		 * marker needs to work for the old format as well as
-		 * for extended context entries.
-		 *
-		 * Bit 67 of the context entry is used. In the old
-		 * format this bit is available to software, in the
-		 * extended format it is the PGE bit, but PGE is ignored
-		 * by HW if PASIDs are disabled (and thus still
-		 * available).
-		 *
-		 * So disable PASIDs first and then mark the entry
-		 * copied. This means that we don't copy PASID
-		 * translations from the old kernel, but this is fine as
-		 * faults there are not fatal.
-		 */
-		context_clear_pasid_enable(&ce);
-		context_set_copied(&ce);
-
+		set_context_copied(iommu, bus, devfn);
 		new_ce[idx] = ce;
 	}
 
@@ -3264,8 +3247,8 @@ static int copy_translation_tables(struct intel_iommu *iommu)
 	bool new_ext, ext;
 
 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
-	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
-	new_ext    = !!ecap_ecs(iommu->ecap);
+	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
+	new_ext    = !!sm_supported(iommu);
 
 	/*
 	 * The RTT bit can only be changed when translation is disabled,
@@ -3276,6 +3259,10 @@ static int copy_translation_tables(struct intel_iommu *iommu)
 	if (new_ext != ext)
 		return -EINVAL;
 
+	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
+	if (!iommu->copied_tables)
+		return -ENOMEM;
+
 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
 	if (!old_rt_phys)
 		return -EINVAL;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 3320b18b6364..f1bcae53d2d9 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -197,7 +197,6 @@
 #define ecap_dis(e)		(((e) >> 27) & 0x1)
 #define ecap_nest(e)		(((e) >> 26) & 0x1)
 #define ecap_mts(e)		(((e) >> 25) & 0x1)
-#define ecap_ecs(e)		(((e) >> 24) & 0x1)
 #define ecap_iotlb_offset(e)	((((e) >> 8) & 0x3ff) * 16)
 #define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
 #define ecap_coherent(e)	((e) & 0x1)
@@ -265,7 +264,6 @@
 #define DMA_GSTS_CFIS (((u32)1) << 23)
 
 /* DMA_RTADDR_REG */
-#define DMA_RTADDR_RTT (((u64)1) << 11)
 #define DMA_RTADDR_SMT (((u64)1) << 10)
 
 /* CCMD_REG */
@@ -593,6 +591,7 @@ struct intel_iommu {
 
 #ifdef CONFIG_INTEL_IOMMU
 	unsigned long	*domain_ids; /* bitmap of domains */
+	unsigned long	*copied_tables; /* bitmap of copied tables */
 	struct dmar_domain ***domains; /* ptr to domains */
 	spinlock_t	lock; /* protect context, domain ids */
 	struct root_entry *root_entry; /* virtual address */
@@ -724,6 +723,11 @@ static inline int nr_pte_to_next_page(struct dma_pte *pte)
 		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
 }
 
+static inline bool context_present(struct context_entry *context)
+{
+	return (context->lo & 1);
+}
+
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
 extern int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu);
 
@@ -830,7 +834,6 @@ static inline void intel_iommu_debugfs_init(void) {}
 #endif /* CONFIG_INTEL_IOMMU_DEBUGFS */
 
 extern const struct attribute_group *intel_iommu_groups[];
-bool context_present(struct context_entry *context);
 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 					 u8 devfn, int alloc);
--
Gitee

From 487bfec4e174ba462412bd1043ea5c2a2b5c80e1 Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Thu, 16 Feb 2023 21:08:14 +0800
Subject: [PATCH 4/6] iommu/vt-d: Avoid superfluous IOTLB tracking in lazy mode

commit 16a75bbe480c3598b3af57a2504ea89b1e32c3ac upstream.

Intel IOMMU driver implements IOTLB flush queue with domain selective
or PASID selective invalidations. In this case there's no need to track
IOVA page range and sync IOTLBs, which may cause a significant
performance hit.

This patch adds a check to avoid IOVA gather page and IOTLB sync for
the lazy path.

The performance on a Sapphire Rapids 100Gb NIC is improved as follows
(as measured by iperf send):

w/o this fix: ~48 Gbits/s
with this fix: ~54 Gbits/s

Intel-SIG: commit 16a75bbe480c iommu/vt-d: Avoid superfluous IOTLB
tracking in lazy mode
Backport for SPR/EMR vt-d critical fix.
Cc:
Fixes: 2a2b8eaa5b25 ("iommu: Handle freelists when using deferred flushing in iommu drivers")
Reviewed-by: Robin Murphy
Reviewed-by: Kevin Tian
Tested-by: Sanjay Kumar
Signed-off-by: Sanjay Kumar
Signed-off-by: Jacob Pan
Link: https://lore.kernel.org/r/20230209175330.1783556-1-jacob.jun.pan@linux.intel.com
Signed-off-by: Lu Baolu
Signed-off-by: Joerg Roedel
(cherry picked from commit 16a75bbe480c3598b3af57a2504ea89b1e32c3ac)
[ Ethan Zhao: amend commit log ]
Signed-off-by: Ethan Zhao
---
 drivers/iommu/intel/iommu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3f2c44e5b46b..c165866e8741 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5385,7 +5385,12 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
 	if (dmar_domain->max_addr == iova + size)
 		dmar_domain->max_addr = iova;
 
-	iommu_iotlb_gather_add_page(domain, gather, iova, size);
+	/*
+	 * We do not use page-selective IOTLB invalidation in flush queue,
+	 * so there is no need to track page and sync iotlb.
+	 */
+	if (!(gather && gather->queued))
+		iommu_iotlb_gather_add_page(domain, gather, iova, size);
 
 	return size;
 }
--
Gitee

From f1cebd957dabfd64704148b05c930b0909aa8e80 Mon Sep 17 00:00:00 2001
From: "David E. Box"
Date: Tue, 7 Dec 2021 17:50:10 -0800
Subject: [PATCH 5/6] PCI: Add #defines for accessing PCIe DVSEC fields

commit 80b3485f7d7bdb2468f2a9d6a346a1132d248309 upstream.

Add #defines for accessing Vendor ID, Revision, Length, and ID offsets
in the Designated Vendor Specific Extended Capability (DVSEC). Defined
in PCIe r5.0, sec 7.9.6.

Intel-SIG: commit 80b3485f7d7b PCI: Add #defines for accessing PCIe
DVSEC fields
Backport for SPR/EMR support.

Reviewed-by: Rafael J. Wysocki
Acked-by: Bjorn Helgaas
Signed-off-by: David E. Box
Link: https://lore.kernel.org/r/20211208015015.891275-2-david.e.box@linux.intel.com
Signed-off-by: Greg Kroah-Hartman
(cherry picked from commit 80b3485f7d7bdb2468f2a9d6a346a1132d248309)
[ Ethan Zhao: amend commit log ]
Signed-off-by: Ethan Zhao
---
 include/uapi/linux/pci_regs.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 725e671fe7f2..7f7243f1396e 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1067,7 +1067,11 @@
 
 /* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */
 #define PCI_DVSEC_HEADER1		0x4 /* Designated Vendor-Specific Header1 */
+#define PCI_DVSEC_HEADER1_VID(x)	((x) & 0xffff)
+#define PCI_DVSEC_HEADER1_REV(x)	(((x) >> 16) & 0xf)
+#define PCI_DVSEC_HEADER1_LEN(x)	(((x) >> 20) & 0xfff)
 #define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
+#define PCI_DVSEC_HEADER2_ID(x)		((x) & 0xffff)
 
 /* Data Link Feature */
 #define PCI_DLF_CAP		0x04	/* Capabilities Register */
--
Gitee

From 33b7f4d0cd382ad99f9481b026ccb1a079e9a290 Mon Sep 17 00:00:00 2001
From: "K V P, Satyanarayana"
Date: Fri, 17 Mar 2023 08:22:22 +0000
Subject: [PATCH 6/6] vfio/pci: Add DVSEC PCI Extended Config Capability to
 user visible list.

commit 6467d0740a2b2a1fc18b5d9dbc86a9705dbc2cf9 upstream.

The Designated Vendor-Specific Extended Capability (DVSEC Capability)
is an optional Extended Capability that is permitted to be implemented
by any PCI Express Function. This allows PCI Express component vendors
to use the Extended Capability mechanism to expose vendor-specific
registers that can be present in components by a variety of vendors.
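[As an editorial aside, a minimal sketch of how the header macros added
in PATCH 5/6 above are meant to be used. pci_find_ext_capability(),
PCI_EXT_CAP_ID_DVSEC, and the PCI_DVSEC_HEADER* definitions are real
kernel interfaces; the function name and the printed layout are
illustrative assumptions, not code from this series:

	#include <linux/pci.h>

	/* Dump the first DVSEC instance of a device (illustrative only). */
	static int dvsec_dump_first(struct pci_dev *pdev)
	{
		u32 hdr1, hdr2;
		u16 pos;

		/* Walk extended config space for the first DVSEC capability. */
		pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DVSEC);
		if (!pos)
			return -ENODEV;

		/* Header 1 packs vendor ID, revision, and length into one dword. */
		pci_read_config_dword(pdev, pos + PCI_DVSEC_HEADER1, &hdr1);
		/* Header 2 carries the vendor-assigned DVSEC ID. */
		pci_read_config_dword(pdev, pos + PCI_DVSEC_HEADER2, &hdr2);

		pci_info(pdev, "DVSEC vid %04x rev %u len %u id %04x\n",
			 PCI_DVSEC_HEADER1_VID(hdr1), PCI_DVSEC_HEADER1_REV(hdr1),
			 PCI_DVSEC_HEADER1_LEN(hdr1), PCI_DVSEC_HEADER2_ID(hdr2));
		return 0;
	}
]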
A DVSEC Capability structure can tell vendor-specific software which
features a particular component supports.

An example usage of DVSEC is Intel Platform Monitoring Technology (PMT)
for enumerating and accessing hardware monitoring capabilities on a
device. PMT encompasses three device monitoring features: Telemetry
(device metrics), Watcher (sampling/tracing), and Crashlog. The DVSEC
is used to discover these features and provide a BAR offset to their
registers with the Intel vendor code.

The current VFIO driver does not pass DVSEC capabilities to the Virtual
Machine (VM), which makes PMT not work inside the virtual machine. This
series adds the DVSEC capability to the user visible list to allow its
use with VFIO. VFIO supports passing the Vendor Specific Extended
Capability (VSEC) and raw write access to the device. DVSEC is also
passed to the VM in the same way as VSEC.

Intel-SIG: commit 6467d0740a2b vfio/pci: Add DVSEC PCI Extended Config
Capability to user visible list
Backport for SPR/EMR support.

Signed-off-by: K V P Satyanarayana
Link: https://lore.kernel.org/r/20230317082222.3355912-1-satyanarayana.k.v.p@intel.com
Signed-off-by: Alex Williamson
(cherry picked from commit 6467d0740a2b2a1fc18b5d9dbc86a9705dbc2cf9)
[ Ethan Zhao: amend commit log ]
Signed-off-by: Ethan Zhao
---
 drivers/vfio/pci/vfio_pci_config.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 6751606c0de6..91a2a640d48a 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -96,6 +96,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = {
 	[PCI_EXT_CAP_ID_SECPCI]	=	0,	/* not yet */
 	[PCI_EXT_CAP_ID_PMUX]	=	0,	/* not yet */
 	[PCI_EXT_CAP_ID_PASID]	=	PCI_EXT_CAP_PASID_SIZEOF,
+	[PCI_EXT_CAP_ID_DVSEC]	=	0xFF,
 };
 
 /*
@@ -1065,6 +1066,7 @@ int __init vfio_pci_init_perm_bits(void)
 	ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
 	ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
 	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
+	ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_raw_config_write;
 
 	if (ret)
 		vfio_pci_uninit_perm_bits();
@@ -1404,6 +1406,11 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo
 			return PCI_TPH_BASE_SIZEOF + (sts * 2) + 2;
 		}
 		return PCI_TPH_BASE_SIZEOF;
+	case PCI_EXT_CAP_ID_DVSEC:
+		ret = pci_read_config_dword(pdev, epos + PCI_DVSEC_HEADER1, &dword);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+		return PCI_DVSEC_HEADER1_LEN(dword);
 	default:
 		pci_warn(pdev, "%s: unknown length for PCI ecap %#x@%#x\n",
 			 __func__, ecap, epos);
--
Gitee
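[For completeness, a small stand-alone sketch of the device-ID
arithmetic behind PATCH 1/6 and PATCH 2/6; this program is illustrative
only and not part of the series. The four affected QAT IDs 0x4940-0x4943
differ only in bits [1:0], so (devid & 0xfffc) always yields 0x4940;
comparing against the original 0x494c constant could never match, which
is the bug PATCH 2/6 fixes:

	#include <stdio.h>

	int main(void)
	{
		unsigned int devid;

		/* The four affected QAT device IDs differ only in bits [1:0]. */
		for (devid = 0x4940; devid <= 0x4943; devid++)
			printf("0x%04x & 0xfffc = 0x%04x  matches 0x4940: %s  matches 0x494c: %s\n",
			       devid, devid & 0xfffc,
			       (devid & 0xfffc) == 0x4940 ? "yes" : "no",
			       (devid & 0xfffc) == 0x494c ? "yes" : "no");
		return 0;
	}
]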