From e3de20aac07d94e68465b80fe1c4c90a1dc087f0 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Sun, 27 Dec 2020 12:11:16 +0200
Subject: [PATCH 01/63] intel_idle: add SnowRidge C-state table

mainline inclusion
from mainline-v5.11-rc2
commit 9cf93f056f783f986c19f40d5304d1bcffa0fc0d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 9cf93f056f78 intel_idle: add SnowRidge C-state table.
Backport intel_idle driver

--------------------------------

Add C-state table for the SnowRidge SoC which is found on Intel Jacobsville
platforms.

The following has been changed.

 1. C1E latency changed from 10us to 15us. It was measured using the
    open source "wult" tool (the "nic" method, 15us is the 99.99th
    percentile).

 2. C1E power break even changed from 20us to 25us, which may result
    in less C1E residency in some workloads.

 3. C6 latency changed from 50us to 130us. Measured the same way as C1E.

The C6 C-state is supported only by some SnowRidge revisions, so add a C-state
table commentary about this.

On SnowRidge, C6 support is enumerated via the usual mechanism: "mwait" leaf of
the "cpuid" instruction. The 'intel_idle' driver does check this leaf, so even
though C6 is present in the table, the driver will only use it if the CPU does
support it.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 41 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index d79335506ecd..28f93b9aa51b 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -963,6 +963,39 @@ static struct cpuidle_state dnv_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+/*
+ * Note, depending on HW and FW revision, SnowRidge SoC may or may not support
+ * C6, and this is indicated in the CPUID mwait leaf.
+ */
+static struct cpuidle_state snr_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 2,
+		.target_residency = 2,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C1E",
+		.desc = "MWAIT 0x01",
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
+		.exit_latency = 15,
+		.target_residency = 25,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C6",
+		.desc = "MWAIT 0x20",
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 130,
+		.target_residency = 500,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.enter = NULL }
+};
+
 static const struct idle_cpu idle_cpu_nehalem __initconst = {
 	.state_table = nehalem_cstates,
 	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@@ -1084,6 +1117,12 @@ static const struct idle_cpu idle_cpu_dnv __initconst = {
 	.use_acpi = true,
 };
 
+static const struct idle_cpu idle_cpu_snr __initconst = {
+	.state_table = snr_cstates,
+	.disable_promotion_to_c1e = true,
+	.use_acpi = true,
+};
+
 static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP,		&idle_cpu_nhx),
 	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM,		&idle_cpu_nehalem),
@@ -1122,7 +1161,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&idle_cpu_bxt),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&idle_cpu_dnv),
-	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&idle_cpu_dnv),
+	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&idle_cpu_snr),
 	{}
 };
 
-- 
Gitee


From 0f8062fbabc16c3a98c001e0f29dd971f559c1b8 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 15 Jan 2021 15:56:46 -0800
Subject: [PATCH 02/63] intel_idle: remove definition of DEBUG

mainline inclusion
from mainline-v5.12-rc2
commit 651bc5816c39e57833fea4478c8ecfb72ad47e44
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 651bc5816c39 intel_idle: remove definition
of DEBUG.
Backport intel_idle driver

--------------------------------

Defining DEBUG should only be done in development.
So remove DEBUG.

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 28f93b9aa51b..3273360f30f7 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -37,7 +37,7 @@
  */
 
 /* un-comment DEBUG to enable pr_debug() statements */
-#define DEBUG
+/* #define DEBUG */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-- 
Gitee


From cf27c6dd77b5b2b94d077ed874c313ea42b5019f Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Mon, 8 Mar 2021 16:31:46 +0200
Subject: [PATCH 03/63] intel_idle: update ICX C6 data

mainline inclusion
from mainline-v5.13-rc1
commit d484b8bfc6fa71a088e4ac85d9ce11aa0385867e
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit d484b8bfc6fa intel_idle: update ICX C6 data.
Backport intel_idle driver

--------------------------------

Change IceLake Xeon C6 latency from 128 us to 170 us. The latency
was measured with the "wult" tool and corresponds to the 99.99th
percentile when measuring with the "nic" method. Note, the 128 us
figure correspond to the median latency, but in intel_idle we use
the "worst case" latency figure instead.

C6 target residency was increased from 384 us to 600 us, which may
result in less C6 residency in some workloads. This value was tested
and compared to values 384, and 1000. Value 600 is a reasonable
tradeoff between power and performance.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Acked-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3273360f30f7..6cac0b748efa 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -744,8 +744,8 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.name = "C6",
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
-		.exit_latency = 128,
-		.target_residency = 384,
+		.exit_latency = 170,
+		.target_residency = 600,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
-- 
Gitee


From 6dd726125f90f12f331ca13c9a7ebd6ec00cd27f Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 7 Apr 2021 09:10:28 +0300
Subject: [PATCH 04/63] intel_idle: add Iclelake-D support

mainline inclusion
from mainline-v5.13-rc1
commit 22141d5f411895bb1b0df2a6b05f702e11e63918
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 22141d5f4118 intel_idle: add Iclelake-D support.
Backport intel_idle driver

--------------------------------

This patch adds Icelake Xeon D support to the intel_idle driver.

Since Icelake D and Icelake SP C-state characteristics the same,
we use Icelake SP C-states table for Icelake D as well.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Acked-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 6cac0b748efa..ec1b9d306ba6 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1156,6 +1156,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&idle_cpu_skl),
 	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&idle_cpu_skx),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&idle_cpu_icx),
+	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&idle_cpu_icx),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
-- 
Gitee


From 3a5ec17e30b976073001ae9b76968db5ea8e3a51 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Fri, 28 May 2021 11:20:54 +0800
Subject: [PATCH 05/63] intel_idle: Adjust the SKX C6 parameters if PC6 is
 disabled

mainline inclusion
from mainline-v5.14-rc1
commit 64233338499126c5c31e07165735ab5441c7e45a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 642333384991 intel_idle: Adjust the SKX C6 parameters
if PC6 is disabled.
Backport intel_idle driver

--------------------------------

Because cpuidle assumes worst-case C-state parameters, PC6 parameters
are used for describing C6, which is worst-case for requesting CC6.
When PC6 is enabled, this is appropriate. But if PC6 is disabled
in the BIOS, the exit latency and target residency should be adjusted
accordingly.

Exit latency:
Previously the C6 exit latency was measured as the PC6 exit latency.
With PC6 disabled, the C6 exit latency should be the one of CC6.

Target residency:
With PC6 disabled, the idle duration within [CC6, PC6) would make the
idle governor choose C1E over C6. This would cause low energy-efficiency.
We should lower the bar to request C6 when PC6 is disabled.

To fill this gap, check if PC6 is disabled in the BIOS in the
MSR_PKG_CST_CONFIG_CONTROL(0xe2) register. If so, use the CC6 exit latency
for C6 and set target_residency to 3 times of the new exit latency. [This
is consistent with how intel_idle driver uses _CST to calculate the
target_residency.] As a result, the OS would be more likely to choose C6
over C1E when PC6 is disabled, which is reasonable, because if C6 is
enabled, it implies that the user cares about energy, so choosing C6 more
frequently makes sense.

The new CC6 exit latency of 92us was measured with wult[1] on SKX via NIC
wakeup as the 99.99th percentile. Also CLX and CPX both have the same CPU
model number as SkX, but their CC6 exit latencies are similar to the SKX
one, 96us and 89us respectively, so reuse the SKX value for them.

There is a concern that it might be better to use a more generic approach
instead of optimizing every platform. However, if the required code
complexity and different PC6 bit interpretation on different platforms
are taken into account, tuning the code per platform seems to be an
acceptable tradeoff.

Link: https://intel.github.io/wult/ # [1]
Suggested-by: Len Brown <len.brown@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index ec1b9d306ba6..e6c543b5ee1d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1484,6 +1484,36 @@ static void __init sklh_idle_state_table_update(void)
 	skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE;	/* C9-SKL */
 }
 
+/**
+ * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake
+ * idle states table.
+ */
+static void __init skx_idle_state_table_update(void)
+{
+	unsigned long long msr;
+
+	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
+
+	/*
+	 * 000b: C0/C1 (no package C-state support)
+	 * 001b: C2
+	 * 010b: C6 (non-retention)
+	 * 011b: C6 (retention)
+	 * 111b: No Package C state limits.
+	 */
+	if ((msr & 0x7) < 2) {
+		/*
+		 * Uses the CC6 + PC0 latency and 3 times of
+		 * latency for target_residency if the PC6
+		 * is disabled in BIOS. This is consistent
+		 * with how intel_idle driver uses _CST
+		 * to set the target_residency.
+		 */
+		skx_cstates[2].exit_latency = 92;
+		skx_cstates[2].target_residency = 276;
+	}
+}
+
 static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 {
 	unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
@@ -1515,6 +1545,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 	case INTEL_FAM6_SKYLAKE:
 		sklh_idle_state_table_update();
 		break;
+	case INTEL_FAM6_SKYLAKE_X:
+		skx_idle_state_table_update();
+		break;
 	}
 
 	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
-- 
Gitee


From 839940d9189b2a54cd761b666f2c0a9fbde2c3a5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Fri, 17 Sep 2021 10:20:22 +0300
Subject: [PATCH 06/63] intel_idle: enable interrupts before C1 on Xeons

mainline inclusion
from mainline-v5.16-rc1
commit c227233ad64c77e57db738ab0e46439db71822a3
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit c227233ad64c intel_idle: enable interrupts before C1 on Xeons.
Backport intel_idle driver

--------------------------------

Enable local interrupts before requesting C1 on the last two generations
of Intel Xeon platforms: Sky Lake, Cascade Lake, Cooper Lake, Ice Lake.
This decreases average C1 interrupt latency by about 5-10%, as measured
with the 'wult' tool.

The '->enter()' function of the driver enters C-states with local
interrupts disabled by executing the 'monitor' and 'mwait' pair of
instructions. If an interrupt happens, the CPU exits the C-state and
continues executing instructions after 'mwait'. It does not jump to
the interrupt handler, because local interrupts are disabled. The
cpuidle subsystem enables interrupts a bit later, after doing some
housekeeping.

With this patch, we enable local interrupts before requesting C1. In
this case, if the CPU wakes up because of an interrupt, it will jump
to the interrupt handler right away. The cpuidle housekeeping will be
done after the pending interrupt(s) are handled.

Enabling interrupts before entering a C-state has measurable impact
for faster C-states, like C1. Deeper, but slower C-states like C6 do
not really benefit from this sort of change, because their latency is
a lot higher comparing to the delay added by cpuidle housekeeping.

This change was also tested with cyclictest and dbench. In case of Ice
Lake, the average cyclictest latency decreased by 5.1%, and the average
'dbench' throughput increased by about 0.8%. Both tests were run for 4
hours with only C1 enabled (all other idle states, including 'POLL',
were disabled). CPU frequency was pinned to HFM, and uncore frequency
was pinned to the maximum value. The other platforms had similar
single-digit percentage improvements.

It is worth noting that this patch affects 'cpuidle' statistics a tiny
bit.  Before this patch, C1 residency did not include the interrupt
handling time, but with this patch, it will include it. This is similar
to what happens in case of the 'POLL' state, which also runs with
interrupts enabled.

Suggested-by: Len Brown <len.brown@intel.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index e6c543b5ee1d..0b66e25c0e2d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -88,6 +88,12 @@ static struct cpuidle_state *cpuidle_state_table __initdata;
 
 static unsigned int mwait_substates __initdata;
 
+/*
+ * Enable interrupts before entering the C-state. On some platforms and for
+ * some C-states, this may measurably decrease interrupt latency.
+ */
+#define CPUIDLE_FLAG_IRQ_ENABLE		BIT(14)
+
 /*
  * Enable this state by default even if the ACPI _CST does not list it.
  */
@@ -127,6 +133,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
 	unsigned long eax = flg2MWAIT(state->flags);
 	unsigned long ecx = 1; /* break on interrupt flag */
 
+	if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE)
+		local_irq_enable();
+
 	mwait_idle_with_hints(eax, ecx);
 
 	return index;
@@ -698,7 +707,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
 	{
 		.name = "C1",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00),
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
 		.exit_latency = 2,
 		.target_residency = 2,
 		.enter = &intel_idle,
@@ -727,7 +736,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 	{
 		.name = "C1",
 		.desc = "MWAIT 0x00",
-		.flags = MWAIT2flg(0x00),
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
 		.exit_latency = 1,
 		.target_residency = 1,
 		.enter = &intel_idle,
-- 
Gitee


From 23be0951971714eda30d4a62f53e72e70fef6bed Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 2 Mar 2022 10:15:58 +0200
Subject: [PATCH 07/63] intel_idle: add SPR support

mainline inclusion
from mainline-v5.18-rc1
commit 9edf3c0ffef0ec1bed8300315852b5c6a0997130
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 9edf3c0ffef0 intel_idle: add SPR support.
Backport intel_idle driver

--------------------------------

Add Sapphire Rapids Xeon support.

Up until very recently, the C1 and C1E C-states were independent, but this
has changed in some new chips, including Sapphire Rapids Xeon (SPR). In these
chips the C1 and C1E states cannot be enabled at the same time. The "C1E
promotion" bit in 'MSR_IA32_POWER_CTL' also has its semantics changed a bit.

Here are the C1, C1E, and "C1E promotion" bit rules on Xeons before SPR.

1. If C1E promotion bit is disabled.
   a. C1  requests end up with C1  C-state.
   b. C1E requests end up with C1E C-state.
2. If C1E promotion bit is enabled.
   a. C1  requests end up with C1E C-state.
   b. C1E requests end up with C1E C-state.

Here are the C1, C1E, and "C1E promotion" bit rules on Sapphire Rapids Xeon.
1. If C1E promotion bit is disabled.
   a. C1  requests end up with C1 C-state.
   b. C1E requests end up with C1 C-state.
2. If C1E promotion bit is enabled.
   a. C1  requests end up with C1E C-state.
   b. C1E requests end up with C1E C-state.

Before SPR Xeon, the 'intel_idle' driver was disabling C1E promotion and was
exposing C1 and C1E as independent C-states. But on SPR, C1 and C1E cannot be
enabled at the same time.

This patch adds both C1 and C1E states. However, C1E is marked as with the
"CPUIDLE_FLAG_UNUSABLE" flag, which means that in won't be registered by
default. The C1E promotion bit will be cleared, which means that by default
only C1 and C6 will be registered on SPR.

The next patch will add an option for enabling C1E and disabling C1 on SPR.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 47 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 0b66e25c0e2d..1c7c25909e54 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -761,6 +761,46 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+/*
+ * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
+ * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
+ * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1
+ * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
+ * both C1 and C1E requests end up with C1, so there is effectively no C1E.
+ *
+ * By default we enable C1 and disable C1E by marking it with
+ * 'CPUIDLE_FLAG_UNUSABLE'.
+ */
+static struct cpuidle_state spr_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 1,
+		.target_residency = 1,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C1E",
+		.desc = "MWAIT 0x01",
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \
+					   CPUIDLE_FLAG_UNUSABLE,
+		.exit_latency = 2,
+		.target_residency = 4,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C6",
+		.desc = "MWAIT 0x20",
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 290,
+		.target_residency = 800,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.enter = NULL }
+};
+
 static struct cpuidle_state atom_cstates[] __initdata = {
 	{
 		.name = "C1E",
@@ -1104,6 +1144,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = {
 	.use_acpi = true,
 };
 
+static const struct idle_cpu idle_cpu_spr __initconst = {
+	.state_table = spr_cstates,
+	.disable_promotion_to_c1e = true,
+	.use_acpi = true,
+};
+
 static const struct idle_cpu idle_cpu_avn __initconst = {
 	.state_table = avn_cstates,
 	.disable_promotion_to_c1e = true,
@@ -1166,6 +1212,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&idle_cpu_skx),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&idle_cpu_icx),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&idle_cpu_icx),
+	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&idle_cpu_spr),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
-- 
Gitee


From 8319c2b0f1f31915a17acd5b678bd1d782c9c5e7 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 2 Mar 2022 10:15:59 +0200
Subject: [PATCH 08/63] intel_idle: add 'preferred_cstates' module argument

mainline inclusion
from mainline-v5.18-rc1
commit da0e58c038e60e7e65d30813ebdfe91687aa8a24
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit da0e58c038e6 intel_idle: add 'preferred_cstates' module argument.
Backport intel_idle driver

--------------------------------

On Sapphire Rapids Xeon (SPR) the C1 and C1E states are basically mutually
exclusive - only one of them can be enabled. By default, 'intel_idle' driver
enables C1 and disables C1E. However, some users prefer to use C1E instead of
C1, because it saves more energy.

This patch adds a new module parameter ('preferred_cstates') for enabling C1E
and disabling C1. Here is the idea behind it.

1. This option has effect only for "mutually exclusive" C-states like C1 and
   C1E on SPR.
2. It does not have any effect on independent C-states, which do not require
   other C-states to be disabled (most states on most platforms as of today).
3. For mutually exclusive C-states, the 'intel_idle' driver always has a
   reasonable default, such as enabling C1 on SPR by default. On other
   platforms, the default may be different.
4. Users can override the default using the 'preferred_cstates' parameter.
5. The parameter accepts the preferred C-states bit-mask, similarly to the
   existing 'states_off' parameter.
6. This parameter is not limited to C1/C1E, and leaves room for supporting
   other mutually exclusive C-states, if they come in the future.

Today 'intel_idle' can only be compiled-in, which means that on SPR, in order
to disable C1 and enable C1E, users should boot with the following kernel
argument: intel_idle.preferred_cstates=4

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 46 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 1c7c25909e54..b2688c326522 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -64,6 +64,7 @@ static struct cpuidle_driver intel_idle_driver = {
 /* intel_idle.max_cstate=0 disables driver */
 static int max_cstate = CPUIDLE_STATE_MAX - 1;
 static unsigned int disabled_states_mask;
+static unsigned int preferred_states_mask;
 
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 
@@ -1400,6 +1401,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
 static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
 #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
 
+static void c1e_promotion_enable(void);
+
 /**
  * ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
  *
@@ -1570,6 +1573,26 @@ static void __init skx_idle_state_table_update(void)
 	}
 }
 
+/**
+ * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
+ */
+static void __init spr_idle_state_table_update(void)
+{
+	/* Check if user prefers C1E over C1. */
+	if (preferred_states_mask & BIT(2)) {
+		if (preferred_states_mask & BIT(1))
+			/* Both can't be enabled, stick to the defaults. */
+			return;
+
+		spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
+		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
+
+		/* Enable C1E using the "C1E promotion" bit. */
+		c1e_promotion_enable();
+		disable_promotion_to_c1e = false;
+	}
+}
+
 static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
 {
 	unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
@@ -1604,6 +1627,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 	case INTEL_FAM6_SKYLAKE_X:
 		skx_idle_state_table_update();
 		break;
+	case INTEL_FAM6_SAPPHIRERAPIDS_X:
+		spr_idle_state_table_update();
+		break;
 	}
 
 	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
@@ -1676,6 +1702,15 @@ static void auto_demotion_disable(void)
 	wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
 }
 
+static void c1e_promotion_enable(void)
+{
+	unsigned long long msr_bits;
+
+	rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
+	msr_bits |= 0x2;
+	wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
+}
+
 static void c1e_promotion_disable(void)
 {
 	unsigned long long msr_bits;
@@ -1845,3 +1880,14 @@ module_param(max_cstate, int, 0444);
  */
 module_param_named(states_off, disabled_states_mask, uint, 0444);
 MODULE_PARM_DESC(states_off, "Mask of disabled idle states");
+/*
+ * Some platforms come with mutually exclusive C-states, so that if one is
+ * enabled, the other C-states must not be used. Example: C1 and C1E on
+ * Sapphire Rapids platform. This parameter allows for selecting the
+ * preferred C-states among the groups of mutually exclusive C-states - the
+ * selected C-states will be registered, the other C-states from the mutually
+ * exclusive group won't be registered. If the platform has no mutually
+ * exclusive C-states, this parameter has no effect.
+ */
+module_param_named(preferred_cstates, preferred_states_mask, uint, 0444);
+MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");
-- 
Gitee


From dc85adcb572c8efdd2b85ae2ade6346dcadef52a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 2 Mar 2022 10:16:00 +0200
Subject: [PATCH 09/63] intel_idle: add core C6 optimization for SPR

mainline inclusion
from mainline-v5.18-rc1
commit 3a9cf77b60dc9839b6674943bb7c9dcd524b6294
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 3a9cf77b60dc intel_idle: add core C6 optimization for SPR.
Backport intel_idle driver

--------------------------------

Add a Sapphire Rapids Xeon C6 optimization, similar to what we have for Sky Lake
Xeon: if package C6 is disabled, adjust C6 exit latency and target residency to
match core C6 values, instead of using the default package C6 values.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index b2688c326522..e385ddf15b32 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1578,6 +1578,8 @@ static void __init skx_idle_state_table_update(void)
  */
 static void __init spr_idle_state_table_update(void)
 {
+	unsigned long long msr;
+
 	/* Check if user prefers C1E over C1. */
 	if (preferred_states_mask & BIT(2)) {
 		if (preferred_states_mask & BIT(1))
@@ -1591,6 +1593,19 @@ static void __init spr_idle_state_table_update(void)
 		c1e_promotion_enable();
 		disable_promotion_to_c1e = false;
 	}
+
+	/*
+	 * By default, the C6 state assumes the worst-case scenario of package
+	 * C6. However, if PC6 is disabled, we update the numbers to match
+	 * core C6.
+	 */
+	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
+
+	/* Limit value 2 and above allow for PC6. */
+	if ((msr & 0x7) < 2) {
+		spr_cstates[2].exit_latency = 190;
+		spr_cstates[2].target_residency = 600;
+	}
 }
 
 static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
-- 
Gitee


From 54ba645bed8c8aa9722fc9c03a35c51789eed326 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 15 Mar 2022 20:35:47 +0100
Subject: [PATCH 10/63] cpuidle: intel_idle: Update intel_idle() kerneldoc
 comment

mainline inclusion
from mainline-v5.18-rc1
commit a335b1e6bb29300d3bc6749763a4298627e594ba
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit a335b1e6bb29 cpuidle: intel_idle: Update intel_idle()
kerneldoc comment.
Backport intel_idle driver

--------------------------------

Commit bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic")
moved the leave_mm() call away from intel_idle(), but it didn't update
its kerneldoc comment accordingly, so do that now.

Fixes: bf9282dc26e7 ("cpuidle: Make CPUIDLE_FLAG_TLB_FLUSHED generic")

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index e385ddf15b32..4ba4ab974dbe 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -122,9 +122,6 @@ static unsigned int mwait_substates __initdata;
  * If the local APIC timer is not known to be reliable in the target idle state,
  * enable one-shot tick broadcasting for the target CPU before executing MWAIT.
  *
- * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to
- * flushing user TLBs.
- *
  * Must be called under local_irq_disable().
  */
 static __cpuidle int intel_idle(struct cpuidle_device *dev,
-- 
Gitee


From 5c3af46f79a48b2eb7e5faff1f8dcdd2a2e44a78 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 15 Mar 2022 20:36:42 +0100
Subject: [PATCH 11/63] cpuidle: intel_idle: Drop redundant backslash at line
 end

mainline inclusion
from mainline-v5.18-rc1
commit 03eb65224e5711e7a2f34b500d44866b322a249a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 03eb65224e57 cpuidle: intel_idle: Drop redundant
backslash at line end.
Backport intel_idle driver

--------------------------------

Drop a redundant backslash character at the end of a line in the
spr_cstates[] definition.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 4ba4ab974dbe..b7640cfe0020 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -781,7 +781,7 @@ static struct cpuidle_state spr_cstates[] __initdata = {
 	{
 		.name = "C1E",
 		.desc = "MWAIT 0x01",
-		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | \
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE |
 					   CPUIDLE_FLAG_UNUSABLE,
 		.exit_latency = 2,
 		.target_residency = 4,
-- 
Gitee


From eed9fe8489aafbc28e77b4fd36b095801ed17c76 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 27 Apr 2022 09:08:52 +0300
Subject: [PATCH 12/63] intel_idle: Fix the 'preferred_cstates' module
 parameter

mainline inclusion
from mainline-v5.18-rc5
commit 39c184a6a9a7a99950b321d55fe713175cf1d404
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 39c184a6a9a7 intel_idle: Fix the 'preferred_cstates' module
parameter.
Backport for intel_idle driver

--------------------------------

Problem description.

When user boots kernel up with the 'intel_idle.preferred_cstates=4' option,
we enable C1E and disable C1 states on Sapphire Rapids Xeon (SPR). In order
for C1E to work on SPR, we have to enable the C1E promotion bit on all
CPUs.  However, we enable it only on one CPU.

Fix description.

The 'intel_idle' driver already has the infrastructure for disabling C1E
promotion on every CPU. This patch uses the same infrastructure for
enabling C1E promotion on every CPU. It changes the boolean
'disable_promotion_to_c1e' variable to a tri-state 'c1e_promotion'
variable.

Tested on a 2-socket SPR system. I verified the following combinations:

 * C1E promotion enabled and disabled in BIOS.
 * Booted with and without the 'intel_idle.preferred_cstates=4' kernel
   argument.

In all 4 cases C1E promotion was correctly set on all CPUs.

Also tested on an old Broadwell system, just to make sure it does not cause
a regression. C1E promotion was correctly disabled on that system, both C1
and C1E were exposed (as expected).

Fixes: da0e58c038e6 ("intel_idle: add 'preferred_cstates' module argument")
Reported-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
[ rjw: Minor changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index b7640cfe0020..cf5ed4c1d02c 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -69,7 +69,12 @@ static unsigned int preferred_states_mask;
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 
 static unsigned long auto_demotion_disable_flags;
-static bool disable_promotion_to_c1e;
+
+static enum {
+	C1E_PROMOTION_PRESERVE,
+	C1E_PROMOTION_ENABLE,
+	C1E_PROMOTION_DISABLE
+} c1e_promotion = C1E_PROMOTION_PRESERVE;
 
 struct idle_cpu {
 	struct cpuidle_state *state_table;
@@ -1398,8 +1403,6 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
 static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
 #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
 
-static void c1e_promotion_enable(void);
-
 /**
  * ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
  *
@@ -1587,8 +1590,7 @@ static void __init spr_idle_state_table_update(void)
 		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
 
 		/* Enable C1E using the "C1E promotion" bit. */
-		c1e_promotion_enable();
-		disable_promotion_to_c1e = false;
+		c1e_promotion = C1E_PROMOTION_ENABLE;
 	}
 
 	/*
@@ -1754,7 +1756,9 @@ static int intel_idle_cpu_init(unsigned int cpu)
 	if (auto_demotion_disable_flags)
 		auto_demotion_disable();
 
-	if (disable_promotion_to_c1e)
+	if (c1e_promotion == C1E_PROMOTION_ENABLE)
+		c1e_promotion_enable();
+	else if (c1e_promotion == C1E_PROMOTION_DISABLE)
 		c1e_promotion_disable();
 
 	return 0;
@@ -1833,7 +1837,8 @@ static int __init intel_idle_init(void)
 	if (icpu) {
 		cpuidle_state_table = icpu->state_table;
 		auto_demotion_disable_flags = icpu->auto_demotion_disable_flags;
-		disable_promotion_to_c1e = icpu->disable_promotion_to_c1e;
+		if (icpu->disable_promotion_to_c1e)
+			c1e_promotion = C1E_PROMOTION_DISABLE;
 		if (icpu->use_acpi || force_use_acpi)
 			intel_idle_acpi_cst_extract();
 	} else if (!intel_idle_acpi_cst_extract()) {
-- 
Gitee


From 2b0581cc4a97a912816e3d52990ee43f263716eb Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 27 Apr 2022 09:08:53 +0300
Subject: [PATCH 13/63] intel_idle: Fix SPR C6 optimization

mainline inclusion
from mainline-v5.18-rc5
commit 7eac3bd38d18cd3317756649921b8264ddfee692
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5BECY
CVE: NA

Intel-SIG: commit 7eac3bd38d18 intel_idle: Fix SPR C6 optimization.
Backport for intel_idle driver

--------------------------------

The Sapphire Rapids (SPR) C6 optimization was added to the end of the
'spr_idle_state_table_update()' function. However, the function has a
'return' which may happen before the optimization has a chance to run.
And this may prevent the optimization from happening.

This is an unlikely scenario, but possible if user boots with, say,
the 'intel_idle.preferred_cstates=6' kernel boot option.

This patch fixes the issue by eliminating the problematic 'return'
statement.

Fixes: 3a9cf77b60dc ("intel_idle: add core C6 optimization for SPR")
Suggested-by: Jan Beulich <jbeulich@suse.com>
Reported-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
[ rjw: Minor changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/idle/intel_idle.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index cf5ed4c1d02c..47551ab73ca8 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1581,11 +1581,9 @@ static void __init spr_idle_state_table_update(void)
 	unsigned long long msr;
 
 	/* Check if user prefers C1E over C1. */
-	if (preferred_states_mask & BIT(2)) {
-		if (preferred_states_mask & BIT(1))
-			/* Both can't be enabled, stick to the defaults. */
-			return;
-
+	if ((preferred_states_mask & BIT(2)) &&
+	    !(preferred_states_mask & BIT(1))) {
+		/* Disable C1 and enable C1E. */
 		spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
 		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
 
-- 
Gitee


From 879d4bbc4747e1f79dae81dd031562482e4f3d1c Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 28 Oct 2020 18:44:45 -0700
Subject: [PATCH 14/63] PCI: Add defines for Designated Vendor-Specific
 Extended Capability

mainline inclusion
from mainline-v5.11-rc1
commit 1dc2da5cd51f648de6d1df87e2bc6ea13f72f19c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 1dc2da5cd51f PCI: Add defines for Designated Vendor-Specific
Extended Capability.
Backport for intel PMT (Platform Monitoring Technology) support
--------------------------------

Add PCIe Designated Vendor-Specific Extended Capability (DVSEC) and defines
for the header offsets. Defined in PCIe r5.0, sec 7.9.6.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 include/uapi/linux/pci_regs.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 7e0d526dd96f..9e257fe9efa5 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -729,6 +729,7 @@
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
 #define PCI_EXT_CAP_ID_L1SS	0x1E	/* L1 PM Substates */
 #define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
+#define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
 #define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
@@ -1079,6 +1080,10 @@
 #define  PCI_L1SS_CTL1_LTR_L12_TH_SCALE	0xe0000000  /* LTR_L1.2_THRESHOLD_Scale */
 #define PCI_L1SS_CTL2		0x0c	/* Control 2 Register */
 
+/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */
+#define PCI_DVSEC_HEADER1		0x4 /* Designated Vendor-Specific Header1 */
+#define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
+
 /* Data Link Feature */
 #define PCI_DLF_CAP		0x04	/* Capabilities Register */
 #define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */
-- 
Gitee


From ddce0a7181d1b6da69ec1d0452598c82d8a4c183 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:33 -0700
Subject: [PATCH 15/63] mfd: Intel Platform Monitoring Technology support

mainline inclusion
from mainline-v5.11-rc1
commit 4f8217d5b0ca8ace78a27dc371b87697eedc421d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 4f8217d5b0ca mfd: Intel Platform Monitoring Technology
support.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Intel Platform Monitoring Technology (PMT) is an architecture for
enumerating and accessing hardware monitoring facilities. PMT supports
multiple types of monitoring capabilities. This driver creates platform
devices for each type so that they may be managed by capability specific
drivers (to be introduced). Capabilities are discovered using PCIe DVSEC
ids. Support is included for the 3 current capability types, Telemetry,
Watcher, and Crashlog. The features are available on new Intel platforms
starting from Tiger Lake for which support is added. This patch adds
support for Tiger Lake (TGL), Alder Lake (ADL), and Out-of-Band Management
Services Module (OOBMSM).

Also add a quirk mechanism for several early hardware differences and bugs.
For Tiger Lake and Alder Lake, do not support Watcher and Crashlog
capabilities since they will not be compatible with future product. Also,
fix use a quirk to fix the discovery table offset.

Co-developed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 MAINTAINERS             |   5 +
 drivers/mfd/Kconfig     |  10 ++
 drivers/mfd/Makefile    |   1 +
 drivers/mfd/intel_pmt.c | 223 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 239 insertions(+)
 create mode 100644 drivers/mfd/intel_pmt.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 23a23bd94c00..cdffd5192e09 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9038,6 +9038,11 @@ F:	drivers/mfd/intel_soc_pmic*
 F:	include/linux/mfd/intel_msic.h
 F:	include/linux/mfd/intel_soc_pmic*
 
+INTEL PMT DRIVER
+M:	"David E. Box" <david.e.box@linux.intel.com>
+S:	Maintained
+F:	drivers/mfd/intel_pmt.c
+
 INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
 M:	Stanislav Yakovlev <stas.yakovlev@gmail.com>
 L:	linux-wireless@vger.kernel.org
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 15680c3c9279..3f9f84f9f288 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -699,6 +699,16 @@ config MFD_INTEL_PMC_BXT
 	  Register and P-unit access. In addition this creates devices
 	  for iTCO watchdog and telemetry that are part of the PMC.
 
+config MFD_INTEL_PMT
+	tristate "Intel Platform Monitoring Technology (PMT) support"
+	depends on PCI
+	select MFD_CORE
+	help
+	  The Intel Platform Monitoring Technology (PMT) is an interface that
+	  provides access to hardware monitor registers. This driver supports
+	  Telemetry, Watcher, and Crashlog PMT capabilities/devices for
+	  platforms starting from Tiger Lake.
+
 config MFD_IPAQ_MICRO
 	bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support"
 	depends on SA1100_H3100 || SA1100_H3600
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index fb1df45a301e..ce8f1c0583d5 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -216,6 +216,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS_PCI)	+= intel-lpss-pci.o
 obj-$(CONFIG_MFD_INTEL_LPSS_ACPI)	+= intel-lpss-acpi.o
 obj-$(CONFIG_MFD_INTEL_MSIC)	+= intel_msic.o
 obj-$(CONFIG_MFD_INTEL_PMC_BXT)	+= intel_pmc_bxt.o
+obj-$(CONFIG_MFD_INTEL_PMT)	+= intel_pmt.o
 obj-$(CONFIG_MFD_PALMAS)	+= palmas.o
 obj-$(CONFIG_MFD_VIPERBOARD)    += viperboard.o
 obj-$(CONFIG_MFD_RC5T583)	+= rc5t583.o rc5t583-irq.o
diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c
new file mode 100644
index 000000000000..744b230cdcca
--- /dev/null
+++ b/drivers/mfd/intel_pmt.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitoring Technology PMT driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: David E. Box <david.e.box@linux.intel.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/kernel.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+
+/* Intel DVSEC capability vendor space offsets */
+#define INTEL_DVSEC_ENTRIES		0xA
+#define INTEL_DVSEC_SIZE		0xB
+#define INTEL_DVSEC_TABLE		0xC
+#define INTEL_DVSEC_TABLE_BAR(x)	((x) & GENMASK(2, 0))
+#define INTEL_DVSEC_TABLE_OFFSET(x)	((x) & GENMASK(31, 3))
+#define INTEL_DVSEC_ENTRY_SIZE		4
+
+/* PMT capabilities */
+#define DVSEC_INTEL_ID_TELEMETRY	2
+#define DVSEC_INTEL_ID_WATCHER		3
+#define DVSEC_INTEL_ID_CRASHLOG		4
+
+struct intel_dvsec_header {
+	u16	length;
+	u16	id;
+	u8	num_entries;
+	u8	entry_size;
+	u8	tbir;
+	u32	offset;
+};
+
+enum pmt_quirks {
+	/* Watcher capability not supported */
+	PMT_QUIRK_NO_WATCHER	= BIT(0),
+
+	/* Crashlog capability not supported */
+	PMT_QUIRK_NO_CRASHLOG	= BIT(1),
+
+	/* Use shift instead of mask to read discovery table offset */
+	PMT_QUIRK_TABLE_SHIFT	= BIT(2),
+};
+
+struct pmt_platform_info {
+	unsigned long quirks;
+};
+
+static const struct pmt_platform_info tgl_info = {
+	.quirks = PMT_QUIRK_NO_WATCHER | PMT_QUIRK_NO_CRASHLOG |
+		  PMT_QUIRK_TABLE_SHIFT,
+};
+
+static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header,
+		       unsigned long quirks)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *res, *tmp;
+	struct mfd_cell *cell;
+	const char *name;
+	int count = header->num_entries;
+	int size = header->entry_size;
+	int id = header->id;
+	int i;
+
+	switch (id) {
+	case DVSEC_INTEL_ID_TELEMETRY:
+		name = "pmt_telemetry";
+		break;
+	case DVSEC_INTEL_ID_WATCHER:
+		if (quirks & PMT_QUIRK_NO_WATCHER) {
+			dev_info(dev, "Watcher not supported\n");
+			return 0;
+		}
+		name = "pmt_watcher";
+		break;
+	case DVSEC_INTEL_ID_CRASHLOG:
+		if (quirks & PMT_QUIRK_NO_CRASHLOG) {
+			dev_info(dev, "Crashlog not supported\n");
+			return 0;
+		}
+		name = "pmt_crashlog";
+		break;
+	default:
+		dev_err(dev, "Unrecognized PMT capability: %d\n", id);
+		return -EINVAL;
+	}
+
+	if (!header->num_entries || !header->entry_size) {
+		dev_err(dev, "Invalid count or size for %s header\n", name);
+		return -EINVAL;
+	}
+
+	cell = devm_kzalloc(dev, sizeof(*cell), GFP_KERNEL);
+	if (!cell)
+		return -ENOMEM;
+
+	res = devm_kcalloc(dev, count, sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	if (quirks & PMT_QUIRK_TABLE_SHIFT)
+		header->offset >>= 3;
+
+	/*
+	 * The PMT DVSEC contains the starting offset and count for a block of
+	 * discovery tables, each providing access to monitoring facilities for
+	 * a section of the device. Create a resource list of these tables to
+	 * provide to the driver.
+	 */
+	for (i = 0, tmp = res; i < count; i++, tmp++) {
+		tmp->start = pdev->resource[header->tbir].start +
+			     header->offset + i * (size << 2);
+		tmp->end = tmp->start + (size << 2) - 1;
+		tmp->flags = IORESOURCE_MEM;
+	}
+
+	cell->resources = res;
+	cell->num_resources = count;
+	cell->name = name;
+
+	return devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, cell, 1, NULL, 0,
+				    NULL);
+}
+
+static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct pmt_platform_info *info;
+	unsigned long quirks = 0;
+	bool found_devices = false;
+	int ret, pos = 0;
+
+	ret = pcim_enable_device(pdev);
+	if (ret)
+		return ret;
+
+	info = (struct pmt_platform_info *)id->driver_data;
+
+	if (info)
+		quirks = info->quirks;
+
+	do {
+		struct intel_dvsec_header header;
+		u32 table;
+		u16 vid;
+
+		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC);
+		if (!pos)
+			break;
+
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid);
+		if (vid != PCI_VENDOR_ID_INTEL)
+			continue;
+
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2,
+				     &header.id);
+		pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES,
+				     &header.num_entries);
+		pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE,
+				     &header.entry_size);
+		pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE,
+				      &table);
+
+		header.tbir = INTEL_DVSEC_TABLE_BAR(table);
+		header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
+
+		ret = pmt_add_dev(pdev, &header, quirks);
+		if (ret) {
+			dev_warn(&pdev->dev,
+				 "Failed to add device for DVSEC id %d\n",
+				 header.id);
+			continue;
+		}
+
+		found_devices = true;
+	} while (true);
+
+	if (!found_devices)
+		return -ENODEV;
+
+	pm_runtime_put(&pdev->dev);
+	pm_runtime_allow(&pdev->dev);
+
+	return 0;
+}
+
+static void pmt_pci_remove(struct pci_dev *pdev)
+{
+	pm_runtime_forbid(&pdev->dev);
+	pm_runtime_get_sync(&pdev->dev);
+}
+
+#define PCI_DEVICE_ID_INTEL_PMT_ADL	0x467d
+#define PCI_DEVICE_ID_INTEL_PMT_OOBMSM	0x09a7
+#define PCI_DEVICE_ID_INTEL_PMT_TGL	0x9a0d
+static const struct pci_device_id pmt_pci_ids[] = {
+	{ PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) },
+	{ PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) },
+	{ PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) },
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, pmt_pci_ids);
+
+static struct pci_driver pmt_pci_driver = {
+	.name = "intel-pmt",
+	.id_table = pmt_pci_ids,
+	.probe = pmt_pci_probe,
+	.remove = pmt_pci_remove,
+};
+module_pci_driver(pmt_pci_driver);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Platform Monitoring Technology PMT driver");
+MODULE_LICENSE("GPL v2");
-- 
Gitee


From 81b9ddf8fa3e6b6b208a2385ce48863122d2eacc Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:34 -0700
Subject: [PATCH 16/63] platform/x86: Intel PMT class driver

mainline inclusion
from mainline-v5.11-rc1
commit e2729113ce66d8d21f729b41bc3ed3feaf1acf69
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit e2729113ce66 platform/x86: Intel PMT class driver.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Intel Platform Monitoring Technology is meant to provide a common way to
access telemetry and system metrics.

Register mappings are not provided by the driver. Instead, a GUID is read
from a header for each endpoint. The GUID identifies the device and is to
be used with an XML, provided by the vendor, to discover the available set
of metrics and their register mapping.  This allows firmware updates to
modify the register space without needing to update the driver every time
with new mappings. Firmware writes a new GUID in this case to specify the
new mapping.  Software tools with access to the associated XML file can
then interpret the changes.

The module manages access to all Intel PMT endpoints on a system,
independent of the device exporting them. It creates an intel_pmt class
to manage the devices. For each telemetry endpoint, sysfs files provide
GUID and size information as well as a pointer to the parent device the
telemetry came from. Software may discover the association between
endpoints and devices by iterating through the list in sysfs, or by looking
for the existence of the class folder under the device of interest.  A
binary sysfs attribute of the same name allows software to then read or map
the telemetry space for direct access.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 .../ABI/testing/sysfs-class-intel_pmt         |  54 ++++
 MAINTAINERS                                   |   1 +
 drivers/platform/x86/Kconfig                  |  12 +
 drivers/platform/x86/Makefile                 |   1 +
 drivers/platform/x86/intel_pmt_class.c        | 297 ++++++++++++++++++
 drivers/platform/x86/intel_pmt_class.h        |  52 +++
 6 files changed, 417 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-class-intel_pmt
 create mode 100644 drivers/platform/x86/intel_pmt_class.c
 create mode 100644 drivers/platform/x86/intel_pmt_class.h

diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt
new file mode 100644
index 000000000000..926b5cf95fd1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-intel_pmt
@@ -0,0 +1,54 @@
+What:		/sys/class/intel_pmt/
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		The intel_pmt/ class directory contains information for
+		devices that expose hardware telemetry using Intel Platform
+		Monitoring Technology (PMT)
+
+What:		/sys/class/intel_pmt/telem<x>
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		The telem<x> directory contains files describing an instance of
+		a PMT telemetry device that exposes hardware telemetry. Each
+		telem<x> directory has an associated telem file. This file
+		may be opened and mapped or read to access the telemetry space
+		of the device. The register layout of the telemetry space is
+		determined from an XML file that matches the PCI device id and
+		GUID for the device.
+
+What:		/sys/class/intel_pmt/telem<x>/telem
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The telemetry data for this telemetry device. This file
+		may be mapped or read to obtain the data.
+
+What:		/sys/class/intel_pmt/telem<x>/guid
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The GUID for this telemetry device. The GUID identifies
+		the version of the XML file for the parent device that is to
+		be used to get the register layout.
+
+What:		/sys/class/intel_pmt/telem<x>/size
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The size of telemetry region in bytes that corresponds to
+		the mapping size for the telem file.
+
+What:		/sys/class/intel_pmt/telem<x>/offset
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The offset of telemetry region in bytes that corresponds to
+		the mapping for the telem file.
diff --git a/MAINTAINERS b/MAINTAINERS
index cdffd5192e09..466b1c599848 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9042,6 +9042,7 @@ INTEL PMT DRIVER
 M:	"David E. Box" <david.e.box@linux.intel.com>
 S:	Maintained
 F:	drivers/mfd/intel_pmt.c
+F:	drivers/platform/x86/intel_pmt_*
 
 INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
 M:	Stanislav Yakovlev <stas.yakovlev@gmail.com>
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index a1858689d6e1..9118f76ed3e9 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1362,6 +1362,18 @@ config INTEL_PMC_CORE
 		- LTR Ignore
 		- MPHY/PLL gating status (Sunrisepoint PCH only)
 
+config INTEL_PMT_CLASS
+	tristate "Intel Platform Monitoring Technology (PMT) Class driver"
+	help
+	  The Intel Platform Monitoring Technology (PMT) class driver provides
+	  the basic sysfs interface and file hierarchy uses by PMT devices.
+
+	  For more information, see:
+	  <file:Documentation/ABI/testing/sysfs-class-intel_pmt>
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel_pmt_class.
+
 config INTEL_PUNIT_IPC
 	tristate "Intel P-Unit IPC Driver"
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 5f823f7eff45..f4b1f87f2401 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -140,6 +140,7 @@ obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
 obj-$(CONFIG_INTEL_MID_POWER_BUTTON)	+= intel_mid_powerbtn.o
 obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_INTEL_PMC_CORE)		+= intel_pmc_core.o intel_pmc_core_pltdrv.o
+obj-$(CONFIG_INTEL_PMT_CLASS)		+= intel_pmt_class.o
 obj-$(CONFIG_INTEL_PUNIT_IPC)		+= intel_punit_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC)		+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_PCI)		+= intel_scu_pcidrv.o
diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
new file mode 100644
index 000000000000..aa88dc23bbde
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitory Technology Telemetry driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "Alexander Duyck" <alexander.h.duyck@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+
+#include "intel_pmt_class.h"
+
+#define PMT_XA_START		0
+#define PMT_XA_MAX		INT_MAX
+#define PMT_XA_LIMIT		XA_LIMIT(PMT_XA_START, PMT_XA_MAX)
+
+/*
+ * sysfs
+ */
+static ssize_t
+intel_pmt_read(struct file *filp, struct kobject *kobj,
+	       struct bin_attribute *attr, char *buf, loff_t off,
+	       size_t count)
+{
+	struct intel_pmt_entry *entry = container_of(attr,
+						     struct intel_pmt_entry,
+						     pmt_bin_attr);
+
+	if (off < 0)
+		return -EINVAL;
+
+	if (off >= entry->size)
+		return 0;
+
+	if (count > entry->size - off)
+		count = entry->size - off;
+
+	memcpy_fromio(buf, entry->base + off, count);
+
+	return count;
+}
+
+static int
+intel_pmt_mmap(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *attr, struct vm_area_struct *vma)
+{
+	struct intel_pmt_entry *entry = container_of(attr,
+						     struct intel_pmt_entry,
+						     pmt_bin_attr);
+	unsigned long vsize = vma->vm_end - vma->vm_start;
+	struct device *dev = kobj_to_dev(kobj);
+	unsigned long phys = entry->base_addr;
+	unsigned long pfn = PFN_DOWN(phys);
+	unsigned long psize;
+
+	if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE))
+		return -EROFS;
+
+	psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
+	if (vsize > psize) {
+		dev_err(dev, "Requested mmap size is too large\n");
+		return -EINVAL;
+	}
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+		vsize, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+static ssize_t
+guid_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "0x%x\n", entry->guid);
+}
+static DEVICE_ATTR_RO(guid);
+
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%zu\n", entry->size);
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t
+offset_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%lu\n", offset_in_page(entry->base_addr));
+}
+static DEVICE_ATTR_RO(offset);
+
+static struct attribute *intel_pmt_attrs[] = {
+	&dev_attr_guid.attr,
+	&dev_attr_size.attr,
+	&dev_attr_offset.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(intel_pmt);
+
+static struct class intel_pmt_class = {
+	.name = "intel_pmt",
+	.owner = THIS_MODULE,
+	.dev_groups = intel_pmt_groups,
+};
+
+static int intel_pmt_populate_entry(struct intel_pmt_entry *entry,
+				    struct intel_pmt_header *header,
+				    struct device *dev,
+				    struct resource *disc_res)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev->parent);
+	u8 bir;
+
+	/*
+	 * The base offset should always be 8 byte aligned.
+	 *
+	 * For non-local access types the lower 3 bits of base offset
+	 * contains the index of the base address register where the
+	 * telemetry can be found.
+	 */
+	bir = GET_BIR(header->base_offset);
+
+	/* Local access and BARID only for now */
+	switch (header->access_type) {
+	case ACCESS_LOCAL:
+		if (bir) {
+			dev_err(dev,
+				"Unsupported BAR index %d for access type %d\n",
+				bir, header->access_type);
+			return -EINVAL;
+		}
+		/*
+		 * For access_type LOCAL, the base address is as follows:
+		 * base address = end of discovery region + base offset
+		 */
+		entry->base_addr = disc_res->end + 1 + header->base_offset;
+		break;
+	case ACCESS_BARID:
+		/*
+		 * If another BAR was specified then the base offset
+		 * represents the offset within that BAR. SO retrieve the
+		 * address from the parent PCI device and add offset.
+		 */
+		entry->base_addr = pci_resource_start(pci_dev, bir) +
+				   GET_ADDRESS(header->base_offset);
+		break;
+	default:
+		dev_err(dev, "Unsupported access type %d\n",
+			header->access_type);
+		return -EINVAL;
+	}
+
+	entry->guid = header->guid;
+	entry->size = header->size;
+
+	return 0;
+}
+
+static int intel_pmt_dev_register(struct intel_pmt_entry *entry,
+				  struct intel_pmt_namespace *ns,
+				  struct device *parent)
+{
+	struct resource res;
+	struct device *dev;
+	int ret;
+
+	ret = xa_alloc(ns->xa, &entry->devid, entry, PMT_XA_LIMIT, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	dev = device_create(&intel_pmt_class, parent, MKDEV(0, 0), entry,
+			    "%s%d", ns->name, entry->devid);
+
+	if (IS_ERR(dev)) {
+		dev_err(parent, "Could not create %s%d device node\n",
+			ns->name, entry->devid);
+		ret = PTR_ERR(dev);
+		goto fail_dev_create;
+	}
+
+	entry->kobj = &dev->kobj;
+
+	if (ns->attr_grp) {
+		ret = sysfs_create_group(entry->kobj, ns->attr_grp);
+		if (ret)
+			goto fail_sysfs;
+	}
+
+	/* if size is 0 assume no data buffer, so no file needed */
+	if (!entry->size)
+		return 0;
+
+	res.start = entry->base_addr;
+	res.end = res.start + entry->size - 1;
+	res.flags = IORESOURCE_MEM;
+
+	entry->base = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(entry->base)) {
+		ret = PTR_ERR(entry->base);
+		goto fail_ioremap;
+	}
+
+	sysfs_bin_attr_init(&entry->pmt_bin_attr);
+	entry->pmt_bin_attr.attr.name = ns->name;
+	entry->pmt_bin_attr.attr.mode = 0440;
+	entry->pmt_bin_attr.mmap = intel_pmt_mmap;
+	entry->pmt_bin_attr.read = intel_pmt_read;
+	entry->pmt_bin_attr.size = entry->size;
+
+	ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr);
+	if (!ret)
+		return 0;
+
+fail_ioremap:
+	sysfs_remove_group(entry->kobj, ns->attr_grp);
+fail_sysfs:
+	device_unregister(dev);
+fail_dev_create:
+	xa_erase(ns->xa, entry->devid);
+
+	return ret;
+}
+
+int intel_pmt_dev_create(struct intel_pmt_entry *entry,
+			 struct intel_pmt_namespace *ns,
+			 struct platform_device *pdev, int idx)
+{
+	struct intel_pmt_header header;
+	struct resource	*disc_res;
+	int ret = -ENODEV;
+
+	disc_res = platform_get_resource(pdev, IORESOURCE_MEM, idx);
+	if (!disc_res)
+		return ret;
+
+	entry->disc_table = devm_platform_ioremap_resource(pdev, idx);
+	if (IS_ERR(entry->disc_table))
+		return PTR_ERR(entry->disc_table);
+
+	ret = ns->pmt_header_decode(entry, &header, &pdev->dev);
+	if (ret)
+		return ret;
+
+	ret = intel_pmt_populate_entry(entry, &header, &pdev->dev, disc_res);
+	if (ret)
+		return ret;
+
+	return intel_pmt_dev_register(entry, ns, &pdev->dev);
+
+}
+EXPORT_SYMBOL_GPL(intel_pmt_dev_create);
+
+void intel_pmt_dev_destroy(struct intel_pmt_entry *entry,
+			   struct intel_pmt_namespace *ns)
+{
+	struct device *dev = kobj_to_dev(entry->kobj);
+
+	if (entry->size)
+		sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr);
+
+	if (ns->attr_grp)
+		sysfs_remove_group(entry->kobj, ns->attr_grp);
+
+	device_unregister(dev);
+	xa_erase(ns->xa, entry->devid);
+}
+EXPORT_SYMBOL_GPL(intel_pmt_dev_destroy);
+
+static int __init pmt_class_init(void)
+{
+	return class_register(&intel_pmt_class);
+}
+
+static void __exit pmt_class_exit(void)
+{
+	class_unregister(&intel_pmt_class);
+}
+
+module_init(pmt_class_init);
+module_exit(pmt_class_exit);
+
+MODULE_AUTHOR("Alexander Duyck <alexander.h.duyck@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Class driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h
new file mode 100644
index 000000000000..de8f8139ba31
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_class.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_PMT_CLASS_H
+#define _INTEL_PMT_CLASS_H
+
+#include <linux/platform_device.h>
+#include <linux/xarray.h>
+#include <linux/types.h>
+#include <linux/bits.h>
+#include <linux/err.h>
+#include <linux/io.h>
+
+/* PMT access types */
+#define ACCESS_BARID		2
+#define ACCESS_LOCAL		3
+
+/* PMT discovery base address/offset register layout */
+#define GET_BIR(v)		((v) & GENMASK(2, 0))
+#define GET_ADDRESS(v)		((v) & GENMASK(31, 3))
+
+struct intel_pmt_entry {
+	struct bin_attribute	pmt_bin_attr;
+	struct kobject		*kobj;
+	void __iomem		*disc_table;
+	void __iomem		*base;
+	unsigned long		base_addr;
+	size_t			size;
+	u32			guid;
+	int			devid;
+};
+
+struct intel_pmt_header {
+	u32	base_offset;
+	u32	size;
+	u32	guid;
+	u8	access_type;
+};
+
+struct intel_pmt_namespace {
+	const char *name;
+	struct xarray *xa;
+	const struct attribute_group *attr_grp;
+	int (*pmt_header_decode)(struct intel_pmt_entry *entry,
+				 struct intel_pmt_header *header,
+				 struct device *dev);
+};
+
+int intel_pmt_dev_create(struct intel_pmt_entry *entry,
+			 struct intel_pmt_namespace *ns,
+			 struct platform_device *pdev, int idx);
+void intel_pmt_dev_destroy(struct intel_pmt_entry *entry,
+			   struct intel_pmt_namespace *ns);
+#endif
-- 
Gitee


From 2b9ca8a6fdbdefeff6d7832f666c32ba6006ac4f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:35 -0700
Subject: [PATCH 17/63] platform/x86: Intel PMT Telemetry capability driver

mainline inclusion
from mainline-v5.11-rc1
commit 68fe8e6e2c4b04e2733d77834f55a4a0e172b770
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 68fe8e6e2c4b platform/x86: Intel PMT Telemetry capability
driver.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

PMT Telemetry is a capability of the Intel Platform Monitoring Technology.
The Telemetry capability provides access to device telemetry metrics that
provide hardware performance data to users from read-only register spaces.

With this driver present the intel_pmt directory can be populated with
telem<x> devices. These devices will contain the standard intel_pmt sysfs
data and a "telem" binary sysfs attribute which can be used to access the
telemetry data.

Also create a PCI device id list for early telemetry hardware that require
workarounds for known issues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Co-developed-by: David E. Box <david.e.box@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig               |  11 ++
 drivers/platform/x86/Makefile              |   1 +
 drivers/platform/x86/intel_pmt_telemetry.c | 160 +++++++++++++++++++++
 3 files changed, 172 insertions(+)
 create mode 100644 drivers/platform/x86/intel_pmt_telemetry.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 9118f76ed3e9..ac93c807eb9a 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1374,6 +1374,17 @@ config INTEL_PMT_CLASS
 	  To compile this driver as a module, choose M here: the module
 	  will be called intel_pmt_class.
 
+config INTEL_PMT_TELEMETRY
+	tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver"
+	select INTEL_PMT_CLASS
+	help
+	  The Intel Platform Monitory Technology (PMT) Telemetry driver provides
+	  access to hardware telemetry metrics on devices that support the
+	  feature.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel_pmt_telemetry.
+
 config INTEL_PUNIT_IPC
 	tristate "Intel P-Unit IPC Driver"
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index f4b1f87f2401..6a7b61f59ea8 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -141,6 +141,7 @@ obj-$(CONFIG_INTEL_MID_POWER_BUTTON)	+= intel_mid_powerbtn.o
 obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_INTEL_PMC_CORE)		+= intel_pmc_core.o intel_pmc_core_pltdrv.o
 obj-$(CONFIG_INTEL_PMT_CLASS)		+= intel_pmt_class.o
+obj-$(CONFIG_INTEL_PMT_TELEMETRY)	+= intel_pmt_telemetry.o
 obj-$(CONFIG_INTEL_PUNIT_IPC)		+= intel_punit_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC)		+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_PCI)		+= intel_scu_pcidrv.o
diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c
new file mode 100644
index 000000000000..f8a87614efa4
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_telemetry.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitory Technology Telemetry driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "David E. Box" <david.e.box@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/overflow.h>
+
+#include "intel_pmt_class.h"
+
+#define TELEM_DEV_NAME		"pmt_telemetry"
+
+#define TELEM_SIZE_OFFSET	0x0
+#define TELEM_GUID_OFFSET	0x4
+#define TELEM_BASE_OFFSET	0x8
+#define TELEM_ACCESS(v)		((v) & GENMASK(3, 0))
+/* size is in bytes */
+#define TELEM_SIZE(v)		(((v) & GENMASK(27, 12)) >> 10)
+
+/* Used by client hardware to identify a fixed telemetry entry*/
+#define TELEM_CLIENT_FIXED_BLOCK_GUID	0x10000000
+
+struct pmt_telem_priv {
+	int				num_entries;
+	struct intel_pmt_entry		entry[];
+};
+
+/*
+ * Early implementations of PMT on client platforms have some
+ * differences from the server platforms (which use the Out Of Band
+ * Management Services Module OOBMSM). This list tracks those
+ * platforms as needed to handle those differences. Newer client
+ * platforms are expected to be fully compatible with server.
+ */
+static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
+	{ PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
+	{ PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */
+	{ }
+};
+
+static bool intel_pmt_is_early_client_hw(struct device *dev)
+{
+	struct pci_dev *parent = to_pci_dev(dev->parent);
+
+	return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
+}
+
+static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry,
+				      struct device *dev)
+{
+	u32 guid = readl(entry->disc_table + TELEM_GUID_OFFSET);
+
+	if (guid != TELEM_CLIENT_FIXED_BLOCK_GUID)
+		return false;
+
+	return intel_pmt_is_early_client_hw(dev);
+}
+
+static int pmt_telem_header_decode(struct intel_pmt_entry *entry,
+				   struct intel_pmt_header *header,
+				   struct device *dev)
+{
+	void __iomem *disc_table = entry->disc_table;
+
+	if (pmt_telem_region_overlaps(entry, dev))
+		return 1;
+
+	header->access_type = TELEM_ACCESS(readl(disc_table));
+	header->guid = readl(disc_table + TELEM_GUID_OFFSET);
+	header->base_offset = readl(disc_table + TELEM_BASE_OFFSET);
+
+	/* Size is measured in DWORDS, but accessor returns bytes */
+	header->size = TELEM_SIZE(readl(disc_table));
+
+	return 0;
+}
+
+static DEFINE_XARRAY_ALLOC(telem_array);
+static struct intel_pmt_namespace pmt_telem_ns = {
+	.name = "telem",
+	.xa = &telem_array,
+	.pmt_header_decode = pmt_telem_header_decode,
+};
+
+static int pmt_telem_remove(struct platform_device *pdev)
+{
+	struct pmt_telem_priv *priv = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < priv->num_entries; i++)
+		intel_pmt_dev_destroy(&priv->entry[i], &pmt_telem_ns);
+
+	return 0;
+}
+
+static int pmt_telem_probe(struct platform_device *pdev)
+{
+	struct pmt_telem_priv *priv;
+	size_t size;
+	int i, ret;
+
+	size = struct_size(priv, entry, pdev->num_resources);
+	priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, priv);
+
+	for (i = 0; i < pdev->num_resources; i++) {
+		struct intel_pmt_entry *entry = &priv->entry[i];
+
+		ret = intel_pmt_dev_create(entry, &pmt_telem_ns, pdev, i);
+		if (ret < 0)
+			goto abort_probe;
+		if (ret)
+			continue;
+
+		priv->num_entries++;
+	}
+
+	return 0;
+abort_probe:
+	pmt_telem_remove(pdev);
+	return ret;
+}
+
+static struct platform_driver pmt_telem_driver = {
+	.driver = {
+		.name   = TELEM_DEV_NAME,
+	},
+	.remove = pmt_telem_remove,
+	.probe  = pmt_telem_probe,
+};
+
+static int __init pmt_telem_init(void)
+{
+	return platform_driver_register(&pmt_telem_driver);
+}
+module_init(pmt_telem_init);
+
+static void __exit pmt_telem_exit(void)
+{
+	platform_driver_unregister(&pmt_telem_driver);
+	xa_destroy(&telem_array);
+}
+module_exit(pmt_telem_exit);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Telemetry driver");
+MODULE_ALIAS("platform:" TELEM_DEV_NAME);
+MODULE_LICENSE("GPL v2");
-- 
Gitee


From c25e00107196a59b8710e29ae8510f50b28ed489 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Wed, 28 Oct 2020 18:55:36 -0700
Subject: [PATCH 18/63] platform/x86: Intel PMT Crashlog capability driver

mainline inclusion
from mainline-v5.11-rc1
commit 5ef9998c96b0c99c49c202054586967e609286d2
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 5ef9998c96b0 platform/x86: Intel PMT Crashlog
capability driver.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Add support for the Intel Platform Monitoring Technology crashlog
interface. This interface provides a few sysfs values to allow for
controlling the crashlog telemetry interface as well as a character
driver to allow for mapping the crashlog memory region so that it can be
accessed after a crashlog has been recorded.

This driver is meant to only support the server version of the crashlog
which is identified as crash_type 1 with a version of zero. Currently no
other types are supported.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 .../ABI/testing/sysfs-class-intel_pmt         |  65 ++++
 drivers/platform/x86/Kconfig                  |  11 +
 drivers/platform/x86/Makefile                 |   1 +
 drivers/platform/x86/intel_pmt_crashlog.c     | 328 ++++++++++++++++++
 4 files changed, 405 insertions(+)
 create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c

diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt
index 926b5cf95fd1..ed4c886a21b1 100644
--- a/Documentation/ABI/testing/sysfs-class-intel_pmt
+++ b/Documentation/ABI/testing/sysfs-class-intel_pmt
@@ -52,3 +52,68 @@ Contact:	David Box <david.e.box@linux.intel.com>
 Description:
 		(RO) The offset of telemetry region in bytes that corresponds to
 		the mapping for the telem file.
+
+What:		/sys/class/intel_pmt/crashlog<x>
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		The crashlog<x> directory contains files for configuring an
+		instance of a PMT crashlog device that can perform crash data
+		recording. Each crashlog<x> device has an associated crashlog
+		file. This file can be opened and mapped or read to access the
+		resulting crashlog buffer. The register layout for the buffer
+		can be determined from an XML file of specified GUID for the
+		parent device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/crashlog
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box <david.e.box@linux.intel.com>
+Description:
+		(RO) The crashlog buffer for this crashlog device. This file
+		may be mapped or read to obtain the data.
+
+What:		/sys/class/intel_pmt/crashlog<x>/guid
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RO) The GUID for this crashlog device. The GUID identifies the
+		version of the XML file for the parent device that should be
+		used to determine the register layout.
+
+What:		/sys/class/intel_pmt/crashlog<x>/size
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RO) The length of the result buffer in bytes that corresponds
+		to the size for the crashlog buffer.
+
+What:		/sys/class/intel_pmt/crashlog<x>/offset
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RO) The offset of the buffer in bytes that corresponds
+		to the mapping for the crashlog device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/enable
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RW) Boolean value controlling if the crashlog functionality
+		is enabled for the crashlog device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/trigger
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Description:
+		(RW) Boolean value controlling the triggering of the crashlog
+		device node. When read it provides data on if the crashlog has
+		been triggered. When written to it can be used to either clear
+		the current trigger by writing false, or to trigger a new
+		event if the trigger is not currently set.
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index ac93c807eb9a..da36a682d0cf 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1385,6 +1385,17 @@ config INTEL_PMT_TELEMETRY
 	  To compile this driver as a module, choose M here: the module
 	  will be called intel_pmt_telemetry.
 
+config INTEL_PMT_CRASHLOG
+	tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver"
+	select INTEL_PMT_CLASS
+	help
+	  The Intel Platform Monitoring Technology (PMT) crashlog driver provides
+	  access to hardware crashlog capabilities on devices that support the
+	  feature.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel_pmt_crashlog.
+
 config INTEL_PUNIT_IPC
 	tristate "Intel P-Unit IPC Driver"
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 6a7b61f59ea8..ca82c1344977 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -142,6 +142,7 @@ obj-$(CONFIG_INTEL_MRFLD_PWRBTN)	+= intel_mrfld_pwrbtn.o
 obj-$(CONFIG_INTEL_PMC_CORE)		+= intel_pmc_core.o intel_pmc_core_pltdrv.o
 obj-$(CONFIG_INTEL_PMT_CLASS)		+= intel_pmt_class.o
 obj-$(CONFIG_INTEL_PMT_TELEMETRY)	+= intel_pmt_telemetry.o
+obj-$(CONFIG_INTEL_PMT_CRASHLOG)	+= intel_pmt_crashlog.o
 obj-$(CONFIG_INTEL_PUNIT_IPC)		+= intel_punit_ipc.o
 obj-$(CONFIG_INTEL_SCU_IPC)		+= intel_scu_ipc.o
 obj-$(CONFIG_INTEL_SCU_PCI)		+= intel_scu_pcidrv.o
diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c
new file mode 100644
index 000000000000..97dd749c8290
--- /dev/null
+++ b/drivers/platform/x86/intel_pmt_crashlog.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitoring Technology Crashlog driver
+ *
+ * Copyright (c) 2020, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "Alexander Duyck" <alexander.h.duyck@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/overflow.h>
+
+#include "intel_pmt_class.h"
+
+#define DRV_NAME		"pmt_crashlog"
+
+/* Crashlog discovery header types */
+#define CRASH_TYPE_OOBMSM	1
+
+/* Control Flags */
+#define CRASHLOG_FLAG_DISABLE		BIT(27)
+
+/*
+ * Bits 28 and 29 control the state of bit 31.
+ *
+ * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured.
+ * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31.
+ * Bit 30 is read-only and reserved as 0.
+ * Bit 31 is the read-only status with a 1 indicating log is complete.
+ */
+#define CRASHLOG_FLAG_TRIGGER_CLEAR	BIT(28)
+#define CRASHLOG_FLAG_TRIGGER_EXECUTE	BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_COMPLETE	BIT(31)
+#define CRASHLOG_FLAG_TRIGGER_MASK	GENMASK(31, 28)
+
+/* Crashlog Discovery Header */
+#define CONTROL_OFFSET		0x0
+#define GUID_OFFSET		0x4
+#define BASE_OFFSET		0x8
+#define SIZE_OFFSET		0xC
+#define GET_ACCESS(v)		((v) & GENMASK(3, 0))
+#define GET_TYPE(v)		(((v) & GENMASK(7, 4)) >> 4)
+#define GET_VERSION(v)		(((v) & GENMASK(19, 16)) >> 16)
+/* size is in bytes */
+#define GET_SIZE(v)		((v) * sizeof(u32))
+
+struct crashlog_entry {
+	/* entry must be first member of struct */
+	struct intel_pmt_entry		entry;
+	struct mutex			control_mutex;
+};
+
+struct pmt_crashlog_priv {
+	int			num_entries;
+	struct crashlog_entry	entry[];
+};
+
+/*
+ * I/O
+ */
+static bool pmt_crashlog_complete(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	/* return current value of the crashlog complete flag */
+	return !!(control & CRASHLOG_FLAG_TRIGGER_COMPLETE);
+}
+
+static bool pmt_crashlog_disabled(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	/* return current value of the crashlog disabled flag */
+	return !!(control & CRASHLOG_FLAG_DISABLE);
+}
+
+static bool pmt_crashlog_supported(struct intel_pmt_entry *entry)
+{
+	u32 discovery_header = readl(entry->disc_table + CONTROL_OFFSET);
+	u32 crash_type, version;
+
+	crash_type = GET_TYPE(discovery_header);
+	version = GET_VERSION(discovery_header);
+
+	/*
+	 * Currently we only recognize OOBMSM version 0 devices.
+	 * We can ignore all other crashlog devices in the system.
+	 */
+	return crash_type == CRASH_TYPE_OOBMSM && version == 0;
+}
+
+static void pmt_crashlog_set_disable(struct intel_pmt_entry *entry,
+				     bool disable)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	/* clear trigger bits so we are only modifying disable flag */
+	control &= ~CRASHLOG_FLAG_TRIGGER_MASK;
+
+	if (disable)
+		control |= CRASHLOG_FLAG_DISABLE;
+	else
+		control &= ~CRASHLOG_FLAG_DISABLE;
+
+	writel(control, entry->disc_table + CONTROL_OFFSET);
+}
+
+static void pmt_crashlog_set_clear(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	control &= ~CRASHLOG_FLAG_TRIGGER_MASK;
+	control |= CRASHLOG_FLAG_TRIGGER_CLEAR;
+
+	writel(control, entry->disc_table + CONTROL_OFFSET);
+}
+
+static void pmt_crashlog_set_execute(struct intel_pmt_entry *entry)
+{
+	u32 control = readl(entry->disc_table + CONTROL_OFFSET);
+
+	control &= ~CRASHLOG_FLAG_TRIGGER_MASK;
+	control |= CRASHLOG_FLAG_TRIGGER_EXECUTE;
+
+	writel(control, entry->disc_table + CONTROL_OFFSET);
+}
+
+/*
+ * sysfs
+ */
+static ssize_t
+enable_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry = dev_get_drvdata(dev);
+	int enabled = !pmt_crashlog_disabled(entry);
+
+	return sprintf(buf, "%d\n", enabled);
+}
+
+static ssize_t
+enable_store(struct device *dev, struct device_attribute *attr,
+	    const char *buf, size_t count)
+{
+	struct crashlog_entry *entry;
+	bool enabled;
+	int result;
+
+	entry = dev_get_drvdata(dev);
+
+	result = kstrtobool(buf, &enabled);
+	if (result)
+		return result;
+
+	mutex_lock(&entry->control_mutex);
+	pmt_crashlog_set_disable(&entry->entry, !enabled);
+	mutex_unlock(&entry->control_mutex);
+
+	return count;
+}
+static DEVICE_ATTR_RW(enable);
+
+static ssize_t
+trigger_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct intel_pmt_entry *entry;
+	int trigger;
+
+	entry = dev_get_drvdata(dev);
+	trigger = pmt_crashlog_complete(entry);
+
+	return sprintf(buf, "%d\n", trigger);
+}
+
+static ssize_t
+trigger_store(struct device *dev, struct device_attribute *attr,
+	    const char *buf, size_t count)
+{
+	struct crashlog_entry *entry;
+	bool trigger;
+	int result;
+
+	entry = dev_get_drvdata(dev);
+
+	result = kstrtobool(buf, &trigger);
+	if (result)
+		return result;
+
+	mutex_lock(&entry->control_mutex);
+
+	if (!trigger) {
+		pmt_crashlog_set_clear(&entry->entry);
+	} else if (pmt_crashlog_complete(&entry->entry)) {
+		/* we cannot trigger a new crash if one is still pending */
+		result = -EEXIST;
+		goto err;
+	} else if (pmt_crashlog_disabled(&entry->entry)) {
+		/* if device is currently disabled, return busy */
+		result = -EBUSY;
+		goto err;
+	} else {
+		pmt_crashlog_set_execute(&entry->entry);
+	}
+
+	result = count;
+err:
+	mutex_unlock(&entry->control_mutex);
+	return result;
+}
+static DEVICE_ATTR_RW(trigger);
+
+static struct attribute *pmt_crashlog_attrs[] = {
+	&dev_attr_enable.attr,
+	&dev_attr_trigger.attr,
+	NULL
+};
+
+static struct attribute_group pmt_crashlog_group = {
+	.attrs	= pmt_crashlog_attrs,
+};
+
+static int pmt_crashlog_header_decode(struct intel_pmt_entry *entry,
+				      struct intel_pmt_header *header,
+				      struct device *dev)
+{
+	void __iomem *disc_table = entry->disc_table;
+	struct crashlog_entry *crashlog;
+
+	if (!pmt_crashlog_supported(entry))
+		return 1;
+
+	/* initialize control mutex */
+	crashlog = container_of(entry, struct crashlog_entry, entry);
+	mutex_init(&crashlog->control_mutex);
+
+	header->access_type = GET_ACCESS(readl(disc_table));
+	header->guid = readl(disc_table + GUID_OFFSET);
+	header->base_offset = readl(disc_table + BASE_OFFSET);
+
+	/* Size is measured in DWORDS, but accessor returns bytes */
+	header->size = GET_SIZE(readl(disc_table + SIZE_OFFSET));
+
+	return 0;
+}
+
+static DEFINE_XARRAY_ALLOC(crashlog_array);
+static struct intel_pmt_namespace pmt_crashlog_ns = {
+	.name = "crashlog",
+	.xa = &crashlog_array,
+	.attr_grp = &pmt_crashlog_group,
+	.pmt_header_decode = pmt_crashlog_header_decode,
+};
+
+/*
+ * initialization
+ */
+static int pmt_crashlog_remove(struct platform_device *pdev)
+{
+	struct pmt_crashlog_priv *priv = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < priv->num_entries; i++)
+		intel_pmt_dev_destroy(&priv->entry[i].entry, &pmt_crashlog_ns);
+
+	return 0;
+}
+
+static int pmt_crashlog_probe(struct platform_device *pdev)
+{
+	struct pmt_crashlog_priv *priv;
+	size_t size;
+	int i, ret;
+
+	size = struct_size(priv, entry, pdev->num_resources);
+	priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, priv);
+
+	for (i = 0; i < pdev->num_resources; i++) {
+		struct intel_pmt_entry *entry = &priv->entry[i].entry;
+
+		ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, pdev, i);
+		if (ret < 0)
+			goto abort_probe;
+		if (ret)
+			continue;
+
+		priv->num_entries++;
+	}
+
+	return 0;
+abort_probe:
+	pmt_crashlog_remove(pdev);
+	return ret;
+}
+
+static struct platform_driver pmt_crashlog_driver = {
+	.driver = {
+		.name   = DRV_NAME,
+	},
+	.remove = pmt_crashlog_remove,
+	.probe  = pmt_crashlog_probe,
+};
+
+static int __init pmt_crashlog_init(void)
+{
+	return platform_driver_register(&pmt_crashlog_driver);
+}
+
+static void __exit pmt_crashlog_exit(void)
+{
+	platform_driver_unregister(&pmt_crashlog_driver);
+	xa_destroy(&crashlog_array);
+}
+
+module_init(pmt_crashlog_init);
+module_exit(pmt_crashlog_exit);
+
+MODULE_AUTHOR("Alexander Duyck <alexander.h.duyck@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Crashlog driver");
+MODULE_ALIAS("platform:" DRV_NAME);
+MODULE_LICENSE("GPL v2");
-- 
Gitee


From c10e13c43a8b060acb6c82c7fb18c778ccaf81ff Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 17 Nov 2020 10:22:51 +0300
Subject: [PATCH 19/63] platform/x86: pmt: Fix a potential Oops on error in
 probe

mainline inclusion
from mainline-v5.11-rc1
commit d3d73d25e0d9bc43fd2a6f4b4e58ff182e55b217
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit d3d73d25e0d9 platform/x86: pmt: Fix a potential Oops
on error in probe.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

The "ns->attr_grp" pointer can be NULL so this error handling code needs
to check for that to avoid an Oops.

Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20201117072251.GC1111239@mwanda
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_class.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
index aa88dc23bbde..c8939fba4509 100644
--- a/drivers/platform/x86/intel_pmt_class.c
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -225,7 +225,8 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry,
 		return 0;
 
 fail_ioremap:
-	sysfs_remove_group(entry->kobj, ns->attr_grp);
+	if (ns->attr_grp)
+		sysfs_remove_group(entry->kobj, ns->attr_grp);
 fail_sysfs:
 	device_unregister(dev);
 fail_dev_create:
-- 
Gitee


From aa1980cb3878e62c6d8f17516a1e06361c086e52 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 26 Jan 2021 12:55:06 -0800
Subject: [PATCH 20/63] platform/x86: intel_pmt: Make INTEL_PMT_CLASS
 non-user-selectable

mainline inclusion
from mainline-v5.12-rc2
commit 35d8a973fe4d38afee944db636c3d2b1df3741a7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 35d8a973fe4d platform/x86: intel_pmt: Make
INTEL_PMT_CLASS non-user-selectable.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Fix error in Kconfig that exposed INTEL_PMT_CLASS as a user selectable
option. It is already selected by INTEL_PMT_TELEMETRY and
INTEL_PMT_CRASHLOG which are user selectable.

Fixes: e2729113ce66 ("platform/x86: Intel PMT class driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210126205508.30907-1-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index da36a682d0cf..15c8dd312af5 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1363,7 +1363,7 @@ config INTEL_PMC_CORE
 		- MPHY/PLL gating status (Sunrisepoint PCH only)
 
 config INTEL_PMT_CLASS
-	tristate "Intel Platform Monitoring Technology (PMT) Class driver"
+	tristate
 	help
 	  The Intel Platform Monitoring Technology (PMT) class driver provides
 	  the basic sysfs interface and file hierarchy uses by PMT devices.
-- 
Gitee


From 0c7b979ae17ac1369e71045eabf158480ff1e862 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 26 Jan 2021 12:55:07 -0800
Subject: [PATCH 21/63] platform/x86: intel_pmt_telemetry: Add dependency on
 MFD_INTEL_PMT

mainline inclusion
from mainline-v5.12-rc2
commit f3f6da5014dea3cc005b36948abe3664b5d1f7d3
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit f3f6da5014de platform/x86: intel_pmt_telemetry: Add
dependency on MFD_INTEL_PMT.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

All devices that expose Intel Platform Monitoring Technology (PMT)
telemetry are currently owned by the intel_pmt MFD driver. Therefore make
the telemetry driver depend on the MFD driver for build.

Fixes: 68fe8e6e2c4b ("platform/x86: Intel PMT Telemetry capability driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210126205508.30907-2-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 15c8dd312af5..06484c5c5b8c 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1376,6 +1376,7 @@ config INTEL_PMT_CLASS
 
 config INTEL_PMT_TELEMETRY
 	tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver"
+	depends on MFD_INTEL_PMT
 	select INTEL_PMT_CLASS
 	help
 	  The Intel Platform Monitory Technology (PMT) Telemetry driver provides
-- 
Gitee


From 22c0f0c9e9bf18f63f8aed22f7e2defd077753df Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 26 Jan 2021 12:55:08 -0800
Subject: [PATCH 22/63] platform/x86: intel_pmt_crashlog: Add dependency on
 MFD_INTEL_PMT

mainline inclusion
from mainline-v5.12-rc2
commit fdd3feb37e36bec2ad75d76f8ac4d0273c5c0a91
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit fdd3feb37e36 platform/x86: intel_pmt_crashlog: Add
dependency on MFD_INTEL_PMT.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

All devices that expose Intel Platform Monitoring Technology (PMT)
crashlog are currently owned by the intel_pmt MFD driver. Therefore make
the crashlog driver depend on the MFD driver for build.

Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210126205508.30907-3-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 06484c5c5b8c..a24783aa52ea 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1388,6 +1388,7 @@ config INTEL_PMT_TELEMETRY
 
 config INTEL_PMT_CRASHLOG
 	tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver"
+	depends on MFD_INTEL_PMT
 	select INTEL_PMT_CLASS
 	help
 	  The Intel Platform Monitoring Technology (PMT) crashlog driver provides
-- 
Gitee


From 9c2552c05b7f89f76c32ce573a33576e430f72f5 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 24 Feb 2021 12:10:04 -0800
Subject: [PATCH 23/63] mfd: intel_pmt: Fix nuisance messages and handling of
 disabled capabilities

mainline inclusion
from mainline-v5.13-rc1
commit a1a5c1c3df282dc122508a17500317266ef19e46
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit a1a5c1c3df28 mfd: intel_pmt: Fix nuisance messages and
handling of disabled capabilities.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Some products will be available that have PMT capabilities that are not
supported. Remove the warnings in this instance to avoid nuisance messages
and confusion.

Also return an error code for capabilities that are disabled by quirk to
prevent them from keeping the driver loaded if only disabled capabilities
are found.

Fixes: 4f8217d5b0ca ("mfd: Intel Platform Monitoring Technology support")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/mfd/intel_pmt.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c
index 744b230cdcca..65da2b17a204 100644
--- a/drivers/mfd/intel_pmt.c
+++ b/drivers/mfd/intel_pmt.c
@@ -79,19 +79,18 @@ static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header,
 	case DVSEC_INTEL_ID_WATCHER:
 		if (quirks & PMT_QUIRK_NO_WATCHER) {
 			dev_info(dev, "Watcher not supported\n");
-			return 0;
+			return -EINVAL;
 		}
 		name = "pmt_watcher";
 		break;
 	case DVSEC_INTEL_ID_CRASHLOG:
 		if (quirks & PMT_QUIRK_NO_CRASHLOG) {
 			dev_info(dev, "Crashlog not supported\n");
-			return 0;
+			return -EINVAL;
 		}
 		name = "pmt_crashlog";
 		break;
 	default:
-		dev_err(dev, "Unrecognized PMT capability: %d\n", id);
 		return -EINVAL;
 	}
 
@@ -174,12 +173,8 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
 
 		ret = pmt_add_dev(pdev, &header, quirks);
-		if (ret) {
-			dev_warn(&pdev->dev,
-				 "Failed to add device for DVSEC id %d\n",
-				 header.id);
+		if (ret)
 			continue;
-		}
 
 		found_devices = true;
 	} while (true);
-- 
Gitee


From 1a6161db47ce8d98291caf649c095fe75352e2f5 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 24 Feb 2021 12:10:05 -0800
Subject: [PATCH 24/63] mfd: intel_pmt: Add support for DG1

mainline inclusion
from mainline-v5.13-rc1
commit aa47ad3f853ae72c32b7e46dfc8bc2c8dc2dbad7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit aa47ad3f853a mfd: intel_pmt: Add support for DG1.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Adds PMT Telemetry aggregator support for the DG1 graphics PCIe card. The
device does not have the DVSEC region in its PCI config space so hard
code the discovery table data in the driver. Also requires a fix for DG1
in the Telemetry driver for how the ACCESS_TYPE field is used.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/mfd/intel_pmt.c                    | 101 +++++++++++++++------
 drivers/platform/x86/intel_pmt_class.c     |  46 ++++++++++
 drivers/platform/x86/intel_pmt_class.h     |   1 +
 drivers/platform/x86/intel_pmt_telemetry.c |  20 ----
 4 files changed, 119 insertions(+), 49 deletions(-)

diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c
index 65da2b17a204..dd7eb614c28e 100644
--- a/drivers/mfd/intel_pmt.c
+++ b/drivers/mfd/intel_pmt.c
@@ -49,10 +49,14 @@ enum pmt_quirks {
 
 	/* Use shift instead of mask to read discovery table offset */
 	PMT_QUIRK_TABLE_SHIFT	= BIT(2),
+
+	/* DVSEC not present (provided in driver data) */
+	PMT_QUIRK_NO_DVSEC	= BIT(3),
 };
 
 struct pmt_platform_info {
 	unsigned long quirks;
+	struct intel_dvsec_header **capabilities;
 };
 
 static const struct pmt_platform_info tgl_info = {
@@ -60,6 +64,26 @@ static const struct pmt_platform_info tgl_info = {
 		  PMT_QUIRK_TABLE_SHIFT,
 };
 
+/* DG1 Platform with DVSEC quirk*/
+static struct intel_dvsec_header dg1_telemetry = {
+	.length = 0x10,
+	.id = 2,
+	.num_entries = 1,
+	.entry_size = 3,
+	.tbir = 0,
+	.offset = 0x466000,
+};
+
+static struct intel_dvsec_header *dg1_capabilities[] = {
+	&dg1_telemetry,
+	NULL
+};
+
+static const struct pmt_platform_info dg1_info = {
+	.quirks = PMT_QUIRK_NO_DVSEC,
+	.capabilities = dg1_capabilities,
+};
+
 static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header,
 		       unsigned long quirks)
 {
@@ -147,37 +171,54 @@ static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (info)
 		quirks = info->quirks;
 
-	do {
-		struct intel_dvsec_header header;
-		u32 table;
-		u16 vid;
+	if (info && (info->quirks & PMT_QUIRK_NO_DVSEC)) {
+		struct intel_dvsec_header **header;
 
-		pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC);
-		if (!pos)
-			break;
+		header = info->capabilities;
+		while (*header) {
+			ret = pmt_add_dev(pdev, *header, quirks);
+			if (ret)
+				dev_warn(&pdev->dev,
+					 "Failed to add device for DVSEC id %d\n",
+					 (*header)->id);
+			else
+				found_devices = true;
 
-		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid);
-		if (vid != PCI_VENDOR_ID_INTEL)
-			continue;
-
-		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2,
-				     &header.id);
-		pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES,
-				     &header.num_entries);
-		pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE,
-				     &header.entry_size);
-		pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE,
-				      &table);
-
-		header.tbir = INTEL_DVSEC_TABLE_BAR(table);
-		header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
-
-		ret = pmt_add_dev(pdev, &header, quirks);
-		if (ret)
-			continue;
-
-		found_devices = true;
-	} while (true);
+			++header;
+		}
+	} else {
+		do {
+			struct intel_dvsec_header header;
+			u32 table;
+			u16 vid;
+
+			pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC);
+			if (!pos)
+				break;
+
+			pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid);
+			if (vid != PCI_VENDOR_ID_INTEL)
+				continue;
+
+			pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2,
+					     &header.id);
+			pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES,
+					     &header.num_entries);
+			pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE,
+					     &header.entry_size);
+			pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE,
+					      &table);
+
+			header.tbir = INTEL_DVSEC_TABLE_BAR(table);
+			header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
+
+			ret = pmt_add_dev(pdev, &header, quirks);
+			if (ret)
+				continue;
+
+			found_devices = true;
+		} while (true);
+	}
 
 	if (!found_devices)
 		return -ENODEV;
@@ -195,10 +236,12 @@ static void pmt_pci_remove(struct pci_dev *pdev)
 }
 
 #define PCI_DEVICE_ID_INTEL_PMT_ADL	0x467d
+#define PCI_DEVICE_ID_INTEL_PMT_DG1	0x490e
 #define PCI_DEVICE_ID_INTEL_PMT_OOBMSM	0x09a7
 #define PCI_DEVICE_ID_INTEL_PMT_TGL	0x9a0d
 static const struct pci_device_id pmt_pci_ids[] = {
 	{ PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) },
+	{ PCI_DEVICE_DATA(INTEL, PMT_DG1, &dg1_info) },
 	{ PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) },
 	{ PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) },
 	{ }
diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
index c8939fba4509..228e21f1ce5c 100644
--- a/drivers/platform/x86/intel_pmt_class.c
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -19,6 +19,28 @@
 #define PMT_XA_MAX		INT_MAX
 #define PMT_XA_LIMIT		XA_LIMIT(PMT_XA_START, PMT_XA_MAX)
 
+/*
+ * Early implementations of PMT on client platforms have some
+ * differences from the server platforms (which use the Out Of Band
+ * Management Services Module OOBMSM). This list tracks those
+ * platforms as needed to handle those differences. Newer client
+ * platforms are expected to be fully compatible with server.
+ */
+static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
+	{ PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */
+	{ PCI_VDEVICE(INTEL, 0x490e) }, /* DG1 */
+	{ PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
+	{ }
+};
+
+bool intel_pmt_is_early_client_hw(struct device *dev)
+{
+	struct pci_dev *parent = to_pci_dev(dev->parent);
+
+	return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
+}
+EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
+
 /*
  * sysfs
  */
@@ -147,6 +169,30 @@ static int intel_pmt_populate_entry(struct intel_pmt_entry *entry,
 		 * base address = end of discovery region + base offset
 		 */
 		entry->base_addr = disc_res->end + 1 + header->base_offset;
+
+		/*
+		 * Some hardware use a different calculation for the base address
+		 * when access_type == ACCESS_LOCAL. On the these systems
+		 * ACCCESS_LOCAL refers to an address in the same BAR as the
+		 * header but at a fixed offset. But as the header address was
+		 * supplied to the driver, we don't know which BAR it was in.
+		 * So search for the bar whose range includes the header address.
+		 */
+		if (intel_pmt_is_early_client_hw(dev)) {
+			int i;
+
+			entry->base_addr = 0;
+			for (i = 0; i < 6; i++)
+				if (disc_res->start >= pci_resource_start(pci_dev, i) &&
+				   (disc_res->start <= pci_resource_end(pci_dev, i))) {
+					entry->base_addr = pci_resource_start(pci_dev, i) +
+							   header->base_offset;
+					break;
+				}
+			if (!entry->base_addr)
+				return -EINVAL;
+		}
+
 		break;
 	case ACCESS_BARID:
 		/*
diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h
index de8f8139ba31..1337019c2873 100644
--- a/drivers/platform/x86/intel_pmt_class.h
+++ b/drivers/platform/x86/intel_pmt_class.h
@@ -44,6 +44,7 @@ struct intel_pmt_namespace {
 				 struct device *dev);
 };
 
+bool intel_pmt_is_early_client_hw(struct device *dev);
 int intel_pmt_dev_create(struct intel_pmt_entry *entry,
 			 struct intel_pmt_namespace *ns,
 			 struct platform_device *pdev, int idx);
diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c
index f8a87614efa4..9b95ef050457 100644
--- a/drivers/platform/x86/intel_pmt_telemetry.c
+++ b/drivers/platform/x86/intel_pmt_telemetry.c
@@ -34,26 +34,6 @@ struct pmt_telem_priv {
 	struct intel_pmt_entry		entry[];
 };
 
-/*
- * Early implementations of PMT on client platforms have some
- * differences from the server platforms (which use the Out Of Band
- * Management Services Module OOBMSM). This list tracks those
- * platforms as needed to handle those differences. Newer client
- * platforms are expected to be fully compatible with server.
- */
-static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
-	{ PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
-	{ PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */
-	{ }
-};
-
-static bool intel_pmt_is_early_client_hw(struct device *dev)
-{
-	struct pci_dev *parent = to_pci_dev(dev->parent);
-
-	return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
-}
-
 static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry,
 				      struct device *dev)
 {
-- 
Gitee


From 5ac8a629650c114f25da7d9c41926640dfed6f53 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 16 Mar 2021 19:44:54 -0700
Subject: [PATCH 25/63] platform/x86: intel_pmt_class: Initial resource to 0

mainline inclusion
from mainline-v5.13-rc1
commit 501bb68a66cfc0bc2a2458483400cb49daca974f
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 501bb68a66cf platform/x86: intel_pmt_class: Initial resource
to 0.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Initialize the struct resource in intel_pmt_dev_register to zero to avoid a
fault should the char *name field be non-zero.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210317024455.3071477-1-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_class.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c
index 228e21f1ce5c..c86ff15b1ed5 100644
--- a/drivers/platform/x86/intel_pmt_class.c
+++ b/drivers/platform/x86/intel_pmt_class.c
@@ -219,7 +219,7 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry,
 				  struct intel_pmt_namespace *ns,
 				  struct device *parent)
 {
-	struct resource res;
+	struct resource res = {0};
 	struct device *dev;
 	int ret;
 
-- 
Gitee


From 8ab1b0c075ce22bcb036812c4ac4c898fdd15b43 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 16 Mar 2021 19:44:55 -0700
Subject: [PATCH 26/63] platform/x86: intel_pmt_crashlog: Fix incorrect macros

mainline inclusion
from mainline-v5.12-rc5
commit 10c931cdfe64ebc38a15a485dd794915044f2111
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit 10c931cdfe64 platform/x86: intel_pmt_crashlog: Fix
incorrect macros.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Fixes off-by-one bugs in the macro assignments for the crashlog control
bits. Was initially tested on emulation but bug revealed after testing on
silicon.

Fixes: 5ef9998c96b0 ("platform/x86: Intel PMT Crashlog capability driver")
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210317024455.3071477-2-david.e.box@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_crashlog.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c
index 97dd749c8290..92d315a16cfd 100644
--- a/drivers/platform/x86/intel_pmt_crashlog.c
+++ b/drivers/platform/x86/intel_pmt_crashlog.c
@@ -23,18 +23,17 @@
 #define CRASH_TYPE_OOBMSM	1
 
 /* Control Flags */
-#define CRASHLOG_FLAG_DISABLE		BIT(27)
+#define CRASHLOG_FLAG_DISABLE		BIT(28)
 
 /*
- * Bits 28 and 29 control the state of bit 31.
+ * Bits 29 and 30 control the state of bit 31.
  *
- * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured.
- * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31.
- * Bit 30 is read-only and reserved as 0.
+ * Bit 29 will clear bit 31, if set, allowing a new crashlog to be captured.
+ * Bit 30 will immediately trigger a crashlog to be generated, setting bit 31.
  * Bit 31 is the read-only status with a 1 indicating log is complete.
  */
-#define CRASHLOG_FLAG_TRIGGER_CLEAR	BIT(28)
-#define CRASHLOG_FLAG_TRIGGER_EXECUTE	BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_CLEAR	BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_EXECUTE	BIT(30)
 #define CRASHLOG_FLAG_TRIGGER_COMPLETE	BIT(31)
 #define CRASHLOG_FLAG_TRIGGER_MASK	GENMASK(31, 28)
 
-- 
Gitee


From 5b15a15d7477110bc7cb53bfc039f87e6770634e Mon Sep 17 00:00:00 2001
From: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Date: Sat, 5 Jun 2021 22:38:05 +0200
Subject: [PATCH 27/63] platform/x86: intel_pmt_crashlog: Constify static
 attribute_group struct

mainline inclusion
from mainline-v5.14-rc1
commit d24023e375704860c6c8b91c3af3034669aa1bc5
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit d24023e37570 platform/x86: intel_pmt_crashlog: Constify
static attribute_group struct.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

The only use of pmt_crashlog_group is to assign its address to the
attr_grp field in the intel_pmt_namespace struct, which is a pointer to
const attribute_group. Make it const to allow the compiler to put it in
read-only memory.

Signed-off-by: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Link: https://lore.kernel.org/r/20210605203807.60547-3-rikard.falkeborn@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_crashlog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c
index 92d315a16cfd..56963ceb6345 100644
--- a/drivers/platform/x86/intel_pmt_crashlog.c
+++ b/drivers/platform/x86/intel_pmt_crashlog.c
@@ -218,7 +218,7 @@ static struct attribute *pmt_crashlog_attrs[] = {
 	NULL
 };
 
-static struct attribute_group pmt_crashlog_group = {
+static const struct attribute_group pmt_crashlog_group = {
 	.attrs	= pmt_crashlog_attrs,
 };
 
-- 
Gitee


From a4a2d906de5f6d830ee00690c119f1c99c9d91ec Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Tue, 17 Aug 2021 15:40:17 -0700
Subject: [PATCH 28/63] platform/x86: intel_pmt_telemetry: Ignore zero sized
 entries

mainline inclusion
from mainline-v5.15-rc1
commit ef195e8a7f43924b9979b2cd81ac7fa54f24bb3c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596K9
CVE: NA

Intel-SIG: commit ef195e8a7f43 platform/x86: intel_pmt_telemetry: Ignore zero
sized entries.
Backport for intel PMT (Platform Monitoring Technology) support

--------------------------------

Some devices may expose non-functioning entries that are reserved for
future use. These entries have zero size. Ignore them during probe.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20210817224018.1013192-5-david.e.box@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: yingbao jia <yingbao.jia@intel.com>
Signed-off-by: Jun Tian <jun.j.tian@intel.com>
---
 drivers/platform/x86/intel_pmt_telemetry.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c
index 9b95ef050457..9f845e70a1f8 100644
--- a/drivers/platform/x86/intel_pmt_telemetry.c
+++ b/drivers/platform/x86/intel_pmt_telemetry.c
@@ -61,6 +61,14 @@ static int pmt_telem_header_decode(struct intel_pmt_entry *entry,
 	/* Size is measured in DWORDS, but accessor returns bytes */
 	header->size = TELEM_SIZE(readl(disc_table));
 
+	/*
+	 * Some devices may expose non-functioning entries that are
+	 * reserved for future use. They have zero size. Do not fail
+	 * probe for these. Just ignore them.
+	 */
+	if (header->size == 0)
+		return 1;
+
 	return 0;
 }
 
-- 
Gitee


From 435e8cd7b681337e3f8e28b60f59e2213bdb676a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 13 Nov 2020 00:01:17 +0200
Subject: [PATCH 29/63] x86/mm: Signal SIGSEGV with PF_SGX

mainline inclusion
from mainline-5.11
commit 74faeee06db81a06add0def6a394210c8fef0ab7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 74faeee06db8 x86/mm: Signal SIGSEGV with PF_SGX.
Backport for SGX Foundations support

--------------------------------

The x86 architecture has a set of page fault error codes.  These indicate
things like whether the fault occurred from a write, or whether it
originated in userspace.

The SGX hardware architecture has its own per-page memory management
metadata (EPCM) [*] and hardware which is separate from the normal x86 MMU.
The architecture has a new page fault error code: PF_SGX.  This new error
code bit is set whenever a page fault occurs as the result of the SGX MMU.

These faults occur for a variety of reasons.  For instance, an access
attempt to enclave memory from outside the enclave causes a PF_SGX fault.
PF_SGX would also be set for permission conflicts, such as if a write to an
enclave page occurs and the page is marked read-write in the x86 page
tables but is read-only in the EPCM.

These faults do not always indicate errors, though.  SGX pages are
encrypted with a key that is destroyed at hardware reset, including
suspend. Throwing a SIGSEGV allows user space software to react and recover
when these events occur.

Include PF_SGX in the PF error codes list and throw SIGSEGV when it is
encountered.

[*] Intel SDM: 36.5.1 Enclave Page Cache Map (EPCM)

 [ bp: Add bit 15 to the comment above enum x86_pf_error_code too. ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-7-jarkko@kernel.org
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/trap_pf.h |  2 ++
 arch/x86/mm/fault.c            | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 305bc1214aef..10b1de500ab1 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
  *   bit 3 ==				1: use of reserved bit detected
  *   bit 4 ==				1: fault was an instruction fetch
  *   bit 5 ==				1: protection keys block access
+ *   bit 15 ==				1: SGX MMU page-fault
  */
 enum x86_pf_error_code {
 	X86_PF_PROT	=		1 << 0,
@@ -19,6 +20,7 @@ enum x86_pf_error_code {
 	X86_PF_RSVD	=		1 << 3,
 	X86_PF_INSTR	=		1 << 4,
 	X86_PF_PK	=		1 << 5,
+	X86_PF_SGX	=		1 << 15,
 };
 
 #endif /* _ASM_X86_TRAP_PF_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 34112c63b347..058ff3f6944c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1123,6 +1123,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	if (error_code & X86_PF_PK)
 		return 1;
 
+	/*
+	 * SGX hardware blocked the access.  This usually happens
+	 * when the enclave memory contents have been destroyed, like
+	 * after a suspend/resume cycle. In any case, the kernel can't
+	 * fix the cause of the fault.  Handle the fault as an access
+	 * error even in cases where no actual access violation
+	 * occurred.  This allows userspace to rebuild the enclave in
+	 * response to the signal.
+	 */
+	if (unlikely(error_code & X86_PF_SGX))
+		return 1;
+
 	/*
 	 * Make sure to check the VMA so that we do not perform
 	 * faults just to hit a X86_PF_PK as soon as we fill in a
-- 
Gitee


From 541221e59205f548173bde5786c9bbb01c1675f8 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Fri, 19 Mar 2021 20:22:17 +1300
Subject: [PATCH 30/63] x86/cpufeatures: Make SGX_LC feature bit depend on SGX
 bit

mainline inclusion
from mainline-5.13
commit e9a15a40e857fc6ccfbb05fec7b184e9003057df
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit e9a15a40e857 x86/cpufeatures: Make SGX_LC feature bit
depend on SGX bit.
Backport for SGX virtualization support

--------------------------------

Move SGX_LC feature bit to CPUID dependency table to make clearing all
SGX feature bits easier. Also remove clear_sgx_caps() since it is just
a wrapper of setup_clear_cpu_cap(X86_FEATURE_SGX) now.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/5d4220fd0a39f52af024d3fa166231c1d498dd10.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/cpuid-deps.c |  1 +
 arch/x86/kernel/cpu/feat_ctl.c   | 12 ++----
 find_commits_in_repo.sh          | 73 ++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 9 deletions(-)
 create mode 100755 find_commits_in_repo.sh

diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d502241995a3..b34632ea6d70 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -71,6 +71,7 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_AVX512_BF16,		X86_FEATURE_AVX512VL  },
 	{ X86_FEATURE_ENQCMD,			X86_FEATURE_XSAVES    },
 	{ X86_FEATURE_PER_THREAD_MBA,		X86_FEATURE_MBA       },
+	{ X86_FEATURE_SGX_LC,			X86_FEATURE_SGX	      },
 	{}
 };
 
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 3b1b01f2b248..27533a6e04fa 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
 }
 #endif /* CONFIG_X86_VMX_FEATURE_NAMES */
 
-static void clear_sgx_caps(void)
-{
-	setup_clear_cpu_cap(X86_FEATURE_SGX);
-	setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
 static int __init nosgx(char *str)
 {
-	clear_sgx_caps();
+	setup_clear_cpu_cap(X86_FEATURE_SGX);
 
 	return 0;
 }
@@ -116,7 +110,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 
 	if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
 		clear_cpu_cap(c, X86_FEATURE_VMX);
-		clear_sgx_caps();
+		clear_cpu_cap(c, X86_FEATURE_SGX);
 		return;
 	}
 
@@ -177,6 +171,6 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 	    !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
 		if (enable_sgx)
 			pr_err_once("SGX disabled by BIOS\n");
-		clear_sgx_caps();
+		clear_cpu_cap(c, X86_FEATURE_SGX);
 	}
 }
diff --git a/find_commits_in_repo.sh b/find_commits_in_repo.sh
new file mode 100755
index 000000000000..6527d10ae53e
--- /dev/null
+++ b/find_commits_in_repo.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+#set -x
+
+PATCH_DIR=/local/work/SGX/SGX-host-virt-Backport-UniCloud-20220317
+PATCH_DIR_MISS="${PATCH_DIR}.miss"
+REPO_DIR="$(pwd)"
+
+function copy_missing_patches()
+{
+	local missing_commits=$1
+
+	rm -rf ${PATCH_DIR_MISS}
+	mkdir -p ${PATCH_DIR_MISS}
+
+	OLDIFS=$IFS
+	IFS=$'\n'
+	for i in $(echo -e "$missing_commits"); do
+		local p=$(echo $i | cut -d':' -f1)
+		cp ${PATCH_DIR}/${p} ${PATCH_DIR_MISS}
+	done
+	IFS=$OLDIFS
+}
+
+function main()
+{
+	local line=""
+	local missing_commits=""
+	local commit_list="$(grep -o "^commit .\+ upstream" ${PATCH_DIR}/*)"
+	local total_commit_nr=0
+	local find_commit_nr=0
+
+	cd $REPO_DIR
+	echo "Find commits:"
+	echo "----------------------------------------+------------+--------------------"
+	echo "Old Commit                              | New Commit | Subject"
+	echo "----------------------------------------|------------|--------------------"
+	OLDIFS=$IFS
+	IFS=$'\n'
+	for i in $commit_list; do
+		local c=$(echo $i | cut -f2 -d" ")
+		(( ++total_commit_nr ))
+		if git merge-base --is-ancestor $c HEAD 2>/dev/null ; then
+			# Current branch contains the commit directly
+			line="$(git log $c -1 --oneline)"
+		else
+			# The commit might have been backported to current branch
+			line="$(git log --grep=$c --oneline)"
+		fi
+		if [ -n "$line" ]; then
+			(( ++find_commit_nr ))
+			# Replace the first space to "|"
+			line=${line/ /|}
+			echo "$c|$line"
+		else
+			missing_commits="${missing_commits}\n$(basename ${i})"
+		fi
+	done
+	IFS=$OLDIFS
+
+	local missing_commit_nr=$(( $total_commit_nr - $find_commit_nr ))
+	echo -e "\nMissing commits (${missing_commit_nr}/${total_commit_nr}):"
+	echo -n "-------------------------------"
+	echo -e "$missing_commits"
+
+	read -p "Do you want to copy the missing patches to [${PATCH_DIR_MISS}]? [y/N]" answer
+	if [ X"$answer" == X"y" ]; then
+		copy_missing_patches "$missing_commits"
+	else
+		echo "Do nothing"
+	fi
+}
+
+main "$*"
-- 
Gitee


From 89bd6e9476ea349064f84d9afff06464708bf96d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 19 Mar 2021 20:22:18 +1300
Subject: [PATCH 31/63] x86/cpufeatures: Add SGX1 and SGX2 sub-features

mainline inclusion
from mainline-5.13
commit b8921dccf3b25798409d35155b5d127085de72c2
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b8921dccf3b2 x86/cpufeatures: Add SGX1 and SGX2 sub-features.
Backport for SGX virtualization support

--------------------------------

Add SGX1 and SGX2 feature flags, via CPUID.0x12.0x0.EAX, as scattered
features, since adding a new leaf for only two bits would be wasteful.
As part of virtualizing SGX, KVM will expose the SGX CPUID leafs to its
guest, and to do so correctly needs to query hardware and kernel support
for SGX1 and SGX2.

Suppress both SGX1 and SGX2 from /proc/cpuinfo. SGX1 basically means
SGX, and for SGX2 there is no concrete use case of using it in
/proc/cpuinfo.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/d787827dbfca6b3210ac3e432e3ac1202727e786.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/cpufeatures.h | 2 ++
 arch/x86/kernel/cpu/cpuid-deps.c   | 2 ++
 arch/x86/kernel/cpu/scattered.c    | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1e9b13636f17..5fd7c396e5eb 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -291,6 +291,8 @@
 #define X86_FEATURE_FENCE_SWAPGS_KERNEL	(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
 #define X86_FEATURE_SPLIT_LOCK_DETECT	(11*32+ 6) /* #AC for split lock */
 #define X86_FEATURE_PER_THREAD_MBA	(11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1		(11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2		(11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b34632ea6d70..28c75066815c 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,8 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_ENQCMD,			X86_FEATURE_XSAVES    },
 	{ X86_FEATURE_PER_THREAD_MBA,		X86_FEATURE_MBA       },
 	{ X86_FEATURE_SGX_LC,			X86_FEATURE_SGX	      },
+	{ X86_FEATURE_SGX1,			X86_FEATURE_SGX       },
+	{ X86_FEATURE_SGX2,			X86_FEATURE_SGX1      },
 	{}
 };
 
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 866c9a9bcdee..839b54a08e09 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_CDP_L2,		CPUID_ECX,  2, 0x00000010, 2 },
 	{ X86_FEATURE_MBA,		CPUID_EBX,  3, 0x00000010, 0 },
 	{ X86_FEATURE_PER_THREAD_MBA,	CPUID_ECX,  0, 0x00000010, 3 },
+	{ X86_FEATURE_SGX1,		CPUID_EAX,  0, 0x00000012, 0 },
+	{ X86_FEATURE_SGX2,		CPUID_EAX,  1, 0x00000012, 0 },
 	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
 	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
-- 
Gitee


From 58fdbfbabc3d4f8da0b64f9648b6415aafc46d27 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Thu, 25 Mar 2021 22:30:57 +1300
Subject: [PATCH 32/63] x86/sgx: Wipe out EREMOVE from sgx_free_epc_page()

mainline inclusion
from mainline-5.13
commit b0c7459be0670fabe080e30906ba9fe62df5e02c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b0c7459be067 x86/sgx: Wipe out EREMOVE from
sgx_free_epc_page().
Backport for SGX virtualization support

--------------------------------

EREMOVE takes a page and removes any association between that page and
an enclave. It must be run on a page before it can be added into another
enclave. Currently, EREMOVE is run as part of pages being freed into the
SGX page allocator. It is not expected to fail, as it would indicate a
use-after-free of EPC pages. Rather than add the page back to the pool
of available EPC pages, the kernel intentionally leaks the page to avoid
additional errors in the future.

However, KVM does not track how guest pages are used, which means that
SGX virtualization use of EREMOVE might fail. Specifically, it is
legitimate that EREMOVE returns SGX_CHILD_PRESENT for EPC assigned to
KVM guest, because KVM/kernel doesn't track SECS pages.

To allow SGX/KVM to introduce a more permissive EREMOVE helper and
to let the SGX virtualization code use the allocator directly, break
out the EREMOVE call from the SGX page allocator. Rename the original
sgx_free_epc_page() to sgx_encl_free_epc_page(), indicating that
it is used to free an EPC page assigned to a host enclave. Replace
sgx_free_epc_page() with sgx_encl_free_epc_page() in all call sites so
there's no functional change.

At the same time, improve the error message when EREMOVE fails, and
add documentation to explain to the user what that failure means and
to suggest to the user what to do when this bug happens in the case it
happens.

 [ bp: Massage commit message, fix typos and sanitize text, simplify. ]

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/20210325093057.122834-1-kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/x86/sgx.rst       | 25 +++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/encl.c  | 31 ++++++++++++++++++++++++++-----
 arch/x86/kernel/cpu/sgx/encl.h  |  1 +
 arch/x86/kernel/cpu/sgx/ioctl.c |  6 +++---
 arch/x86/kernel/cpu/sgx/main.c  | 14 +++++---------
 arch/x86/kernel/cpu/sgx/sgx.h   |  4 ++++
 6 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
index eaee1368b4fd..f90076e67cde 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -209,3 +209,28 @@ An application may be loaded into a container enclave which is specially
 configured with a library OS and run-time which permits the application to run.
 The enclave run-time and library OS work together to execute the application
 when a thread enters the enclave.
+
+Impact of Potential Kernel SGX Bugs
+===================================
+
+EPC leaks
+---------
+
+When EPC page leaks happen, a WARNING like this is shown in dmesg:
+
+"EREMOVE returned ... and an EPC page was leaked.  SGX may become unusable..."
+
+This is effectively a kernel use-after-free of an EPC page, and due
+to the way SGX works, the bug is detected at freeing. Rather than
+adding the page back to the pool of available EPC pages, the kernel
+intentionally leaks the page to avoid additional errors in the future.
+
+When this happens, the kernel will likely soon leak more EPC pages, and
+SGX will likely become unusable because the memory available to SGX is
+limited. However, while this may be fatal to SGX, the rest of the kernel
+is unlikely to be impacted and should continue to work.
+
+As a result, when this happpens, user should stop running any new
+SGX workloads, (or just any new workloads), and migrate all valuable
+workloads. Although a machine reboot can recover all EPC memory, the bug
+should be reported to Linux developers.
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 97fb7efce224..3be203297988 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
 
 	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
 	if (ret) {
-		sgx_free_epc_page(epc_page);
+		sgx_encl_free_epc_page(epc_page);
 		return ERR_PTR(ret);
 	}
 
@@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref)
 			if (sgx_unmark_page_reclaimable(entry->epc_page))
 				continue;
 
-			sgx_free_epc_page(entry->epc_page);
+			sgx_encl_free_epc_page(entry->epc_page);
 			encl->secs_child_cnt--;
 			entry->epc_page = NULL;
 		}
@@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref)
 	xa_destroy(&encl->page_array);
 
 	if (!encl->secs_child_cnt && encl->secs.epc_page) {
-		sgx_free_epc_page(encl->secs.epc_page);
+		sgx_encl_free_epc_page(encl->secs.epc_page);
 		encl->secs.epc_page = NULL;
 	}
 
@@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref)
 		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
 					   list);
 		list_del(&va_page->list);
-		sgx_free_epc_page(va_page->epc_page);
+		sgx_encl_free_epc_page(va_page->epc_page);
 		kfree(va_page);
 	}
 
@@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
 	ret = __epa(sgx_get_epc_virt_addr(epc_page));
 	if (ret) {
 		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
-		sgx_free_epc_page(epc_page);
+		sgx_encl_free_epc_page(epc_page);
 		return ERR_PTR(-EFAULT);
 	}
 
@@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
 
 	return slot == SGX_VA_SLOT_COUNT;
 }
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page:	EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list.  Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+	int ret;
+
+	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+	ret = __eremove(sgx_get_epc_virt_addr(page));
+	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+		return;
+
+	sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index d8d30ccbef4c..6e74f85b6264 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
 unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
 void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
 bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
 
 #endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 2e10367ea66c..354e309fcdb7 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -47,7 +47,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
 	encl->page_cnt--;
 
 	if (va_page) {
-		sgx_free_epc_page(va_page->epc_page);
+		sgx_encl_free_epc_page(va_page->epc_page);
 		list_del(&va_page->list);
 		kfree(va_page);
 	}
@@ -117,7 +117,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
 	return 0;
 
 err_out:
-	sgx_free_epc_page(encl->secs.epc_page);
+	sgx_encl_free_epc_page(encl->secs.epc_page);
 	encl->secs.epc_page = NULL;
 
 err_out_backing:
@@ -365,7 +365,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
 	mmap_read_unlock(current->mm);
 
 err_out_free:
-	sgx_free_epc_page(epc_page);
+	sgx_encl_free_epc_page(epc_page);
 	kfree(encl_page);
 
 	return ret;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 13a7599ce7d4..b227629b1e9c 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -294,7 +294,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
 
 		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
 
-		sgx_free_epc_page(encl->secs.epc_page);
+		sgx_encl_free_epc_page(encl->secs.epc_page);
 		encl->secs.epc_page = NULL;
 
 		sgx_encl_put_backing(&secs_backing, true);
@@ -609,19 +609,15 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
  * sgx_free_epc_page() - Free an EPC page
  * @page:	an EPC page
  *
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
  */
 void sgx_free_epc_page(struct sgx_epc_page *page)
 {
 	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
 	struct sgx_numa_node *node = section->node;
-	int ret;
-
-	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
-
-	ret = __eremove(sgx_get_epc_virt_addr(page));
-	if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
-		return;
 
 	spin_lock(&node->lock);
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 2a2b5c857451..65b2e88dbedd 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -13,6 +13,10 @@
 #undef pr_fmt
 #define pr_fmt(fmt) "sgx: " fmt
 
+#define EREMOVE_ERROR_MESSAGE \
+	"EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+	"Refer to Documentation/x86/sgx.rst for more information."
+
 #define SGX_MAX_EPC_SECTIONS		8
 #define SGX_EEXTEND_BLOCK_SIZE		256
 #define SGX_NR_TO_SCAN			16
-- 
Gitee


From bbcf51d333e8b315559910d0418b2d533d4d777f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 19 Mar 2021 20:22:20 +1300
Subject: [PATCH 33/63] x86/sgx: Add SGX_CHILD_PRESENT hardware error code

mainline inclusion
from mainline-5.13
commit 231d3dbdda192e3b3c7b79f4c3b0616f6c7f31b7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 231d3dbdda19 x86/sgx: Add SGX_CHILD_PRESENT hardware
error code.
Backport for SGX virtualization support

--------------------------------

SGX driver can accurately track how enclave pages are used.  This
enables SECS to be specifically targeted and EREMOVE'd only after all
child pages have been EREMOVE'd.  This ensures that SGX driver will
never encounter SGX_CHILD_PRESENT in normal operation.

Virtual EPC is different.  The host does not track how EPC pages are
used by the guest, so it cannot guarantee EREMOVE success.  It might,
for instance, encounter a SECS with a non-zero child count.

Add a definition of SGX_CHILD_PRESENT.  It will be used exclusively by
the SGX virtualization driver to handle recoverable EREMOVE errors when
saniziting EPC pages after they are freed.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/050b198e882afde7e6eba8e6a0d4da39161dbb5a.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index d96e5451d28a..14bb5f7e221c 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -31,12 +31,14 @@
  * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
  * %SGX_NOT_TRACKED:		Previous ETRACK's shootdown sequence has not
  *				been completed yet.
+ * %SGX_CHILD_PRESENT		SECS has child pages present in the EPC.
  * %SGX_INVALID_EINITTOKEN:	EINITTOKEN is invalid and enclave signer's
  *				public key does not match IA32_SGXLEPUBKEYHASH.
  * %SGX_UNMASKED_EVENT:		An unmasked event, e.g. INTR, was received
  */
 enum sgx_return_code {
 	SGX_NOT_TRACKED			= 11,
+	SGX_CHILD_PRESENT		= 13,
 	SGX_INVALID_EINITTOKEN		= 16,
 	SGX_UNMASKED_EVENT		= 128,
 };
-- 
Gitee


From 24a91cd29c0d70c5b7a643f6e62bb3017d42b6be Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 19 Mar 2021 20:22:21 +1300
Subject: [PATCH 34/63] x86/sgx: Introduce virtual EPC for use by KVM guests

mainline inclusion
from mainline-5.13
commit 540745ddbc70eabdc7dbd3fcc00fe4fb17cd59ba
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 540745ddbc70 x86/sgx: Introduce virtual EPC for use by
KVM guests.
Backport for SGX virtualization support

--------------------------------

Add a misc device /dev/sgx_vepc to allow userspace to allocate "raw"
Enclave Page Cache (EPC) without an associated enclave. The intended
and only known use case for raw EPC allocation is to expose EPC to a
KVM guest, hence the 'vepc' moniker, virt.{c,h} files and X86_SGX_KVM
Kconfig.

The SGX driver uses the misc device /dev/sgx_enclave to support
userspace in creating an enclave. Each file descriptor returned from
opening /dev/sgx_enclave represents an enclave. Unlike the SGX driver,
KVM doesn't control how the guest uses the EPC, therefore EPC allocated
to a KVM guest is not associated with an enclave, and /dev/sgx_enclave
is not suitable for allocating EPC for a KVM guest.

Having separate device nodes for the SGX driver and KVM virtual EPC also
allows separate permission control for running host SGX enclaves and KVM
SGX guests.

To use /dev/sgx_vepc to allocate a virtual EPC instance with particular
size, the hypervisor opens /dev/sgx_vepc, and uses mmap() with the
intended size to get an address range of virtual EPC. Then it may use
the address range to create one KVM memory slot as virtual EPC for
a guest.

Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
/dev/sgx_vepc rather than in KVM. Doing so has two major advantages:

  - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
    just another memory backend for guests.

  - EPC management is wholly contained in the SGX subsystem, e.g. SGX
    does not have to export any symbols, changes to reclaim flows don't
    need to be routed through KVM, SGX's dirty laundry doesn't have to
    get aired out for the world to see, and so on and so forth.

The virtual EPC pages allocated to guests are currently not reclaimable.
Reclaiming an EPC page used by enclave requires a special reclaim
mechanism separate from normal page reclaim, and that mechanism is not
supported for virutal EPC pages. Due to the complications of handling
reclaim conflicts between guest and host, reclaiming virtual EPC pages
is significantly more complex than basic support for SGX virtualization.

 [ bp:
   - Massage commit message and comments
   - use cpu_feature_enabled()
   - vertically align struct members init
   - massage Virtual EPC clarification text
   - move Kconfig prompt to Virtualization ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/0c38ced8c8e5a69872db4d6a1c0dabd01e07cad7.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/x86/sgx.rst        |  16 ++
 arch/x86/kernel/cpu/sgx/Makefile |   1 +
 arch/x86/kernel/cpu/sgx/sgx.h    |   9 ++
 arch/x86/kernel/cpu/sgx/virt.c   | 259 +++++++++++++++++++++++++++++++
 arch/x86/kvm/Kconfig             |  12 ++
 5 files changed, 297 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.c

diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
index f90076e67cde..dd0ac96ff9ef 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -234,3 +234,19 @@ As a result, when this happpens, user should stop running any new
 SGX workloads, (or just any new workloads), and migrate all valuable
 workloads. Although a machine reboot can recover all EPC memory, the bug
 should be reported to Linux developers.
+
+
+Virtual EPC
+===========
+
+The implementation has also a virtual EPC driver to support SGX enclaves
+in guests. Unlike the SGX driver, an EPC page allocated by the virtual
+EPC driver doesn't have a specific enclave associated with it. This is
+because KVM doesn't track how a guest uses EPC pages.
+
+As a result, the SGX core page reclaimer doesn't support reclaiming EPC
+pages allocated to KVM guests through the virtual EPC driver. If the
+user wants to deploy SGX applications both on the host and in guests
+on the same machine, the user should reserve enough EPC (by taking out
+total virtual EPC size of all SGX VMs from the physical EPC size) for
+host SGX applications so they can run with acceptable performance.
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..9c1656779b2a 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
 	encl.o \
 	ioctl.o \
 	main.o
+obj-$(CONFIG_X86_SGX_KVM)	+= virt.o
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 65b2e88dbedd..e4cbc71bf136 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -84,4 +84,13 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
 
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+	return -ENODEV;
+}
+#endif
+
 #endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..259cc46ad78c
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+	struct xarray page_array;
+	struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+			    struct vm_area_struct *vma, unsigned long addr)
+{
+	struct sgx_epc_page *epc_page;
+	unsigned long index, pfn;
+	int ret;
+
+	WARN_ON(!mutex_is_locked(&vepc->lock));
+
+	/* Calculate index of EPC page in virtual EPC's page_array */
+	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+	epc_page = xa_load(&vepc->page_array, index);
+	if (epc_page)
+		return 0;
+
+	epc_page = sgx_alloc_epc_page(vepc, false);
+	if (IS_ERR(epc_page))
+		return PTR_ERR(epc_page);
+
+	ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+	if (ret)
+		goto err_free;
+
+	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+	ret = vmf_insert_pfn(vma, addr, pfn);
+	if (ret != VM_FAULT_NOPAGE) {
+		ret = -EFAULT;
+		goto err_delete;
+	}
+
+	return 0;
+
+err_delete:
+	xa_erase(&vepc->page_array, index);
+err_free:
+	sgx_free_epc_page(epc_page);
+	return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct sgx_vepc *vepc = vma->vm_private_data;
+	int ret;
+
+	mutex_lock(&vepc->lock);
+	ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+	mutex_unlock(&vepc->lock);
+
+	if (!ret)
+		return VM_FAULT_NOPAGE;
+
+	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+		mmap_read_unlock(vma->vm_mm);
+		return VM_FAULT_RETRY;
+	}
+
+	return VM_FAULT_SIGBUS;
+}
+
+const struct vm_operations_struct sgx_vepc_vm_ops = {
+	.fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sgx_vepc *vepc = file->private_data;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	vma->vm_ops = &sgx_vepc_vm_ops;
+	/* Don't copy VMA in fork() */
+	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+	vma->vm_private_data = vepc;
+
+	return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+	int ret;
+
+	/*
+	 * Take a previously guest-owned EPC page and return it to the
+	 * general EPC page pool.
+	 *
+	 * Guests can not be trusted to have left this page in a good
+	 * state, so run EREMOVE on the page unconditionally.  In the
+	 * case that a guest properly EREMOVE'd this page, a superfluous
+	 * EREMOVE is harmless.
+	 */
+	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+	if (ret) {
+		/*
+		 * Only SGX_CHILD_PRESENT is expected, which is because of
+		 * EREMOVE'ing an SECS still with child, in which case it can
+		 * be handled by EREMOVE'ing the SECS again after all pages in
+		 * virtual EPC have been EREMOVE'd. See comments in below in
+		 * sgx_vepc_release().
+		 *
+		 * The user of virtual EPC (KVM) needs to guarantee there's no
+		 * logical processor is still running in the enclave in guest,
+		 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+		 * handled here.
+		 */
+		WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+			  ret, ret);
+		return ret;
+	}
+
+	sgx_free_epc_page(epc_page);
+
+	return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+	struct sgx_vepc *vepc = file->private_data;
+	struct sgx_epc_page *epc_page, *tmp, *entry;
+	unsigned long index;
+
+	LIST_HEAD(secs_pages);
+
+	xa_for_each(&vepc->page_array, index, entry) {
+		/*
+		 * Remove all normal, child pages.  sgx_vepc_free_page()
+		 * will fail if EREMOVE fails, but this is OK and expected on
+		 * SECS pages.  Those can only be EREMOVE'd *after* all their
+		 * child pages. Retries below will clean them up.
+		 */
+		if (sgx_vepc_free_page(entry))
+			continue;
+
+		xa_erase(&vepc->page_array, index);
+	}
+
+	/*
+	 * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
+	 * only had children in this 'epc' area.
+	 */
+	xa_for_each(&vepc->page_array, index, entry) {
+		epc_page = entry;
+		/*
+		 * An EREMOVE failure here means that the SECS page still
+		 * has children.  But, since all children in this 'sgx_vepc'
+		 * have been removed, the SECS page must have a child on
+		 * another instance.
+		 */
+		if (sgx_vepc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+
+		xa_erase(&vepc->page_array, index);
+	}
+
+	/*
+	 * SECS pages are "pinned" by child pages, and "unpinned" once all
+	 * children have been EREMOVE'd.  A child page in this instance
+	 * may have pinned an SECS page encountered in an earlier release(),
+	 * creating a zombie.  Since some children were EREMOVE'd above,
+	 * try to EREMOVE all zombies in the hopes that one was unpinned.
+	 */
+	mutex_lock(&zombie_secs_pages_lock);
+	list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+		/*
+		 * Speculatively remove the page from the list of zombies,
+		 * if the page is successfully EREMOVE'd it will be added to
+		 * the list of free pages.  If EREMOVE fails, throw the page
+		 * on the local list, which will be spliced on at the end.
+		 */
+		list_del(&epc_page->list);
+
+		if (sgx_vepc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+	}
+
+	if (!list_empty(&secs_pages))
+		list_splice_tail(&secs_pages, &zombie_secs_pages);
+	mutex_unlock(&zombie_secs_pages_lock);
+
+	kfree(vepc);
+
+	return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+	struct sgx_vepc *vepc;
+
+	vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+	if (!vepc)
+		return -ENOMEM;
+	mutex_init(&vepc->lock);
+	xa_init(&vepc->page_array);
+
+	file->private_data = vepc;
+
+	return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= sgx_vepc_open,
+	.release	= sgx_vepc_release,
+	.mmap		= sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "sgx_vepc",
+	.nodename	= "sgx_vepc",
+	.fops		= &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+	/* SGX virtualization requires KVM to work */
+	if (!cpu_feature_enabled(X86_FEATURE_VMX))
+		return -ENODEV;
+
+	INIT_LIST_HEAD(&zombie_secs_pages);
+	mutex_init(&zombie_secs_pages_lock);
+
+	return misc_register(&sgx_vepc_dev);
+}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f92dfd8ef10d..71fb38d83e4a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,6 +84,18 @@ config KVM_INTEL
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-intel.
 
+config X86_SGX_KVM
+	bool "Software Guard eXtensions (SGX) Virtualization"
+	depends on X86_SGX && KVM_INTEL
+	help
+
+	  Enables KVM guests to create SGX enclaves.
+
+	  This includes support to expose "raw" unreclaimable enclave memory to
+	  guests via a device node, e.g. /dev/sgx_vepc.
+
+	  If unsure, say N.
+
 config KVM_AMD
 	tristate "KVM for AMD processors support"
 	depends on KVM
-- 
Gitee


From f0834b90c77a634811113b811dc2ae8e5648d321 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:22:58 +1300
Subject: [PATCH 35/63] x86/cpu/intel: Allow SGX virtualization without Launch
 Control support

mainline inclusion
from mainline-5.13
commit 332bfc7becf479de8a55864cc5ed0024baea28aa
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 332bfc7becf4 x86/cpu/intel: Allow SGX virtualization
without Launch Control support.
Backport for SGX virtualization support

--------------------------------

The kernel will currently disable all SGX support if the hardware does
not support launch control.  Make it more permissive to allow SGX
virtualization on systems without Launch Control support.  This will
allow KVM to expose SGX to guests that have less-strict requirements on
the availability of flexible launch control.

Improve error message to distinguish between three cases.  There are two
cases where SGX support is completely disabled:
1) SGX has been disabled completely by the BIOS
2) SGX LC is locked by the BIOS.  Bare-metal support is disabled because
   of LC unavailability.  SGX virtualization is unavailable (because of
   Kconfig).
One where it is partially available:
3) SGX LC is locked by the BIOS.  Bare-metal support is disabled because
   of LC unavailability.  SGX virtualization is supported.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/b3329777076509b3b601550da288c8f3c406a865.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/feat_ctl.c | 59 +++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 27533a6e04fa..da696eb4821a 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -104,8 +104,9 @@ early_param("nosgx", nosgx);
 
 void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 {
+	bool enable_sgx_kvm = false, enable_sgx_driver = false;
 	bool tboot = tboot_enabled();
-	bool enable_sgx;
+	bool enable_vmx;
 	u64 msr;
 
 	if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
@@ -114,13 +115,19 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 		return;
 	}
 
-	/*
-	 * Enable SGX if and only if the kernel supports SGX and Launch Control
-	 * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
-	 */
-	enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
-		     cpu_has(c, X86_FEATURE_SGX_LC) &&
-		     IS_ENABLED(CONFIG_X86_SGX);
+	enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+		     IS_ENABLED(CONFIG_KVM_INTEL);
+
+	if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+		/*
+		 * Separate out SGX driver enabling from KVM.  This allows KVM
+		 * guests to use SGX even if the kernel SGX driver refuses to
+		 * use it.  This happens if flexible Launch Control is not
+		 * available.
+		 */
+		enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+		enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+	}
 
 	if (msr & FEAT_CTL_LOCKED)
 		goto update_caps;
@@ -136,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 	 * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
 	 * for the kernel, e.g. using VMX to hide malicious code.
 	 */
-	if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+	if (enable_vmx) {
 		msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 
 		if (tboot)
 			msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
 	}
 
-	if (enable_sgx)
-		msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+	if (enable_sgx_kvm || enable_sgx_driver) {
+		msr |= FEAT_CTL_SGX_ENABLED;
+		if (enable_sgx_driver)
+			msr |= FEAT_CTL_SGX_LC_ENABLED;
+	}
 
 	wrmsrl(MSR_IA32_FEAT_CTL, msr);
 
@@ -167,10 +177,29 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 	}
 
 update_sgx:
-	if (!(msr & FEAT_CTL_SGX_ENABLED) ||
-	    !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
-		if (enable_sgx)
-			pr_err_once("SGX disabled by BIOS\n");
+	if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+		if (enable_sgx_kvm || enable_sgx_driver)
+			pr_err_once("SGX disabled by BIOS.\n");
 		clear_cpu_cap(c, X86_FEATURE_SGX);
+		return;
+	}
+
+	/*
+	 * VMX feature bit may be cleared due to being disabled in BIOS,
+	 * in which case SGX virtualization cannot be supported either.
+	 */
+	if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+		pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+		enable_sgx_kvm = 0;
+	}
+
+	if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+		if (!enable_sgx_kvm) {
+			pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+			clear_cpu_cap(c, X86_FEATURE_SGX);
+		} else {
+			pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+			clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+		}
 	}
 }
-- 
Gitee


From d6b7fb6968a186e8707952c1ed616fdf92075028 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Fri, 19 Mar 2021 20:23:02 +1300
Subject: [PATCH 36/63] x86/sgx: Initialize virtual EPC driver even when SGX
 driver is disabled

mainline inclusion
from mainline-5.13
commit faa7d3e6f3b983a28bf0f88f82dcb1c162e61105
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit faa7d3e6f3b9 x86/sgx: Initialize virtual EPC driver
even when SGX driver is disabled.
Backport for SGX virtualization support

--------------------------------

Modify sgx_init() to always try to initialize the virtual EPC driver,
even if the SGX driver is disabled.  The SGX driver might be disabled
if SGX Launch Control is in locked mode, or not supported in the
hardware at all.  This allows (non-Linux) guests that support non-LC
configurations to use SGX.

 [ bp: De-silli-fy the test. ]

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/d35d17a02bbf8feef83a536cec8b43746d4ea557.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index b227629b1e9c..1c8a228b0104 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -743,8 +743,17 @@ static int __init sgx_init(void)
 		goto err_page_cache;
 	}
 
+	/*
+	 * Always try to initialize the native *and* KVM drivers.
+	 * The KVM driver is less picky than the native one and
+	 * can function if the native one is not supported on the
+	 * current system or fails to initialize.
+	 *
+	 * Error out only if both fail to initialize.
+	 */
 	ret = sgx_drv_init();
-	if (ret)
+
+	if (sgx_vepc_init() && ret)
 		goto err_kthread;
 
 	return 0;
-- 
Gitee


From e7dbc12c12e53c214d94983cab88f97e022209d2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:04 +1300
Subject: [PATCH 37/63] x86/sgx: Move ENCLS leaf definitions to sgx.h

mainline inclusion
from mainline-5.13
commit 9c55c78a73ce6e62a1d46ba6e4f242c23c29b812
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 9c55c78a73ce x86/sgx: Move ENCLS leaf definitions to sgx.h.
Backport for SGX virtualization support

--------------------------------

Move the ENCLS leaf definitions to sgx.h so that they can be used by
KVM.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/2e6cd7c5c1ced620cfcd292c3c6c382827fde6b2.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h      | 15 +++++++++++++++
 arch/x86/kernel/cpu/sgx/encls.h | 15 ---------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 14bb5f7e221c..34f44238d1d1 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -27,6 +27,21 @@
 /* The bitmask for the EPC section type. */
 #define SGX_CPUID_EPC_MASK	GENMASK(3, 0)
 
+enum sgx_encls_function {
+	ECREATE	= 0x00,
+	EADD	= 0x01,
+	EINIT	= 0x02,
+	EREMOVE	= 0x03,
+	EDGBRD	= 0x04,
+	EDGBWR	= 0x05,
+	EEXTEND	= 0x06,
+	ELDU	= 0x08,
+	EBLOCK	= 0x09,
+	EPA	= 0x0A,
+	EWB	= 0x0B,
+	ETRACK	= 0x0C,
+};
+
 /**
  * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
  * %SGX_NOT_TRACKED:		Previous ETRACK's shootdown sequence has not
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 443188fe7e70..be5c49689980 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,21 +11,6 @@
 #include <asm/traps.h>
 #include "sgx.h"
 
-enum sgx_encls_function {
-	ECREATE	= 0x00,
-	EADD	= 0x01,
-	EINIT	= 0x02,
-	EREMOVE	= 0x03,
-	EDGBRD	= 0x04,
-	EDGBWR	= 0x05,
-	EEXTEND	= 0x06,
-	ELDU	= 0x08,
-	EBLOCK	= 0x09,
-	EPA	= 0x0A,
-	EWB	= 0x0B,
-	ETRACK	= 0x0C,
-};
-
 /**
  * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
  *
-- 
Gitee


From 13ce182d16ef2aca3cfc9db6b266a4246661e75e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:05 +1300
Subject: [PATCH 38/63] x86/sgx: Add SGX2 ENCLS leaf definitions (EAUG, EMODPR
 and EMODT)

mainline inclusion
from mainline-5.13
commit 32ddda8e445df3de477db14d386fb3518042224a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 32ddda8e445d x86/sgx: Add SGX2 ENCLS leaf definitions
(EAUG, EMODPR and EMODT).
Backport for SGX virtualization support

--------------------------------

Define the ENCLS leafs that are available with SGX2, also referred to as
Enclave Dynamic Memory Management (EDMM).  The leafs will be used by KVM
to conditionally expose SGX2 capabilities to guests.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/5f0970c251ebcc6d5add132f0d750cc753b7060f.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 34f44238d1d1..3b025afec0a7 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -40,6 +40,9 @@ enum sgx_encls_function {
 	EPA	= 0x0A,
 	EWB	= 0x0B,
 	ETRACK	= 0x0C,
+	EAUG	= 0x0D,
+	EMODPR	= 0x0E,
+	EMODT	= 0x0F,
 };
 
 /**
-- 
Gitee


From 923146306d5774f1e792c10ddbc666331f952f94 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:06 +1300
Subject: [PATCH 39/63] x86/sgx: Add encls_faulted() helper

mainline inclusion
from mainline-5.13
commit a67136b458e5e63822b19c35794451122fe2bf3e
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit a67136b458e5 x86/sgx: Add encls_faulted() helper.
Backport for SGX virtualization support

--------------------------------

Add a helper to extract the fault indicator from an encoded ENCLS return
value.  SGX virtualization will also need to detect ENCLS faults.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/c1f955898110de2f669da536fc6cf62e003dff88.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/encls.h | 15 ++++++++++++++-
 arch/x86/kernel/cpu/sgx/ioctl.c |  2 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index be5c49689980..9b204843b78d 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -40,6 +40,19 @@
 	} while (0);							  \
 }
 
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret:	the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true:	ENCLS leaf faulted.
+ * - false:	Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+	return ret & ENCLS_FAULT_FLAG;
+}
+
 /**
  * encls_failed() - Check if an ENCLS function failed
  * @ret:	the return value of an ENCLS function call
@@ -50,7 +63,7 @@
  */
 static inline bool encls_failed(int ret)
 {
-	if (ret & ENCLS_FAULT_FLAG)
+	if (encls_faulted(ret))
 		return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
 
 	return !!ret;
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 354e309fcdb7..11e3f9635c24 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 		}
 	}
 
-	if (ret & ENCLS_FAULT_FLAG) {
+	if (encls_faulted(ret)) {
 		if (encls_failed(ret))
 			ENCLS_WARN(ret, "EINIT");
 
-- 
Gitee


From ea60a916d1bed960489396ac811e248eb139c1f2 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Fri, 19 Mar 2021 20:23:07 +1300
Subject: [PATCH 40/63] x86/sgx: Add helper to update SGX_LEPUBKEYHASHn MSRs

mainline inclusion
from mainline-5.13
commit 73916b6a0c714258f9c2619408a66c6696a761a7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 73916b6a0c71 x86/sgx: Add helper to update
SGX_LEPUBKEYHASHn MSRs.
Backport for SGX virtualization support

--------------------------------

Add a helper to update SGX_LEPUBKEYHASHn MSRs.  SGX virtualization also
needs to update those MSRs based on guest's "virtual" SGX_LEPUBKEYHASHn
before EINIT from guest.

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/dfb7cd39d4dd62ea27703b64afdd8bccb579f623.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/ioctl.c |  5 ++---
 arch/x86/kernel/cpu/sgx/main.c  | 16 ++++++++++++++++
 arch/x86/kernel/cpu/sgx/sgx.h   |  2 ++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 11e3f9635c24..7be9c064a640 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -495,7 +495,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 			 void *token)
 {
 	u64 mrsigner[4];
-	int i, j, k;
+	int i, j;
 	void *addr;
 	int ret;
 
@@ -544,8 +544,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 
 			preempt_disable();
 
-			for (k = 0; k < 4; k++)
-				wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+			sgx_update_lepubkeyhash(mrsigner);
 
 			ret = __einit(sigstruct, token, addr);
 
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 1c8a228b0104..227f1e2ad9cf 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -727,6 +727,22 @@ static bool __init sgx_page_cache_init(void)
 	return true;
 }
 
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+	int i;
+
+	WARN_ON_ONCE(preemptible());
+
+	for (i = 0; i < 4; i++)
+		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
 static int __init sgx_init(void)
 {
 	int ret;
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index e4cbc71bf136..4628acec0009 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -93,4 +93,6 @@ static inline int __init sgx_vepc_init(void)
 }
 #endif
 
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
 #endif /* _X86_SGX_H */
-- 
Gitee


From 1a3a6d3b4cf97568849b9fc71028f948f4fe5c9d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:08 +1300
Subject: [PATCH 41/63] x86/sgx: Add helpers to expose ECREATE and EINIT to KVM

mainline inclusion
from mainline-5.13
commit d155030b1e7c0e448aab22a803f7a71ea2e117d7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit d155030b1e7c x86/sgx: Add helpers to expose ECREATE and
EINIT to KVM.
Backport for SGX virtualization support

--------------------------------

The host kernel must intercept ECREATE to impose policies on guests, and
intercept EINIT to be able to write guest's virtual SGX_LEPUBKEYHASH MSR
values to hardware before running guest's EINIT so it can run correctly
according to hardware behavior.

Provide wrappers around __ecreate() and __einit() to hide the ugliness
of overloading the ENCLS return value to encode multiple error formats
in a single int.  KVM will trap-and-execute ECREATE and EINIT as part
of SGX virtualization, and reflect ENCLS execution result to guest by
setting up guest's GPRs, or on an exception, injecting the correct fault
based on return value of __ecreate() and __einit().

Use host userspace addresses (provided by KVM based on guest physical
address of ENCLS parameters) to execute ENCLS/EINIT when possible.
Accesses to both EPC and memory originating from ENCLS are subject to
segmentation and paging mechanisms.  It's also possible to generate
kernel mappings for ENCLS parameters by resolving PFN but using
__uaccess_xx() is simpler.

 [ bp: Return early if the __user memory accesses fail, use
   cpu_feature_enabled(). ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/20e09daf559aa5e9e680a0b4b5fba940f1bad86e.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h     |   7 ++
 arch/x86/kernel/cpu/sgx/virt.c | 117 +++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 3b025afec0a7..954042e04102 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -365,4 +365,11 @@ struct sgx_sigstruct {
  * comment!
  */
 
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+		     int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+		   void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
 #endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 259cc46ad78c..7d221eac716a 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -257,3 +257,120 @@ int __init sgx_vepc_init(void)
 
 	return misc_register(&sgx_vepc_dev);
 }
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo:	Pointer to PAGEINFO structure
+ * @secs:	Userspace pointer to SECS page
+ * @trapnr:	trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * -  0:	ECREATE was successful.
+ * - <0:	on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+		     int *trapnr)
+{
+	int ret;
+
+	/*
+	 * @secs is an untrusted, userspace-provided address.  It comes from
+	 * KVM and is assumed to be a valid pointer which points somewhere in
+	 * userspace.  This can fault and call SGX or other fault handlers when
+	 * userspace mapping @secs doesn't exist.
+	 *
+	 * Add a WARN() to make sure @secs is already valid userspace pointer
+	 * from caller (KVM), who should already have handled invalid pointer
+	 * case (for instance, made by malicious guest).  All other checks,
+	 * such as alignment of @secs, are deferred to ENCLS itself.
+	 */
+	if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+		return -EINVAL;
+
+	__uaccess_begin();
+	ret = __ecreate(pageinfo, (void *)secs);
+	__uaccess_end();
+
+	if (encls_faulted(ret)) {
+		*trapnr = ENCLS_TRAPNR(ret);
+		return -EFAULT;
+	}
+
+	/* ECREATE doesn't return an error code, it faults or succeeds. */
+	WARN_ON_ONCE(ret);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+			    void __user *secs)
+{
+	int ret;
+
+	/*
+	 * Make sure all userspace pointers from caller (KVM) are valid.
+	 * All other checks deferred to ENCLS itself.  Also see comment
+	 * for @secs in sgx_virt_ecreate().
+	 */
+#define SGX_EINITTOKEN_SIZE	304
+	if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+			 !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+			 !access_ok(secs, PAGE_SIZE)))
+		return -EINVAL;
+
+	__uaccess_begin();
+	ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+	__uaccess_end();
+
+	return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct:		Userspace pointer to SIGSTRUCT structure
+ * @token:		Userspace pointer to EINITTOKEN structure
+ * @secs:		Userspace pointer to SECS page
+ * @lepubkeyhash:	Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr:		trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * -  0:	EINIT was successful.
+ * - <0:	on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+		   void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+	int ret;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+		ret = __sgx_virt_einit(sigstruct, token, secs);
+	} else {
+		preempt_disable();
+
+		sgx_update_lepubkeyhash(lepubkeyhash);
+
+		ret = __sgx_virt_einit(sigstruct, token, secs);
+		preempt_enable();
+	}
+
+	/* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+	if (ret == -EINVAL)
+		return ret;
+
+	if (encls_faulted(ret)) {
+		*trapnr = ENCLS_TRAPNR(ret);
+		return -EFAULT;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
-- 
Gitee


From 3cf59f5eaa117197f420426530a3f088c184f7a3 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 19 Mar 2021 20:23:09 +1300
Subject: [PATCH 42/63] x86/sgx: Move provisioning device creation out of SGX
 driver

mainline inclusion
from mainline-5.13
commit b3754e5d3da320af2bebb7a690002685c7f5c15c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b3754e5d3da3 x86/sgx: Move provisioning device creation
out of SGX driver.
Backport for SGX virtualization support

--------------------------------

And extract sgx_set_attribute() out of sgx_ioc_enclave_provision() and
export it as symbol for KVM to use.

The provisioning key is sensitive. The SGX driver only allows to create
an enclave which can access the provisioning key when the enclave
creator has permission to open /dev/sgx_provision. It should apply to
a VM as well, as the provisioning key is platform-specific, thus an
unrestricted VM can also potentially compromise the provisioning key.

Move the provisioning device creation out of sgx_drv_init() to
sgx_init() as a preparation for adding SGX virtualization support,
so that even if the SGX driver is not enabled due to flexible launch
control not being available, SGX virtualization can still be enabled,
and use it to restrict a VM's capability of being able to access the
provisioning key.

 [ bp: Massage commit message. ]

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/0f4d044d621561f26d5f4ef73e8dc6cd18cc7e79.1616136308.git.kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/sgx.h       |  3 ++
 arch/x86/kernel/cpu/sgx/driver.c | 17 ----------
 arch/x86/kernel/cpu/sgx/ioctl.c  | 16 ++-------
 arch/x86/kernel/cpu/sgx/main.c   | 57 +++++++++++++++++++++++++++++++-
 4 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 954042e04102..a16e2c9154a3 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -372,4 +372,7 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token,
 		   void __user *secs, u64 *lepubkeyhash, int *trapnr);
 #endif
 
+int sgx_set_attribute(unsigned long *allowed_attributes,
+		      unsigned int attribute_fd);
+
 #endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 8ce6d8371cfb..aa9b8b868867 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
 	.get_unmapped_area	= sgx_get_unmapped_area,
 };
 
-const struct file_operations sgx_provision_fops = {
-	.owner			= THIS_MODULE,
-};
-
 static struct miscdevice sgx_dev_enclave = {
 	.minor = MISC_DYNAMIC_MINOR,
 	.name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
 	.fops = &sgx_encl_fops,
 };
 
-static struct miscdevice sgx_dev_provision = {
-	.minor = MISC_DYNAMIC_MINOR,
-	.name = "sgx_provision",
-	.nodename = "sgx_provision",
-	.fops = &sgx_provision_fops,
-};
-
 int __init sgx_drv_init(void)
 {
 	unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
 	if (ret)
 		return ret;
 
-	ret = misc_register(&sgx_dev_provision);
-	if (ret) {
-		misc_deregister(&sgx_dev_enclave);
-		return ret;
-	}
-
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 7be9c064a640..83df20e3e633 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -2,6 +2,7 @@
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
 #include <asm/mman.h>
+#include <asm/sgx.h>
 #include <linux/mman.h>
 #include <linux/delay.h>
 #include <linux/file.h>
@@ -666,24 +667,11 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
 static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
 {
 	struct sgx_enclave_provision params;
-	struct file *file;
 
 	if (copy_from_user(&params, arg, sizeof(params)))
 		return -EFAULT;
 
-	file = fget(params.fd);
-	if (!file)
-		return -EINVAL;
-
-	if (file->f_op != &sgx_provision_fops) {
-		fput(file);
-		return -EINVAL;
-	}
-
-	encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
-	fput(file);
-	return 0;
+	return sgx_set_attribute(&encl->attributes_mask, params.fd);
 }
 
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 227f1e2ad9cf..92cb11dffd4c 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,14 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
+#include <linux/file.h>
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/miscdevice.h>
 #include <linux/pagemap.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include <asm/sgx.h>
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
@@ -743,6 +746,51 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
 		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
 }
 
+const struct file_operations sgx_provision_fops = {
+	.owner			= THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "sgx_provision",
+	.nodename = "sgx_provision",
+	.fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes:		Pointer to allowed enclave attributes
+ * @attribute_fd:		File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL:	Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+		      unsigned int attribute_fd)
+{
+	struct file *file;
+
+	file = fget(attribute_fd);
+	if (!file)
+		return -EINVAL;
+
+	if (file->f_op != &sgx_provision_fops) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+	fput(file);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
 static int __init sgx_init(void)
 {
 	int ret;
@@ -759,6 +807,10 @@ static int __init sgx_init(void)
 		goto err_page_cache;
 	}
 
+	ret = misc_register(&sgx_dev_provision);
+	if (ret)
+		goto err_kthread;
+
 	/*
 	 * Always try to initialize the native *and* KVM drivers.
 	 * The KVM driver is less picky than the native one and
@@ -770,10 +822,13 @@ static int __init sgx_init(void)
 	ret = sgx_drv_init();
 
 	if (sgx_vepc_init() && ret)
-		goto err_kthread;
+		goto err_provision;
 
 	return 0;
 
+err_provision:
+	misc_deregister(&sgx_dev_provision);
+
 err_kthread:
 	kthread_stop(ksgxd_tsk);
 
-- 
Gitee


From f13c9f87a42d5f862ee31d1f071c304a7c15a9ac Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:33 +1200
Subject: [PATCH 43/63] KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for
 SGX (VMX)

mainline inclusion
from mainline-5.13
commit 54f958cdaa8c43c0e9b9ef850ae613a6e1bda44e
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 54f958cdaa8c KVM: x86: Export
kvm_mmu_gva_to_gpa_{read,write}() for SGX (VMX).
Backport for SGX virtualization support

--------------------------------

Export the gva_to_gpa() helpers for use by SGX virtualization when
executing ENCLS[ECREATE] and ENCLS[EINIT] on behalf of the guest.
To execute ECREATE and EINIT, KVM must obtain the GPA of the target
Secure Enclave Control Structure (SECS) in order to get its
corresponding HVA.

Because the SECS must reside in the Enclave Page Cache (EPC), copying
the SECS's data to a host-controlled buffer via existing exported
helpers is not a viable option as the EPC is not readable or writable
by the kernel.

SGX virtualization will also use gva_to_gpa() to obtain HVAs for
non-EPC pages in order to pass user pointers directly to ECREATE and
EINIT, which avoids having to copy pages worth of data into the kernel.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <02f37708321bcdfaa2f9d41c8478affa6e84b04d.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7da272aea5c4..3fb8b3c46298 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6002,6 +6002,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 	u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
 				struct x86_exception *exception)
@@ -6018,6 +6019,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 	access |= PFERR_WRITE_MASK;
 	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
-- 
Gitee


From 8310e35147e1fc1b64d9804821c8bf48137e46fd Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:34 +1200
Subject: [PATCH 44/63] KVM: x86: Define new #PF SGX error code bit

mainline inclusion
from mainline-5.13
commit 00e7646c3563d2f1a46a8fa1824c32373d77a8be
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 00e7646c3563 KVM: x86: Define new #PF SGX error code bit.
Backport for SGX virtualization support

--------------------------------

Page faults that are signaled by the SGX Enclave Page Cache Map (EPCM),
as opposed to the traditional IA32/EPT page tables, set an SGX bit in
the error code to indicate that the #PF was induced by SGX.  KVM will
need to emulate this behavior as part of its trap-and-execute scheme for
virtualizing SGX Launch Control, e.g. to inject SGX-induced #PFs if
EINIT faults in the host, and to support live migration.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <e170c5175cb9f35f53218a7512c9e3db972b97a2.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4ab8f866e39d..b7a2d2293732 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -216,6 +216,7 @@ enum x86_intercept_stage;
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
 #define PFERR_GUEST_FINAL_BIT 32
 #define PFERR_GUEST_PAGE_BIT 33
 
@@ -225,6 +226,7 @@ enum x86_intercept_stage;
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
 #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
 #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
 
-- 
Gitee


From be94d5dbaff7c29389bf893fbcc9219bb224541a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 12 Apr 2021 16:21:35 +1200
Subject: [PATCH 45/63] KVM: x86: Add support for reverse CPUID lookup of
 scattered features

mainline inclusion
from mainline-5.13
commit 4e66c0cb79b732b01b82e094b21b8e22a20dff83
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 4e66c0cb79b7 KVM: x86: Add support for reverse CPUID
lookup of scattered features.
Backport for SGX virtualization support

--------------------------------

Introduce a scheme that allows KVM's CPUID magic to support features
that are scattered in the kernel's feature words.  To advertise and/or
query guest support for CPUID-based features, KVM requires the bit
number of an X86_FEATURE_* to match the bit number in its associated
CPUID entry.  For scattered features, this does not hold true.

Add a framework to allow defining KVM-only words, stored in
kvm_cpu_caps after the shared kernel caps, that can be used to gather
the scattered feature bits by translating X86_FEATURE_* flags into their
KVM-defined feature.

Note, because reverse_cpuid_check() effectively forces kvm_cpu_caps
lookups to be resolved at compile time, there is no runtime cost for
translating from kernel-defined to kvm-defined features.

More details here:  https://lkml.kernel.org/r/X/jxCOLG+HUO4QlZ@google.com

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <16cad8d00475f67867fb36701fc7fb7c1ec86ce1.1618196135.git.kai.huang@intel.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.c | 32 +++++++++++++++++++++++++++-----
 arch/x86/kvm/cpuid.h | 39 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 41b0dc37720e..c5fa5c62e391 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -28,7 +28,7 @@
  * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
  * aligned to sizeof(unsigned long) because it's not accessed via bitops.
  */
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
 static u32 xstate_required_size(u64 xstate_bv, bool compacted)
@@ -53,6 +53,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 }
 
 #define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
 
 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
 	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -330,13 +331,13 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 	return r;
 }
 
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
 {
 	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
 	struct kvm_cpuid_entry2 entry;
 
 	reverse_cpuid_check(leaf);
-	kvm_cpu_caps[leaf] &= mask;
 
 	cpuid_count(cpuid.function, cpuid.index,
 		    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -344,6 +345,26 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
 	kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
 }
 
+static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+{
+	/* Use kvm_cpu_cap_mask for non-scattered leafs. */
+	BUILD_BUG_ON(leaf < NCAPINTS);
+
+	kvm_cpu_caps[leaf] = mask;
+
+	__kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+	/* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+	BUILD_BUG_ON(leaf >= NCAPINTS);
+
+	kvm_cpu_caps[leaf] &= mask;
+
+	__kvm_cpu_cap_mask(leaf);
+}
+
 void kvm_set_cpu_caps(void)
 {
 	unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
@@ -354,12 +375,13 @@ void kvm_set_cpu_caps(void)
 	unsigned int f_gbpages = 0;
 	unsigned int f_lm = 0;
 #endif
+	memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
 
-	BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+	BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
 		     sizeof(boot_cpu_data.x86_capability));
 
 	memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
-	       sizeof(kvm_cpu_caps));
+	       sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
 
 	kvm_cpu_cap_mask(CPUID_1_ECX,
 		/*
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dc921d76e42e..2041e2f07347 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -7,7 +7,20 @@
 #include <asm/processor.h>
 #include <uapi/asm/kvm_para.h>
 
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM.  Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+	NR_KVM_CPU_CAPS = NCAPINTS,
+
+	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define X86_KVM_FEATURE(w, f)		((w)*32 + (f))
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 void kvm_set_cpu_caps(void);
 
 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -83,6 +96,20 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
 	BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
 }
 
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+	return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+	return __feature_translate(x86_feature) / 32;
+}
+
 /*
  * Retrieve the bit mask from an X86_FEATURE_* definition.  Features contain
  * the hardware defined bit number (stored in bits 4:0) and a software defined
@@ -91,6 +118,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
  */
 static __always_inline u32 __feature_bit(int x86_feature)
 {
+	x86_feature = __feature_translate(x86_feature);
+
 	reverse_cpuid_check(x86_feature / 32);
 	return 1 << (x86_feature & 31);
 }
@@ -99,7 +128,7 @@ static __always_inline u32 __feature_bit(int x86_feature)
 
 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	return reverse_cpuid[x86_leaf];
@@ -291,7 +320,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
 
 static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -299,7 +328,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 
 static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -307,7 +336,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 
 static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
 {
-	unsigned int x86_leaf = x86_feature / 32;
+	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
 	reverse_cpuid_check(x86_leaf);
 	return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
-- 
Gitee


From 1a1e6a1f68ce652c7c1470c2c8503c77e200aff6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 12 Apr 2021 16:21:36 +1200
Subject: [PATCH 46/63] KVM: x86: Add reverse-CPUID lookup support for
 scattered SGX features

mainline inclusion
from mainline-5.13
commit 01de8682b32d3ed4f0136f7379e1e3ae2e563308
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 01de8682b32d KVM: x86: Add reverse-CPUID lookup support
for scattered SGX features.
Backport for SGX virtualization support

--------------------------------

Define a new KVM-only feature word for advertising and querying SGX
sub-features in CPUID.0x12.0x0.EAX.  Because SGX1 and SGX2 are scattered
in the kernel's feature word, they need to be translated so that the
bit numbers match those of hardware.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <e797c533f4c71ae89265bbb15a02aef86b67cbec.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 2041e2f07347..fbef5b730805 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -13,12 +13,17 @@
  * "bug" caps, but KVM doesn't use those.
  */
 enum kvm_only_cpuid_leafs {
-	NR_KVM_CPU_CAPS = NCAPINTS,
+	CPUID_12_EAX	 = NCAPINTS,
+	NR_KVM_CPU_CAPS,
 
 	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
 };
 
-#define X86_KVM_FEATURE(w, f)		((w)*32 + (f))
+#define KVM_X86_FEATURE(w, f)		((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1		KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2		KVM_X86_FEATURE(CPUID_12_EAX, 1)
 
 extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 void kvm_set_cpu_caps(void);
@@ -76,6 +81,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
 	[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
 	[CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
 	[CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
+	[CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
 };
 
 /*
@@ -102,6 +108,11 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
  */
 static __always_inline u32 __feature_translate(int x86_feature)
 {
+	if (x86_feature == X86_FEATURE_SGX1)
+		return KVM_X86_FEATURE_SGX1;
+	else if (x86_feature == X86_FEATURE_SGX2)
+		return KVM_X86_FEATURE_SGX2;
+
 	return x86_feature;
 }
 
-- 
Gitee


From 2b6c6bedc7fa2e0a13a088103f25e618abec2857 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:37 +1200
Subject: [PATCH 47/63] KVM: VMX: Add basic handling of VM-Exit from SGX
 enclave

mainline inclusion
from mainline-5.13
commit 3c0c2ad1ae75963c05bf89ec91918c6a53a72696
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 3c0c2ad1ae75 KVM: VMX: Add basic handling of VM-Exit
from SGX enclave
Backport for SGX virtualization support

--------------------------------

Add support for handling VM-Exits that originate from a guest SGX
enclave.  In SGX, an "enclave" is a new CPL3-only execution environment,
wherein the CPU and memory state is protected by hardware to make the
state inaccesible to code running outside of the enclave.  When exiting
an enclave due to an asynchronous event (from the perspective of the
enclave), e.g. exceptions, interrupts, and VM-Exits, the enclave's state
is automatically saved and scrubbed (the CPU loads synthetic state), and
then reloaded when re-entering the enclave.  E.g. after an instruction
based VM-Exit from an enclave, vmcs.GUEST_RIP will not contain the RIP
of the enclave instruction that trigered VM-Exit, but will instead point
to a RIP in the enclave's untrusted runtime (the guest userspace code
that coordinates entry/exit to/from the enclave).

To help a VMM recognize and handle exits from enclaves, SGX adds bits to
existing VMCS fields, VM_EXIT_REASON.VMX_EXIT_REASON_FROM_ENCLAVE and
GUEST_INTERRUPTIBILITY_INFO.GUEST_INTR_STATE_ENCLAVE_INTR.  Define the
new architectural bits, and add a boolean to struct vcpu_vmx to cache
VMX_EXIT_REASON_FROM_ENCLAVE.  Clear the bit in exit_reason so that
checks against exit_reason do not need to account for SGX, e.g.
"if (exit_reason == EXIT_REASON_EXCEPTION_NMI)" continues to work.

KVM is a largely a passive observer of the new bits, e.g. KVM needs to
account for the bits when propagating information to a nested VMM, but
otherwise doesn't need to act differently for the majority of VM-Exits
from enclaves.

The one scenario that is directly impacted is emulation, which is for
all intents and purposes impossible[1] since KVM does not have access to
the RIP or instruction stream that triggered the VM-Exit.  The inability
to emulate is a non-issue for KVM, as most instructions that might
trigger VM-Exit unconditionally #UD in an enclave (before the VM-Exit
check.  For the few instruction that conditionally #UD, KVM either never
sets the exiting control, e.g. PAUSE_EXITING[2], or sets it if and only
if the feature is not exposed to the guest in order to inject a #UD,
e.g. RDRAND_EXITING.

But, because it is still possible for a guest to trigger emulation,
e.g. MMIO, inject a #UD if KVM ever attempts emulation after a VM-Exit
from an enclave.  This is architecturally accurate for instruction
VM-Exits, and for MMIO it's the least bad choice, e.g. it's preferable
to killing the VM.  In practice, only broken or particularly stupid
guests should ever encounter this behavior.

Add a WARN in skip_emulated_instruction to detect any attempt to
modify the guest's RIP during an SGX enclave VM-Exit as all such flows
should either be unreachable or must handle exits from enclaves before
getting to skip_emulated_instruction.

[1] Impossible for all practical purposes.  Not truly impossible
    since KVM could implement some form of para-virtualization scheme.

[2] PAUSE_LOOP_EXITING only affects CPL0 and enclaves exist only at
    CPL3, so we also don't need to worry about that interaction.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <315f54a8507d09c292463ef29104e1d4c62e9090.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/vmx.h      |  1 +
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx/nested.c       |  2 ++
 arch/x86/kvm/vmx/vmx.c          | 45 +++++++++++++++++++++++++++++++--
 4 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f8ba5289ecb0..c6f028bac3ff 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -371,6 +371,7 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_MOV_SS		0x00000002
 #define GUEST_INTR_STATE_SMI		0x00000004
 #define GUEST_INTR_STATE_NMI		0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR	0x00000010
 
 /* GUEST_ACTIVITY_STATE flags */
 #define GUEST_ACTIVITY_ACTIVE		0
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8ff9e8ac0d5..df6707a76a3d 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -27,6 +27,7 @@
 
 
 #define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE	0x08000000
 
 #define EXIT_REASON_EXCEPTION_NMI       0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c2389d0fdaf..0f871faeff1a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4142,6 +4142,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
 	/* update exit information fields: */
 	vmcs12->vm_exit_reason = vm_exit_reason;
+	if (to_vmx(vcpu)->exit_reason.enclave_mode)
+		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
 	vmcs12->exit_qualification = exit_qualification;
 	vmcs12->vm_exit_intr_info = exit_intr_info;
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e208e54f5cad..24399db48fbc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1573,12 +1573,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
+	/*
+	 * Emulation of instructions in SGX enclaves is impossible as RIP does
+	 * not point  tthe failing instruction, and even if it did, the code
+	 * stream is inaccessible.  Inject #UD instead of exiting to userspace
+	 * so that guest userspace can't DoS the guest simply by triggering
+	 * emulation (enclaves are CPL3 only).
+	 */
+	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return false;
+	}
 	return true;
 }
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
+	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
 	unsigned long rip, orig_rip;
+	u32 instr_len;
 
 	/*
 	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1589,9 +1602,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	 * i.e. we end up advancing IP with some random value.
 	 */
 	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-	    to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+		/*
+		 * Emulating an enclave's instructions isn't supported as KVM
+		 * cannot access the enclave's memory or its true RIP, e.g. the
+		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
+		 * the RIP that actually triggered the VM-Exit.  But, because
+		 * most instructions that cause VM-Exit will #UD in an enclave,
+		 * most instruction-based VM-Exits simply do not occur.
+		 *
+		 * There are a few exceptions, notably the debug instructions
+		 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+		 * and generate #DB/#BP as expected, which KVM might intercept.
+		 * But again, the CPU does the dirty work and saves an instr
+		 * length of zero so VMMs don't shoot themselves in the foot.
+		 * WARN if KVM tries to skip a non-zero length instruction on
+		 * a VM-Exit from an enclave.
+		 */
+		if (!instr_len)
+			goto rip_updated;
+
+		WARN(exit_reason.enclave_mode,
+		     "KVM: skipping instruction after SGX enclave VM-Exit");
+
 		orig_rip = kvm_rip_read(vcpu);
-		rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		rip = orig_rip + instr_len;
 #ifdef CONFIG_X86_64
 		/*
 		 * We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1607,6 +1644,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 			return 0;
 	}
 
+rip_updated:
 	/* skipping an emulated instruction also counts */
 	vmx_set_interrupt_shadow(vcpu, 0);
 
@@ -5445,6 +5483,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
 
+	if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+		return 1;
+
 	/*
 	 * A nested guest cannot optimize MMIO vmexits, because we have an
 	 * nGPA here instead of the required GPA.
-- 
Gitee


From 5f56276e2df55c1d10c53b7e6c60871fa1dcdcbb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:38 +1200
Subject: [PATCH 48/63] KVM: VMX: Frame in ENCLS handler for SGX virtualization

mainline inclusion
from mainline-5.13
commit 9798adbc04cf1b14325dc7e2c882639693516a69
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 9798adbc04cf KVM: VMX: Frame in ENCLS handler for
SGX virtualization.
Backport for SGX virtualization support

--------------------------------

Introduce sgx.c and sgx.h, along with the framework for handling ENCLS
VM-Exits.  Add a bool, enable_sgx, that will eventually be wired up to a
module param to control whether or not SGX virtualization is enabled at
runtime.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <1c782269608b2f5e1034be450f375a8432fb705d.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/Makefile  |  2 ++
 arch/x86/kvm/vmx/sgx.c | 50 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/sgx.h | 15 +++++++++++++
 arch/x86/kvm/vmx/vmx.c |  9 +++++---
 4 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/sgx.c
 create mode 100644 arch/x86/kvm/vmx/sgx.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b804444e16d4..1c6c9adcb730 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -20,6 +20,8 @@ kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
 
 kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
 			   vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
+
 kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM)	+= kvm.o
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644
index 000000000000..d874eb180b7d
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx;
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+	if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+		return false;
+
+	if (leaf >= ECREATE && leaf <= ETRACK)
+		return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+	if (leaf >= EAUG && leaf <= EMODT)
+		return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+
+	return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+	const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+	return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+	u32 leaf = (u32)kvm_rax_read(vcpu);
+
+	if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+	} else if (!sgx_enabled_in_guest_bios(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+	} else {
+		WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+		return 0;
+	}
+	return 1;
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644
index 000000000000..6e17ecd4aca3
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+#else
+#define enable_sgx 0
+#endif
+
+#endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 24399db48fbc..46fa34c03445 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -58,6 +58,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmcs.h"
 #include "vmcs12.h"
@@ -5737,16 +5738,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+#ifndef CONFIG_X86_SGX_KVM
 static int handle_encls(struct kvm_vcpu *vcpu)
 {
 	/*
-	 * SGX virtualization is not yet supported.  There is no software
-	 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
-	 * to prevent the guest from executing ENCLS.
+	 * SGX virtualization is disabled.  There is no software enable bit for
+	 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+	 * the guest from executing ENCLS (when SGX is supported by hardware).
 	 */
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
+#endif /* CONFIG_X86_SGX_KVM */
 
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
-- 
Gitee


From 1f03df5667c18c6118e4f2c8c029f347efc421d5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:39 +1200
Subject: [PATCH 49/63] KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce
 CPUID restrictions

mainline inclusion
from mainline-5.13
commit 70210c044b4ea8f05e93ec62abc30cab4233ec88
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 70210c044b4e KVM: VMX: Add SGX ENCLS[ECREATE] handler
to enforce CPUID restrictions.
Backport for SGX virtualization support

--------------------------------

Add an ECREATE handler that will be used to intercept ECREATE for the
purpose of enforcing and enclave's MISCSELECT, ATTRIBUTES and XFRM, i.e.
to allow userspace to restrict SGX features via CPUID.  ECREATE will be
intercepted when any of the aforementioned masks diverges from hardware
in order to enforce the desired CPUID model, i.e. inject #GP if the
guest attempts to set a bit that hasn't been enumerated as allowed-1 in
CPUID.

Note, access to the PROVISIONKEY is not yet supported.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <c3a97684f1b71b4f4626a1fc3879472a95651725.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/kvm/vmx/sgx.c          | 275 ++++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b7a2d2293732..cf3e3bb5a187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -994,6 +994,9 @@ struct kvm_arch {
 
 	bool bus_lock_detection_enabled;
 
+	/* Guest can access the SGX PROVISIONKEY. */
+	bool sgx_provisioning_allowed;
+
 	/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
 	u32 user_space_msr_mask;
 
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index d874eb180b7d..2cf9d7cf818c 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -11,6 +11,279 @@
 
 bool __read_mostly enable_sgx;
 
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode.  Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+			     int size, int alignment, gva_t *gva)
+{
+	struct kvm_segment s;
+	bool fault;
+
+	/* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+	*gva = offset;
+	if (!is_long_mode(vcpu)) {
+		vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+		*gva += s.base;
+	}
+
+	if (!IS_ALIGNED(*gva, alignment)) {
+		fault = true;
+	} else if (likely(is_long_mode(vcpu))) {
+		fault = is_noncanonical_address(*gva, vcpu);
+	} else {
+		*gva &= 0xffffffff;
+		fault = (s.unusable) ||
+			(s.type != 2 && s.type != 3) ||
+			(*gva > s.limit) ||
+			((s.base != 0 || s.limit != 0xffffffff) &&
+			(((u64)*gva + size - 1) > s.limit + 1));
+	}
+	if (fault)
+		kvm_inject_gp(vcpu, 0);
+	return fault ? -EINVAL : 0;
+}
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+					 unsigned int size)
+{
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+	vcpu->run->internal.ndata = 2;
+	vcpu->run->internal.data[0] = addr;
+	vcpu->run->internal.data[1] = size;
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+			unsigned int size)
+{
+	if (__copy_from_user(data, (void __user *)hva, size)) {
+		sgx_handle_emulation_failure(vcpu, hva, size);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+			  gpa_t *gpa)
+{
+	struct x86_exception ex;
+
+	if (write)
+		*gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+	else
+		*gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+	if (*gpa == UNMAPPED_GVA) {
+		kvm_inject_emulated_page_fault(vcpu, &ex);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+	*hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+	if (kvm_is_error_hva(*hva)) {
+		sgx_handle_emulation_failure(vcpu, gpa, 1);
+		return -EFAULT;
+	}
+
+	*hva |= gpa & ~PAGE_MASK;
+
+	return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+	struct x86_exception ex;
+
+	/*
+	 * A non-EPCM #PF indicates a bad userspace HVA.  This *should* check
+	 * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+	 * but the error code isn't (yet) plumbed through the ENCLS helpers.
+	 */
+	if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
+
+	/*
+	 * If the guest thinks it's running on SGX2 hardware, inject an SGX
+	 * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+	 * #PF on SGX2).  The assumption is that EPCM faults are much more
+	 * likely than a bad userspace address.
+	 */
+	if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+	    guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+		memset(&ex, 0, sizeof(ex));
+		ex.vector = PF_VECTOR;
+		ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+				PFERR_SGX_MASK;
+		ex.address = gva;
+		ex.error_code_valid = true;
+		ex.nested_page_fault = false;
+		kvm_inject_page_fault(vcpu, &ex);
+	} else {
+		kvm_inject_gp(vcpu, 0);
+	}
+	return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+				  struct sgx_pageinfo *pageinfo,
+				  unsigned long secs_hva,
+				  gva_t secs_gva)
+{
+	struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+	struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+	u64 attributes, xfrm, size;
+	u32 miscselect;
+	u8 max_size_log2;
+	int trapnr, ret;
+
+	sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+	sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+	if (!sgx_12_0 || !sgx_12_1) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
+
+	miscselect = contents->miscselect;
+	attributes = contents->attributes;
+	xfrm = contents->xfrm;
+	size = contents->size;
+
+	/* Enforce restriction of access to the PROVISIONKEY. */
+	if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+	    (attributes & SGX_ATTR_PROVISIONKEY)) {
+		if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+			pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	/* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+	if ((u32)miscselect & ~sgx_12_0->ebx ||
+	    (u32)attributes & ~sgx_12_1->eax ||
+	    (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+	    (u32)xfrm & ~sgx_12_1->ecx ||
+	    (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	/* Enforce CPUID restriction on max enclave size. */
+	max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+							    sgx_12_0->edx;
+	if (size >= BIT_ULL(max_size_log2))
+		kvm_inject_gp(vcpu, 0);
+
+	/*
+	 * sgx_virt_ecreate() returns:
+	 *  1) 0:	ECREATE was successful
+	 *  2) -EFAULT:	ECREATE was run but faulted, and trapnr was set to the
+	 *		exception number.
+	 *  3) -EINVAL:	access_ok() on @secs_hva failed. This should never
+	 *		happen as KVM checks host addresses at memslot creation.
+	 *		sgx_virt_ecreate() has already warned in this case.
+	 */
+	ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+	if (!ret)
+		return kvm_skip_emulated_instruction(vcpu);
+	if (ret == -EFAULT)
+		return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+	return ret;
+}
+
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+	gva_t pageinfo_gva, secs_gva;
+	gva_t metadata_gva, contents_gva;
+	gpa_t metadata_gpa, contents_gpa, secs_gpa;
+	unsigned long metadata_hva, contents_hva, secs_hva;
+	struct sgx_pageinfo pageinfo;
+	struct sgx_secs *contents;
+	struct x86_exception ex;
+	int r;
+
+	if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+	    sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+		return 1;
+
+	/*
+	 * Copy the PAGEINFO to local memory, its pointers need to be
+	 * translated, i.e. we need to do a deep copy/translate.
+	 */
+	r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+				sizeof(pageinfo), &ex);
+	if (r == X86EMUL_PROPAGATE_FAULT) {
+		kvm_inject_emulated_page_fault(vcpu, &ex);
+		return 1;
+	} else if (r != X86EMUL_CONTINUE) {
+		sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+					     sizeof(pageinfo));
+		return 0;
+	}
+
+	if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+	    sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+			      &contents_gva))
+		return 1;
+
+	/*
+	 * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+	 * Resume the guest on failure to inject a #PF.
+	 */
+	if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+	    sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+	    sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+		return 1;
+
+	/*
+	 * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+	 * KVM doesn't have to fully process one address at a time.  Exit to
+	 * userspace if a GPA is invalid.
+	 */
+	if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+	    sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+	    sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+		return 0;
+
+	/*
+	 * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+	 * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+	 * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+	 * enforce restriction of access to the PROVISIONKEY.
+	 */
+	contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+	if (!contents)
+		return -ENOMEM;
+
+	/* Exit to userspace if copying from a host userspace address fails. */
+	if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+		free_page((unsigned long)contents);
+		return 0;
+	}
+
+	pageinfo.metadata = metadata_hva;
+	pageinfo.contents = (u64)contents;
+
+	r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+	free_page((unsigned long)contents);
+
+	return r;
+}
+
 static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
 {
 	if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
@@ -41,6 +314,8 @@ int handle_encls(struct kvm_vcpu *vcpu)
 	} else if (!sgx_enabled_in_guest_bios(vcpu)) {
 		kvm_inject_gp(vcpu, 0);
 	} else {
+		if (leaf == ECREATE)
+			return handle_encls_ecreate(vcpu);
 		WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
 		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
 		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
-- 
Gitee


From 897e6f98c6f67446f24b04761eaa57624675f20d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:40 +1200
Subject: [PATCH 50/63] KVM: VMX: Add emulation of SGX Launch Control LE hash
 MSRs

mainline inclusion
from mainline-5.13
commit 8f102445d4045384799627c53d82c45ca2cad3a5
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 8f102445d404 KVM: VMX: Add emulation of SGX Launch
Control LE hash MSRs.
Backport for SGX virtualization support

--------------------------------

Emulate the four Launch Enclave public key hash MSRs (LE hash MSRs) that
exist on CPUs that support SGX Launch Control (LC).  SGX LC modifies the
behavior of ENCLS[EINIT] to use the LE hash MSRs when verifying the key
used to sign an enclave.  On CPUs without LC support, the LE hash is
hardwired into the CPU to an Intel controlled key (the Intel key is also
the reset value of the LE hash MSRs). Track the guest's desired hash so
that a future patch can stuff the hash into the hardware MSRs when
executing EINIT on behalf of the guest, when those MSRs are writable in
host.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <c58ef601ddf88f3a113add837969533099b1364a.1618196135.git.kai.huang@intel.com>
[Add a comment regarding the MSRs being available until SGX is locked.
 - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/vmx/sgx.c | 35 +++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/sgx.h |  6 ++++++
 arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.h |  3 +++
 4 files changed, 75 insertions(+)

diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 2cf9d7cf818c..d53e6b17b320 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -11,6 +11,9 @@
 
 bool __read_mostly enable_sgx;
 
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
 /*
  * ENCLS's memory operands use a fixed segment (DS) and a fixed
  * address size based on the mode.  Related prefixes are ignored.
@@ -323,3 +326,35 @@ int handle_encls(struct kvm_vcpu *vcpu)
 	}
 	return 1;
 }
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+	/*
+	 * Use Intel's default value for Skylake hardware if Launch Control is
+	 * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+	 * Launch Control is supported and enabled, i.e. mimic the reset value
+	 * and let the guest write the MSRs at will.  If Launch Control is
+	 * supported but disabled, then use the current MSR values as the hash
+	 * MSRs exist but are read-only (locked and not writable).
+	 */
+	if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+	    rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+		sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+		sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+		sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+		sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+	} else {
+		/* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+		rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+		rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+		rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+	}
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+	       sizeof(sgx_pubkey_hash));
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
index 6e17ecd4aca3..6502fa52c7e9 100644
--- a/arch/x86/kvm/vmx/sgx.h
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -8,8 +8,14 @@
 extern bool __read_mostly enable_sgx;
 
 int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
 #else
 #define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
 #endif
 
 #endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 46fa34c03445..b5a9c45cfc22 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1907,6 +1907,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_FEAT_CTL:
 		msr_info->data = vmx->msr_ia32_feature_control;
 		break;
+	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+			return 1;
+		msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+			[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 		if (!nested_vmx_allowed(vcpu))
 			return 1;
@@ -2202,6 +2209,26 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (msr_info->host_initiated && data == 0)
 			vmx_leave_nested(vcpu);
 		break;
+	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+		/*
+		 * On real hardware, the LE hash MSRs are writable before
+		 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+		 * at which point SGX related bits in IA32_FEATURE_CONTROL
+		 * become writable.
+		 *
+		 * KVM does not emulate SGX activation for simplicity, so
+		 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+		 * is unlocked.  This is technically not architectural
+		 * behavior, but it's close enough.
+		 */
+		if (!msr_info->host_initiated &&
+		    (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+		    ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+		    !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+			return 1;
+		vmx->msr_ia32_sgxlepubkeyhash
+			[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
+		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 		if (!msr_info->host_initiated)
 			return 1; /* they are read-only */
@@ -7075,6 +7102,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 	else
 		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
+	vcpu_setup_sgx_lepubkeyhash(vcpu);
+
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
 
@@ -8017,6 +8046,8 @@ static __init int hardware_setup(void)
 	if (!enable_ept || !cpu_has_vmx_intel_pt())
 		pt_mode = PT_MODE_SYSTEM;
 
+	setup_default_sgx_lepubkeyhash();
+
 	if (nested) {
 		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
 					   vmx_capability.ept);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index c0b52498e4bb..04f1f8544177 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -324,6 +324,9 @@ struct vcpu_vmx {
 	 */
 	u64 msr_ia32_feature_control;
 	u64 msr_ia32_feature_control_valid_bits;
+	/* SGX Launch Control public key hash */
+	u64 msr_ia32_sgxlepubkeyhash[4];
+
 	u64 ept_pointer;
 
 	struct pt_desc pt_desc;
-- 
Gitee


From ba02e6c91a5db08703f28ae26a0d16591d8182e7 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:41 +1200
Subject: [PATCH 51/63] KVM: VMX: Add ENCLS[EINIT] handler to support SGX
 Launch Control (LC)

mainline inclusion
from mainline-5.13
commit b6f084ca553845135ccade79ce6548035e52884a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit b6f084ca5538 KVM: VMX: Add ENCLS[EINIT] handler to support
SGX Launch Control (LC).
Backport for SGX virtualization support

--------------------------------

Add a VM-Exit handler to trap-and-execute EINIT when SGX LC is enabled
in the host.  When SGX LC is enabled, the host kernel may rewrite the
hardware values at will, e.g. to launch enclaves with different signers,
thus KVM needs to intercept EINIT to ensure it is executed with the
correct LE hash (even if the guest sees a hardwired hash).

Switching the LE hash MSRs on VM-Enter/VM-Exit is not a viable option as
writing the MSRs is prohibitively expensive, e.g. on SKL hardware each
WRMSR is ~400 cycles.  And because EINIT takes tens of thousands of
cycles to execute, the ~1500 cycle overhead to trap-and-execute EINIT is
unlikely to be noticed by the guest, let alone impact its overall SGX
performance.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <57c92fa4d2083eb3be9e6355e3882fc90cffea87.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/vmx/sgx.c | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index d53e6b17b320..2eed5da91698 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -287,6 +287,68 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+	unsigned long sig_hva, secs_hva, token_hva, rflags;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gva_t sig_gva, secs_gva, token_gva;
+	gpa_t sig_gpa, secs_gpa, token_gpa;
+	int ret, trapnr;
+
+	if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+	    sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+	    sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+		return 1;
+
+	/*
+	 * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+	 * Resume the guest on failure to inject a #PF.
+	 */
+	if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+	    sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+	    sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+		return 1;
+
+	/*
+	 * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+	 * KVM doesn't have to fully process one address at a time.  Exit to
+	 * userspace if a GPA is invalid.  Note, all structures are aligned and
+	 * cannot split pages.
+	 */
+	if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+	    sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+	    sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+		return 0;
+
+	ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+			     (void __user *)secs_hva,
+			     vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+	if (ret == -EFAULT)
+		return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+	/*
+	 * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+	 * @token_hva or @secs_hva. This should never happen as KVM checks host
+	 * addresses at memslot creation. sgx_virt_einit() has already warned
+	 * in this case, so just return.
+	 */
+	if (ret < 0)
+		return ret;
+
+	rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+					  X86_EFLAGS_AF | X86_EFLAGS_SF |
+					  X86_EFLAGS_OF);
+	if (ret)
+		rflags |= X86_EFLAGS_ZF;
+	else
+		rflags &= ~X86_EFLAGS_ZF;
+	vmx_set_rflags(vcpu, rflags);
+
+	kvm_rax_write(vcpu, ret);
+	return kvm_skip_emulated_instruction(vcpu);
+}
+
 static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
 {
 	if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
@@ -319,6 +381,8 @@ int handle_encls(struct kvm_vcpu *vcpu)
 	} else {
 		if (leaf == ECREATE)
 			return handle_encls_ecreate(vcpu);
+		if (leaf == EINIT)
+			return handle_encls_einit(vcpu);
 		WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
 		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
 		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
-- 
Gitee


From 9b00521ed039f7f6559f805e1659eb2971deeb6c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:42 +1200
Subject: [PATCH 52/63] KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and
 LC

mainline inclusion
from mainline-5.13
commit 72add915fbd5bf5c57deee3da5b2605e966ac199
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 72add915fbd5 KVM: VMX: Enable SGX virtualization for
SGX1, SGX2 and LC.
Backport for SGX virtualization support

--------------------------------

Enable SGX virtualization now that KVM has the VM-Exit handlers needed
to trap-and-execute ENCLS to ensure correctness and/or enforce the CPU
model exposed to the guest.  Add a KVM module param, "sgx", to allow an
admin to disable SGX virtualization independent of the kernel.

When supported in hardware and the kernel, advertise SGX1, SGX2 and SGX
LC to userspace via CPUID and wire up the ENCLS_EXITING bitmap based on
the guest's SGX capabilities, i.e. to allow ENCLS to be executed in an
SGX-enabled guest.  With the exception of the provision key, all SGX
attribute bits may be exposed to the guest.  Guest access to the
provision key, which is controlled via securityfs, will be added in a
future patch.

Note, KVM does not yet support exposing ENCLS_C leafs or ENCLV leafs.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <a99e9c23310c79f2f4175c1af4c4cbcef913c3e5.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.c      | 57 +++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/nested.c | 27 +++++++++++--
 arch/x86/kvm/vmx/nested.h |  5 +++
 arch/x86/kvm/vmx/sgx.c    | 80 ++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/sgx.h    | 13 +++++++
 arch/x86/kvm/vmx/vmcs12.c |  1 +
 arch/x86/kvm/vmx/vmcs12.h |  4 +-
 arch/x86/kvm/vmx/vmx.c    | 26 ++++++++++++-
 8 files changed, 204 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c5fa5c62e391..6d6130ecf250 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/user.h>
 #include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
@@ -170,6 +171,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		vcpu->arch.guest_supported_xcr0 =
 			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
 
+	/*
+	 * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+	 * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+	 * requested XCR0 value.  The enclave's XFRM must be a subset of XCRO
+	 * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+	 * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+	 * '1' even on CPUs that don't support XSAVE.
+	 */
+	best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+	if (best) {
+		best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+		best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+		best->ecx |= XFEATURE_MASK_FPSSE;
+	}
+
 	kvm_update_pv_runtime(vcpu);
 
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -412,7 +428,7 @@ void kvm_set_cpu_caps(void)
 	);
 
 	kvm_cpu_cap_mask(CPUID_7_0_EBX,
-		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+		F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
 		F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
@@ -423,7 +439,8 @@ void kvm_set_cpu_caps(void)
 		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+		F(SGX_LC)
 	);
 	/* Set LA57 based on hardware capability. */
 	if (cpuid_ecx(7) & F(LA57))
@@ -462,6 +479,10 @@ void kvm_set_cpu_caps(void)
 		F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
 	);
 
+	kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+		SF(SGX1) | SF(SGX2)
+	);
+
 	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -785,6 +806,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 			entry->edx = 0;
 		}
 		break;
+	case 0x12:
+		/* Intel SGX */
+		if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+			break;
+		}
+
+		/*
+		 * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+		 * and max enclave sizes.   The SGX sub-features and MISCSELECT
+		 * are restricted by kernel and KVM capabilities (like most
+		 * feature flags), while enclave size is unrestricted.
+		 */
+		cpuid_entry_override(entry, CPUID_12_EAX);
+		entry->ebx &= SGX_MISC_EXINFO;
+
+		entry = do_host_cpuid(array, function, 1);
+		if (!entry)
+			goto out;
+
+		/*
+		 * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
+		 * feature flags.  Advertise all supported flags, including
+		 * privileged attributes that require explicit opt-in from
+		 * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
+		 * expected to derive it from supported XCR0.
+		 */
+		entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+			      /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY |
+			      SGX_ATTR_KSS;
+		entry->ebx &= 0;
+		break;
 	/* Intel PT */
 	case 0x14:
 		if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0f871faeff1a..d2a0464e95c7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -11,6 +11,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "x86.h"
 
@@ -2326,6 +2327,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
 		    exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 
+		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
 		secondary_exec_controls_set(vmx, exec_control);
 	}
 
@@ -5738,6 +5742,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 	return false;
 }
 
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	u32 encls_leaf;
+
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+		return false;
+
+	encls_leaf = kvm_rax_read(vcpu);
+	if (encls_leaf > 62)
+		encls_leaf = 63;
+	return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
 	struct vmcs12 *vmcs12, gpa_t bitmap)
 {
@@ -5835,9 +5854,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
 	case EXIT_REASON_VMFUNC:
 		/* VM functions are emulated through L2->L0 vmexits. */
 		return true;
-	case EXIT_REASON_ENCLS:
-		/* SGX is never exposed to L1 */
-		return true;
 	default:
 		break;
 	}
@@ -5961,6 +5977,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
 	case EXIT_REASON_TPAUSE:
 		return nested_cpu_has2(vmcs12,
 			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+	case EXIT_REASON_ENCLS:
+		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
 	default:
 		return true;
 	}
@@ -6536,6 +6554,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 		msrs->secondary_ctls_high |=
 			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 
+	if (enable_sgx)
+		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC,
 		msrs->misc_low,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 197148d76b8f..184418baeb3c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
 		PIN_BASED_EXT_INTR_MASK;
 }
 
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
 /*
  * if fixed0[i] == 1: val[i] must be 1
  * if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 2eed5da91698..6693ebdc0770 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -5,11 +5,13 @@
 
 #include "cpuid.h"
 #include "kvm_cache_regs.h"
+#include "nested.h"
 #include "sgx.h"
 #include "vmx.h"
 #include "x86.h"
 
-bool __read_mostly enable_sgx;
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
 
 /* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
 static u64 sgx_pubkey_hash[4] __ro_after_init;
@@ -422,3 +424,79 @@ void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
 	memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
 	       sizeof(sgx_pubkey_hash));
 }
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *guest_cpuid;
+	u32 eax, ebx, ecx, edx;
+
+	if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+		return true;
+
+	guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+	if (!guest_cpuid)
+		return true;
+
+	cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+	if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+		return true;
+
+	guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+	if (!guest_cpuid)
+		return true;
+
+	cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+	if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+	    guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+		return true;
+
+	return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	/*
+	 * There is no software enable bit for SGX that is virtualized by
+	 * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+	 * guest (either by the host or by the guest's BIOS) but enabled in the
+	 * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+	 * the expected system behavior for ENCLS.
+	 */
+	u64 bitmap = -1ull;
+
+	/* Nothing to do if hardware doesn't support SGX */
+	if (!cpu_has_vmx_encls_vmexit())
+		return;
+
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+	    sgx_enabled_in_guest_bios(vcpu)) {
+		if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+			bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+			if (sgx_intercept_encls_ecreate(vcpu))
+				bitmap |= (1 << ECREATE);
+		}
+
+		if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+			bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+		/*
+		 * Trap and execute EINIT if launch control is enabled in the
+		 * host using the guest's values for launch control MSRs, even
+		 * if the guest's values are fixed to hardware default values.
+		 * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+		 * the MSRs is extraordinarily expensive.
+		 */
+		if (boot_cpu_has(X86_FEATURE_SGX_LC))
+			bitmap |= (1 << EINIT);
+
+		if (!vmcs12 && is_guest_mode(vcpu))
+			vmcs12 = get_vmcs12(vcpu);
+		if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+			bitmap |= vmcs12->encls_exiting_bitmap;
+	}
+	vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
index 6502fa52c7e9..a400888b376d 100644
--- a/arch/x86/kvm/vmx/sgx.h
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -4,6 +4,9 @@
 
 #include <linux/kvm_host.h>
 
+#include "capabilities.h"
+#include "vmx_ops.h"
+
 #ifdef CONFIG_X86_SGX_KVM
 extern bool __read_mostly enable_sgx;
 
@@ -11,11 +14,21 @@ int handle_encls(struct kvm_vcpu *vcpu);
 
 void setup_default_sgx_lepubkeyhash(void);
 void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
 #else
 #define enable_sgx 0
 
 static inline void setup_default_sgx_lepubkeyhash(void) { }
 static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	/* Nothing to do if hardware doesn't support SGX */
+	if (cpu_has_vmx_encls_vmexit())
+		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
 #endif
 
 #endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index c8e51c004f78..034adb6404dc 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(VMREAD_BITMAP, vmread_bitmap),
 	FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+	FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
 	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 80232daf00ff..13494956d0e9 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -69,7 +69,8 @@ struct __packed vmcs12 {
 	u64 vm_function_control;
 	u64 eptp_list_address;
 	u64 pml_address;
-	u64 padding64[3]; /* room for future expansion */
+	u64 encls_exiting_bitmap;
+	u64 padding64[2]; /* room for future expansion */
 	/*
 	 * To allow migration of L1 (complete with its L2 guests) between
 	 * machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void)
 	CHECK_OFFSET(vm_function_control, 296);
 	CHECK_OFFSET(eptp_list_address, 304);
 	CHECK_OFFSET(pml_address, 312);
+	CHECK_OFFSET(encls_exiting_bitmap, 320);
 	CHECK_OFFSET(cr0_guest_host_mask, 344);
 	CHECK_OFFSET(cr4_guest_host_mask, 352);
 	CHECK_OFFSET(cr0_read_shadow, 360);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b5a9c45cfc22..b737f696efd0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2208,6 +2208,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmx->msr_ia32_feature_control = data;
 		if (msr_info->host_initiated && data == 0)
 			vmx_leave_nested(vcpu);
+
+		/* SGX may be enabled/disabled by guest's firmware */
+		vmx_write_encls_bitmap(vcpu, NULL);
 		break;
 	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
 		/*
@@ -4493,8 +4496,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 	}
 
-	if (cpu_has_vmx_encls_vmexit())
-		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+	vmx_write_encls_bitmap(&vmx->vcpu, NULL);
 
 	if (vmx_pt_mode_is_host_guest()) {
 		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -7430,6 +7432,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
 	set_cr4_guest_host_mask(vmx);
 
+	vmx_write_encls_bitmap(vcpu, NULL);
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+		vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+	else
+		vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+		vmx->msr_ia32_feature_control_valid_bits |=
+			FEAT_CTL_SGX_LC_ENABLED;
+	else
+		vmx->msr_ia32_feature_control_valid_bits &=
+			~FEAT_CTL_SGX_LC_ENABLED;
+
 	/* Refresh #PF interception to account for MAXPHYADDR changes. */
 	update_exception_bitmap(vcpu);
 }
@@ -7450,6 +7465,13 @@ static __init void vmx_set_cpu_caps(void)
 	if (vmx_pt_mode_is_host_guest())
 		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
+	if (!enable_sgx) {
+		kvm_cpu_cap_clear(X86_FEATURE_SGX);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+	}
+
 	if (vmx_umip_emulated())
 		kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
-- 
Gitee


From a86c0305ec8a1e03d93ac66a2267abcec7f75f29 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:43 +1200
Subject: [PATCH 53/63] KVM: x86: Add capability to grant VM access to
 privileged SGX attribute

mainline inclusion
from mainline-5.13
commit fe7e948837f312d87853b3fce743795d1ae3715a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit fe7e948837f3 KVM: x86: Add capability to grant VM
access to privileged SGX attribute.
Backport for SGX virtualization support

--------------------------------

Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by userspace
to grant a VM access to a priveleged attribute, with args[0] holding a
file handle to a valid SGX attribute file.

The SGX subsystem restricts access to a subset of enclave attributes to
provide additional security for an uncompromised kernel, e.g. to prevent
malware from using the PROVISIONKEY to ensure its nodes are running
inside a geniune SGX enclave and/or to obtain a stable fingerprint.

To prevent userspace from circumventing such restrictions by running an
enclave in a VM, KVM restricts guest access to privileged attributes by
default.

Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <0b099d65e933e068e3ea934b0523bab070cb8cea.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++++++
 arch/x86/kvm/cpuid.c           |  2 +-
 arch/x86/kvm/x86.c             | 21 +++++++++++++++++++++
 include/uapi/linux/kvm.h       |  1 +
 4 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 56deaceab625..671bbd90f15b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6197,6 +6197,29 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
 can then handle to implement model specific MSR handling and/or user notifications
 to inform a user that an MSR was not handled.
 
+7.25 KVM_CAP_SGX_ATTRIBUTE
+----------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+          attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more priveleged enclave attributes.  args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint.  To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6d6130ecf250..72e7db3a0ad1 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -834,7 +834,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		 * expected to derive it from supported XCR0.
 		 */
 		entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
-			      /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY |
+			      SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
 			      SGX_ATTR_KSS;
 		entry->ebx &= 0;
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3fb8b3c46298..e3896ba64eca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -74,6 +74,7 @@
 #include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
@@ -3839,6 +3840,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X86_USER_SPACE_MSR:
 	case KVM_CAP_X86_MSR_FILTER:
 	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+	case KVM_CAP_SGX_ATTRIBUTE:
+#endif
 		r = 1;
 		break;
 	case KVM_CAP_SYNC_REGS:
@@ -5391,6 +5395,23 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		kvm->arch.user_space_msr_mask = cap->args[0];
 		r = 0;
 		break;
+#ifdef CONFIG_X86_SGX_KVM
+	case KVM_CAP_SGX_ATTRIBUTE: {
+		unsigned long allowed_attributes = 0;
+
+		r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+		if (r)
+			break;
+
+		/* KVM only supports the PROVISIONKEY privileged attribute. */
+		if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+		    !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+			kvm->arch.sgx_provisioning_allowed = true;
+		else
+			r = -EINVAL;
+		break;
+	}
+#endif
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3af8b0164f1e..4e0ebd66368f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1061,6 +1061,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_X86_USER_SPACE_MSR 188
 #define KVM_CAP_X86_MSR_FILTER 189
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
+#define KVM_CAP_SGX_ATTRIBUTE 196
 
 #define KVM_CAP_ARM_CPU_FEATURE 555
 
-- 
Gitee


From baa84abb50941c5a8202331b40b3e82150a372ba Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 20 Apr 2021 18:08:50 -0700
Subject: [PATCH 54/63] KVM: x86: Fix implicit enum conversion goof in
 scattered reverse CPUID code

mainline inclusion
from mainline-5.13
commit 462f8ddebccbb8a364b154008212052d515ac6b1
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 462f8ddebccb KVM: x86: Fix implicit enum conversion
goof in scattered reverse CPUID code.
Backport for SGX virtualization support

--------------------------------

Take "enum kvm_only_cpuid_leafs" in scattered specific CPUID helpers
(which is obvious in hindsight), and use "unsigned int" for leafs that
can be the kernel's standard "enum cpuid_leaf" or the aforementioned
KVM-only variant.  Loss of the enum params is a bit disapponting, but
gcc obviously isn't providing any extra sanity checks, and the various
BUILD_BUG_ON() assertions ensure the input is in range.

This fixes implicit enum conversions that are detected by clang-11:

arch/x86/kvm/cpuid.c:499:29: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion]
        kvm_cpu_cap_init_scattered(CPUID_12_EAX,
        ~~~~~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~
arch/x86/kvm/cpuid.c:837:31: warning: implicit conversion from enumeration type 'enum kvm_only_cpuid_leafs' to different enumeration type 'enum cpuid_leafs' [-Wenum-conversion]
                cpuid_entry_override(entry, CPUID_12_EAX);
                ~~~~~~~~~~~~~~~~~~~~        ^~~~~~~~~~~~
2 warnings generated.

Fixes: 4e66c0cb79b7 ("KVM: x86: Add support for reverse CPUID lookup of scattered features")
Cc: Kai Huang <kai.huang@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210421010850.3009718-1-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kvm/cpuid.c | 5 +++--
 arch/x86/kvm/cpuid.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 72e7db3a0ad1..911c6efe60aa 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 }
 
 /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
-static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
+static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
 {
 	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
 	struct kvm_cpuid_entry2 entry;
@@ -361,7 +361,8 @@ static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
 	kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
 }
 
-static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+static __always_inline
+void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
 {
 	/* Use kvm_cpu_cap_mask for non-scattered leafs. */
 	BUILD_BUG_ON(leaf < NCAPINTS);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index fbef5b730805..9fbc0de7c337 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -218,7 +218,7 @@ static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
 }
 
 static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
-						 enum cpuid_leafs leaf)
+						 unsigned int leaf)
 {
 	u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
 
-- 
Gitee


From 637b0e7ab279e8a8dacaa64f2fdd7be971560f73 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Thu, 8 Apr 2021 12:29:24 +0300
Subject: [PATCH 55/63] x86/sgx: Do not update sgx_nr_free_pages in
 sgx_setup_epc_section()

mainline inclusion
from mainline-5.13
commit ae40aaf6bdbf0354a75b8284a0de453fcf5f4d32
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ae40aaf6bdbf x86/sgx: Do not update sgx_nr_free_pages in
sgx_setup_epc_section().
Backport for SGX virtualization support

--------------------------------

The commit in Fixes: changed the SGX EPC page sanitization to end up in
sgx_free_epc_page() which puts clean and sanitized pages on the free
list.

This was done for the reason that it is best to keep the logic to assign
available-for-use EPC pages to the correct NUMA lists in a single
location.

sgx_nr_free_pages is also incremented by sgx_free_epc_pages() but those
pages which are being added there per EPC section do not belong to the
free list yet because they haven't been sanitized yet - they land on the
dirty list first and the sanitization happens later when ksgxd starts
massaging them.

So remove that addition there and have sgx_free_epc_page() do that
solely.

 [ bp: Sanitize commit message too. ]

Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list")
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210408092924.7032-1-jarkko@kernel.org
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 92cb11dffd4c..ad904747419e 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -656,7 +656,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
 	}
 
-	sgx_nr_free_pages += nr_pages;
 	return true;
 }
 
-- 
Gitee


From 37543e9422f504308175d182f50a590be2173e42 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Tue, 15 Jun 2021 22:16:39 +1200
Subject: [PATCH 56/63] x86/sgx: Add missing xa_destroy() when virtual EPC is
 destroyed

mainline inclusion
from mainline-5.13
commit 4692bc775d2180a937335ccba0edce557103d44a
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 4692bc775d21 x86/sgx: Add missing xa_destroy() when
virtual EPC is destroyed.
Backport for SGX virtualization support

--------------------------------

xa_destroy() needs to be called to destroy a virtual EPC's page array
before calling kfree() to free the virtual EPC. Currently it is not
called so add the missing xa_destroy().

Fixes: 540745ddbc70 ("x86/sgx: Introduce virtual EPC for use by KVM guests")
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Tested-by: Yang Zhong <yang.zhong@intel.com>
Link: https://lkml.kernel.org/r/20210615101639.291929-1-kai.huang@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/virt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 7d221eac716a..8f7dd1dc4189 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -212,6 +212,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
 		list_splice_tail(&secs_pages, &zombie_secs_pages);
 	mutex_unlock(&zombie_secs_pages_lock);
 
+	xa_destroy(&vepc->page_array);
 	kfree(vepc);
 
 	return 0;
-- 
Gitee


From 3698e21d1f5239a52378804bc972df19018aa87a Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Tue, 23 Mar 2021 13:42:21 +0000
Subject: [PATCH 57/63] i915_vma: Rename vma_lookup to i915_vma_lookup

mainline inclusion
from mainline-5.13
commit 547be6a479fd19fe1071d2cf76ed574fbff13481
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 547be6a479fd i915_vma: Rename vma_lookup to
i915_vma_lookup.
Backport for SGX virtualization support

--------------------------------

Use i915 prefix to avoid name collision with future vma_lookup() in mm.

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210323134208.3077275-1-Liam.Howlett@Oracle.com
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 drivers/gpu/drm/i915/i915_vma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 50a86fd89d00..9aa4a6ce9fbf 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -230,7 +230,7 @@ vma_create(struct drm_i915_gem_object *obj,
 }
 
 static struct i915_vma *
-vma_lookup(struct drm_i915_gem_object *obj,
+i915_vma_lookup(struct drm_i915_gem_object *obj,
 	   struct i915_address_space *vm,
 	   const struct i915_ggtt_view *view)
 {
@@ -278,7 +278,7 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
 	GEM_BUG_ON(!atomic_read(&vm->open));
 
 	spin_lock(&obj->vma.lock);
-	vma = vma_lookup(obj, vm, view);
+	vma = i915_vma_lookup(obj, vm, view);
 	spin_unlock(&obj->vma.lock);
 
 	/* vma_create() will resolve the race if another creates the vma */
-- 
Gitee


From 963e4489bb943d6e1abc1dee31fb3eb529a9a1e6 Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Mon, 28 Jun 2021 19:38:50 -0700
Subject: [PATCH 58/63] mm: add vma_lookup(), update find_vma_intersection()
 comments

mainline inclusion
from mainline-5.14
commit ce6d42f2e4a2d98898419743b037a95661e3ac9d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ce6d42f2e4a2  mm: add vma_lookup(), update
find_vma_intersection() comments.
Backport for SGX virtualization support

--------------------------------

Patch series "mm: Add vma_lookup()", v2.

Many places in the kernel use find_vma() to get a vma and then check the
start address of the vma to ensure the next vma was not returned.

Other places use the find_vma_intersection() call with add, addr + 1 as
the range; looking for just the vma at a specific address.

The third use of find_vma() is by developers who do not know that the
function starts searching at the provided address upwards for the next
vma.  This results in a bug that is often overlooked for a long time.

Adding the new vma_lookup() function will allow for cleaner code by
removing the find_vma() calls which check limits, making
find_vma_intersection() calls of a single address to be shorter, and
potentially reduce the incorrect uses of find_vma().

This patch (of 22):

Many places in the kernel use find_vma() to get a vma and then check the
start address of the vma to ensure the next vma was not returned.

Other places use the find_vma_intersection() call with add, addr + 1 as
the range; looking for just the vma at a specific address.

The third use of find_vma() is by developers who do not know that the
function starts searching at the provided address upwards for the next
vma.  This results in a bug that is often overlooked for a long time.

Adding the new vma_lookup() function will allow for cleaner code by
removing the find_vma() calls which check limits, making
find_vma_intersection() calls of a single address to be shorter, and
potentially reduce the incorrect uses of find_vma().

Also change find_vma_intersection() comments and declaration to be of the
correct length and add kernel documentation style comment.

Link: https://lkml.kernel.org/r/20210521174745.2219620-1-Liam.Howlett@Oracle.com
Link: https://lkml.kernel.org/r/20210521174745.2219620-2-Liam.Howlett@Oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 include/linux/mm.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a886f48b6a0e..b0ddfac0486b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2712,17 +2712,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
 
-/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
-   NULL if none.  Assume start_addr < end_addr. */
-static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise.  Assumes
+ * start_addr < end_addr.
+ */
+static inline
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+					     unsigned long start_addr,
+					     unsigned long end_addr)
 {
-	struct vm_area_struct * vma = find_vma(mm,start_addr);
+	struct vm_area_struct *vma = find_vma(mm, start_addr);
 
 	if (vma && end_addr <= vma->vm_start)
 		vma = NULL;
 	return vma;
 }
 
+/**
+ * vma_lookup() - Find a VMA at a specific address
+ * @mm: The process address space.
+ * @addr: The user address.
+ *
+ * Return: The vm_area_struct at the given address, %NULL otherwise.
+ */
+static inline
+struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+
+	if (vma && addr < vma->vm_start)
+		vma = NULL;
+
+	return vma;
+}
+
 static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
 {
 	unsigned long vm_start = vma->vm_start;
-- 
Gitee


From af7c9caa2ac4f047ac5f57acfc0087b22e8a163f Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Mon, 28 Jun 2021 19:39:14 -0700
Subject: [PATCH 59/63] x86/sgx: use vma_lookup() in sgx_encl_find()

mainline inclusion
from mainline-5.14
commit 9ce2c3fc0be6e7d0bb2236a33bbb7a0f1943bd81
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 9ce2c3fc0be6 x86/sgx: use vma_lookup() in sgx_encl_find().
Backport for SGX virtualization support

--------------------------------

Use vma_lookup() to find the VMA at a specific address.  As vma_lookup()
will return NULL if the address is not within any VMA, the start address
no longer needs to be validated.

Link: https://lkml.kernel.org/r/20210521174745.2219620-10-Liam.Howlett@Oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/encl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index 6e74f85b6264..fec43ca65065 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
 {
 	struct vm_area_struct *result;
 
-	result = find_vma(mm, addr);
-	if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+	result = vma_lookup(mm, addr);
+	if (!result || result->vm_ops != &sgx_vm_ops)
 		return -EINVAL;
 
 	*vma = result;
-- 
Gitee


From 49f8efe7fc1b67e1b114cc509f9bfd17a6d4fcee Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 21 Oct 2021 16:11:54 -0400
Subject: [PATCH 60/63] x86/sgx/virt: extract sgx_vepc_remove_page

mainline inclusion
from mainline-5.14
commit fd5128e622d7834bb3f7ee23c2bbea8db63cebaf
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit fd5128e622d7 x86/sgx/virt: extract sgx_vepc_remove_page.
Backport for SGX virtualization support

--------------------------------

For bare-metal SGX on real hardware, the hardware provides guarantees
SGX state at reboot.  For instance, all pages start out uninitialized.
The vepc driver provides a similar guarantee today for freshly-opened
vepc instances, but guests such as Windows expect all pages to be in
uninitialized state on startup, including after every guest reboot.

One way to do this is to simply close and reopen the /dev/sgx_vepc file
descriptor and re-mmap the virtual EPC.  However, this is problematic
because it prevents sandboxing the userspace (for example forbidding
open() after the guest starts; this is doable with heavy use of SCM_RIGHTS
file descriptor passing).

In order to implement this, we will need a ioctl that performs
EREMOVE on all pages mapped by a /dev/sgx_vepc file descriptor:
other possibilities, such as closing and reopening the device,
are racy.

Start the implementation by creating a separate function with just
the __eremove wrapper.

Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20211021201155.1523989-2-pbonzini@redhat.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/virt.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 8f7dd1dc4189..09674c824ae9 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
 {
-	int ret;
-
 	/*
 	 * Take a previously guest-owned EPC page and return it to the
 	 * general EPC page pool.
@@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 	 * case that a guest properly EREMOVE'd this page, a superfluous
 	 * EREMOVE is harmless.
 	 */
-	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+	return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+	int ret = sgx_vepc_remove_page(epc_page);
 	if (ret) {
 		/*
 		 * Only SGX_CHILD_PRESENT is expected, which is because of
@@ -144,7 +147,6 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 	}
 
 	sgx_free_epc_page(epc_page);
-
 	return 0;
 }
 
-- 
Gitee


From 3377cda5884bbdc2bd1e7fecde51a287abc194fd Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 21 Oct 2021 16:11:55 -0400
Subject: [PATCH 61/63] x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE ioctl

mainline inclusion
from mainline-5.16
commit ae095b16fc652f459e6c16a256834985c85ecc4d
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ae095b16fc65 x86/sgx/virt: implement SGX_IOC_VEPC_REMOVE
ioctl.
Backport for SGX virtualization support

--------------------------------

For bare-metal SGX on real hardware, the hardware provides guarantees
SGX state at reboot.  For instance, all pages start out uninitialized.
The vepc driver provides a similar guarantee today for freshly-opened
vepc instances, but guests such as Windows expect all pages to be in
uninitialized state on startup, including after every guest reboot.

Some userspace implementations of virtual SGX would rather avoid having
to close and reopen the /dev/sgx_vepc file descriptor and re-mmap the
virtual EPC.  For example, they could sandbox themselves after the guest
starts and forbid further calls to open(), in order to mitigate exploits
from untrusted guests.

Therefore, add a ioctl that does this with EREMOVE.  Userspace can
invoke the ioctl to bring its vEPC pages back to uninitialized state.
There is a possibility that some pages fail to be removed if they are
SECS pages, and the child and SECS pages could be in separate vEPC
regions.  Therefore, the ioctl returns the number of EREMOVE failures,
telling userspace to try the ioctl again after it's done with all
vEPC regions.  A more verbose description of the correct usage and
the possible error conditions is documented in sgx.rst.

Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20211021201155.1523989-3-pbonzini@redhat.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 Documentation/x86/sgx.rst       | 35 ++++++++++++++++++++++
 arch/x86/include/uapi/asm/sgx.h |  2 ++
 arch/x86/kernel/cpu/sgx/virt.c  | 53 +++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
index dd0ac96ff9ef..a608f667fb95 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -250,3 +250,38 @@ user wants to deploy SGX applications both on the host and in guests
 on the same machine, the user should reserve enough EPC (by taking out
 total virtual EPC size of all SGX VMs from the physical EPC size) for
 host SGX applications so they can run with acceptable performance.
+
+Architectural behavior is to restore all EPC pages to an uninitialized
+state also after a guest reboot.  Because this state can be reached only
+through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc``
+provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction
+on all pages in the virtual EPC.
+
+``EREMOVE`` can fail for three reasons.  Userspace must pay attention
+to expected failures and handle them as follows:
+
+1. Page removal will always fail when any thread is running in the
+   enclave to which the page belongs.  In this case the ioctl will
+   return ``EBUSY`` independent of whether it has successfully removed
+   some pages; userspace can avoid these failures by preventing execution
+   of any vcpu which maps the virtual EPC.
+
+2. Page removal will cause a general protection fault if two calls to
+   ``EREMOVE`` happen concurrently for pages that refer to the same
+   "SECS" metadata pages.  This can happen if there are concurrent
+   invocations to ``SGX_IOC_VEPC_REMOVE_ALL``, or if a ``/dev/sgx_vepc``
+   file descriptor in the guest is closed at the same time as
+   ``SGX_IOC_VEPC_REMOVE_ALL``; it will also be reported as ``EBUSY``.
+   This can be avoided in userspace by serializing calls to the ioctl()
+   and to close(), but in general it should not be a problem.
+
+3. Finally, page removal will fail for SECS metadata pages which still
+   have child pages.  Child pages can be removed by executing
+   ``SGX_IOC_VEPC_REMOVE_ALL`` on all ``/dev/sgx_vepc`` file descriptors
+   mapped into the guest.  This means that the ioctl() must be called
+   twice: an initial set of calls to remove child pages and a subsequent
+   set of calls to remove SECS pages.  The second set of calls is only
+   required for those mappings that returned a nonzero value from the
+   first call.  It indicates a bug in the kernel or the userspace client
+   if any of the second round of ``SGX_IOC_VEPC_REMOVE_ALL`` calls has
+   a return code other than 0.
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 791e45334a4a..c815a6fec9aa 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -27,6 +27,8 @@ enum sgx_page_flags {
 	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
 #define SGX_IOC_ENCLAVE_PROVISION \
 	_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
+#define SGX_IOC_VEPC_REMOVE_ALL \
+	_IO(SGX_MAGIC, 0x04)
 
 /**
  * struct sgx_enclave_create - parameter structure for the
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 09674c824ae9..b4f9a50de776 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -150,6 +150,41 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 	return 0;
 }
 
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+	struct sgx_epc_page *entry;
+	unsigned long index;
+	long failures = 0;
+
+	xa_for_each(&vepc->page_array, index, entry) {
+		int ret = sgx_vepc_remove_page(entry);
+		if (ret) {
+			if (ret == SGX_CHILD_PRESENT) {
+				/* The page is a SECS, userspace will retry.  */
+				failures++;
+			} else {
+				/*
+				 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+				 * WARN, as userspace can induce said failures by
+				 * calling the ioctl concurrently on multiple vEPCs or
+				 * while one or more CPUs is running the enclave.  Only
+				 * a #PF on EREMOVE indicates a kernel/hardware issue.
+				 */
+				WARN_ON_ONCE(encls_faulted(ret) &&
+					     ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+				return -EBUSY;
+			}
+		}
+		cond_resched();
+	}
+
+	/*
+	 * Return the number of SECS pages that failed to be removed, so
+	 * userspace knows that it has to retry.
+	 */
+	return failures;
+}
+
 static int sgx_vepc_release(struct inode *inode, struct file *file)
 {
 	struct sgx_vepc *vepc = file->private_data;
@@ -235,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static long sgx_vepc_ioctl(struct file *file,
+			   unsigned int cmd, unsigned long arg)
+{
+	struct sgx_vepc *vepc = file->private_data;
+
+	switch (cmd) {
+	case SGX_IOC_VEPC_REMOVE_ALL:
+		if (arg)
+			return -EINVAL;
+		return sgx_vepc_remove_all(vepc);
+
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct file_operations sgx_vepc_fops = {
 	.owner		= THIS_MODULE,
 	.open		= sgx_vepc_open,
+	.unlocked_ioctl	= sgx_vepc_ioctl,
+	.compat_ioctl	= sgx_vepc_ioctl,
 	.release	= sgx_vepc_release,
 	.mmap		= sgx_vepc_mmap,
 };
-- 
Gitee


From c586f7db3dd413eade7a026416f2535ee2d07a98 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Mon, 15 Nov 2021 11:29:04 -0800
Subject: [PATCH 62/63] x86/sgx: Fix free page accounting

mainline inclusion
from mainline-5.16
commit ac5d272a0ad0419f52e08c91953356e32b075af7
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit ac5d272a0ad0 x86/sgx: Fix free page accounting
Backport for SGX virtualization support

--------------------------------

The SGX driver maintains a single global free page counter,
sgx_nr_free_pages, that reflects the number of free pages available
across all NUMA nodes. Correspondingly, a list of free pages is
associated with each NUMA node and sgx_nr_free_pages is updated
every time a page is added or removed from any of the free page
lists. The main usage of sgx_nr_free_pages is by the reclaimer
that runs when it (sgx_nr_free_pages) goes below a watermark
to ensure that there are always some free pages available to, for
example, support efficient page faults.

With sgx_nr_free_pages accessed and modified from a few places
it is essential to ensure that these accesses are done safely but
this is not the case. sgx_nr_free_pages is read without any
protection and updated with inconsistent protection by any one
of the spin locks associated with the individual NUMA nodes.
For example:

      CPU_A                                 CPU_B
      -----                                 -----
 spin_lock(&nodeA->lock);              spin_lock(&nodeB->lock);
 ...                                   ...
 sgx_nr_free_pages--;  /* NOT SAFE */  sgx_nr_free_pages--;

 spin_unlock(&nodeA->lock);            spin_unlock(&nodeB->lock);

Since sgx_nr_free_pages may be protected by different spin locks
while being modified from different CPUs, the following scenario
is possible:

      CPU_A                                CPU_B
      -----                                -----
{sgx_nr_free_pages = 100}
 spin_lock(&nodeA->lock);              spin_lock(&nodeB->lock);
 sgx_nr_free_pages--;                  sgx_nr_free_pages--;
 /* LOAD sgx_nr_free_pages = 100 */    /* LOAD sgx_nr_free_pages = 100 */
 /* sgx_nr_free_pages--          */    /* sgx_nr_free_pages--          */
 /* STORE sgx_nr_free_pages = 99 */    /* STORE sgx_nr_free_pages = 99 */
 spin_unlock(&nodeA->lock);            spin_unlock(&nodeB->lock);

In the above scenario, sgx_nr_free_pages is decremented from two CPUs
but instead of sgx_nr_free_pages ending with a value that is two less
than it started with, it was only decremented by one while the number
of free pages were actually reduced by two. The consequence of
sgx_nr_free_pages not being protected is that its value may not
accurately reflect the actual number of free pages on the system,
impacting the availability of free pages in support of many flows.

The problematic scenario is when the reclaimer does not run because it
believes there to be sufficient free pages while any attempt to allocate
a page fails because there are no free pages available. In the SGX driver
the reclaimer's watermark is only 32 pages so after encountering the
above example scenario 32 times a user space hang is possible when there
are no more free pages because of repeated page faults caused by no
free pages made available.

The following flow was encountered:
asm_exc_page_fault
 ...
   sgx_vma_fault()
     sgx_encl_load_page()
       sgx_encl_eldu() // Encrypted page needs to be loaded from backing
                       // storage into newly allocated SGX memory page
         sgx_alloc_epc_page() // Allocate a page of SGX memory
           __sgx_alloc_epc_page() // Fails, no free SGX memory
           ...
           if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer
             wake_up(&ksgxd_waitq);
           return -EBUSY; // Return -EBUSY giving reclaimer time to run
       return -EBUSY;
     return -EBUSY;
   return VM_FAULT_NOPAGE;

The reclaimer is triggered in above flow with the following code:

static bool sgx_should_reclaim(unsigned long watermark)
{
        return sgx_nr_free_pages < watermark &&
               !list_empty(&sgx_active_page_list);
}

In the problematic scenario there were no free pages available yet the
value of sgx_nr_free_pages was above the watermark. The allocation of
SGX memory thus always failed because of a lack of free pages while no
free pages were made available because the reclaimer is never started
because of sgx_nr_free_pages' incorrect value. The consequence was that
user space kept encountering VM_FAULT_NOPAGE that caused the same
address to be accessed repeatedly with the same result.

Change the global free page counter to an atomic type that
ensures simultaneous updates are done safely. While doing so, move
the updating of the variable outside of the spin lock critical
section to which it does not belong.

Cc: stable@vger.kernel.org
Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()")
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index ad904747419e..379eeefa744a 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
 static LIST_HEAD(sgx_active_page_list);
 static DEFINE_SPINLOCK(sgx_reclaimer_lock);
 
-/* The free page list lock protected variables prepend the lock. */
-static unsigned long sgx_nr_free_pages;
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
 
 /* Nodes with one or more EPC sections. */
 static nodemask_t sgx_numa_mask;
@@ -403,14 +402,15 @@ static void sgx_reclaim_pages(void)
 
 		spin_lock(&node->lock);
 		list_add_tail(&epc_page->list, &node->free_page_list);
-		sgx_nr_free_pages++;
 		spin_unlock(&node->lock);
+		atomic_long_inc(&sgx_nr_free_pages);
 	}
 }
 
 static bool sgx_should_reclaim(unsigned long watermark)
 {
-	return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
+	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+	       !list_empty(&sgx_active_page_list);
 }
 
 static int ksgxd(void *p)
@@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
 
 	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
 	list_del_init(&page->list);
-	sgx_nr_free_pages--;
 
 	spin_unlock(&node->lock);
+	atomic_long_dec(&sgx_nr_free_pages);
 
 	return page;
 }
@@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
 	spin_lock(&node->lock);
 
 	list_add_tail(&page->list, &node->free_page_list);
-	sgx_nr_free_pages++;
 
 	spin_unlock(&node->lock);
+	atomic_long_inc(&sgx_nr_free_pages);
 }
 
 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
-- 
Gitee


From 9be4bee162775adaae218fba56179f6189a6a4b2 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Tue, 8 Feb 2022 10:48:07 -0800
Subject: [PATCH 63/63] x86/sgx: Silence softlockup detection when releasing
 large enclaves

mainline inclusion
from mainline-5.17
commit 8795359e35bc33bf86b6d0765aa7f37431db3b9c
category: feature
bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5EZEK
CVE: NA

Intel-SIG: commit 8795359e35bc x86/sgx: Silence softlockup detection
when releasing large enclaves.
Backport for SGX virtualization support

--------------------------------

Vijay reported that the "unclobbered_vdso_oversubscribed" selftest
triggers the softlockup detector.

Actual SGX systems have 128GB of enclave memory or more.  The
"unclobbered_vdso_oversubscribed" selftest creates one enclave which
consumes all of the enclave memory on the system. Tearing down such a
large enclave takes around a minute, most of it in the loop where
the EREMOVE instruction is applied to each individual 4k enclave page.

Spending one minute in a loop triggers the softlockup detector.

Add a cond_resched() to give other tasks a chance to run and placate
the softlockup detector.

Cc: stable@vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Reported-by: Vijay Dhanraj <vijay.dhanraj@intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>  (kselftest as sanity check)
Link: https://lkml.kernel.org/r/ced01cac1e75f900251b0a4ae1150aa8ebd295ec.1644345232.git.reinette.chatre@intel.com
Signed-off-by: Fan Du <fan.du@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
---
 arch/x86/kernel/cpu/sgx/encl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 3be203297988..87fed32a30a8 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -410,6 +410,8 @@ void sgx_encl_release(struct kref *ref)
 		}
 
 		kfree(entry);
+		/* Invoke scheduler to prevent soft lockups. */
+		cond_resched();
 	}
 
 	xa_destroy(&encl->page_array);
-- 
Gitee