From 5694b2e450d4ad76c3314ad8ac73025bc616930e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 16 Feb 2017 10:40:56 +0100 Subject: [PATCH 01/12] kvm: make KVM_CAP_ENABLE_CAP_VM architecture agnostic mainline inclusion from mainline-v5.0 commit: e5d83c74a5800c2a1fa3ba982c1c4b2b39ae6db2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=e5d83c74a5800c2a1fa3ba982c1c4b2b39ae6db2 -------------------------------- The first such capability to be handled in virt/kvm/ will be manual dirty page reprotection. Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 13 +++++++++---- arch/powerpc/kvm/powerpc.c | 14 ++------------ arch/s390/kvm/kvm-s390.c | 11 +---------- arch/x86/kvm/x86.c | 14 ++------------ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 25 +++++++++++++++++++++++++ 6 files changed, 41 insertions(+), 38 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3072aaf2278b..181b074dc914 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1130,10 +1130,15 @@ documentation when it pops into existence). 
4.37 KVM_ENABLE_CAP -Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM -Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM), - mips (only KVM_CAP_ENABLE_CAP), ppc, s390 -Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) +Capability: KVM_CAP_ENABLE_CAP +Architectures: mips, ppc, s390 +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + +Capability: KVM_CAP_ENABLE_CAP_VM +Architectures: all +Type: vm ioctl Parameters: struct kvm_enable_cap (in) Returns: 0 on success; -1 on error diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 51cd66dc1bb0..ee89f6fbea86 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -523,7 +523,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: @@ -2090,8 +2089,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -2271,15 +2270,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } - case KVM_ENABLE_CAP: - { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_CREATE_SPAPR_TCE_64: { struct kvm_create_spapr_tce_64 create_tce_64; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index db3196aebaa1..57f2f83cc375 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -474,7 +474,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_S390_IRQCHIP: case 
KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -618,7 +617,7 @@ static void icpt_operexc_on_all_vcpus(struct kvm *kvm) } } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -1912,14 +1911,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_inject_vm(kvm, &s390int); break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - break; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } case KVM_CREATE_IRQCHIP: { struct kvm_irq_routing_entry routing; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c61cab768d2a..9f51405ac43f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3091,7 +3091,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: @@ -4489,8 +4488,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, return 0; } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -4819,15 +4818,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops->mem_enc_op) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 386ac3f48178..34a8d08bee7a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -812,6 +812,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct 
kvm_irq_level *irq_level, bool line_status); +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 38e758e0f452..2fe428e880c1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3042,6 +3042,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: + case KVM_CAP_ENABLE_CAP_VM: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3061,6 +3062,21 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return kvm_vm_ioctl_check_extension(kvm, arg); } +int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + return -EINVAL; +} + +static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + switch (cap->cap) { + default: + return kvm_vm_ioctl_enable_cap(kvm, cap); + } +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3074,6 +3090,15 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); + break; + } case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region kvm_userspace_mem; -- Gitee From b8194d785d6f974174a7be07fbe5a726667b07fc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Oct 2018 02:18:42 +0200 Subject: [PATCH 02/12] kvm: rename last argument to kvm_get_dirty_log_protect mainline inclusion from mainline-v5.0 commit: 8fe65a8299f9e1f40cb95308ab7b3c4ad80bf801 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=8fe65a8299f9e1f40cb95308ab7b3c4ad80bf801 -------------------------------- When manual dirty log reprotect will be enabled, kvm_get_dirty_log_protect's pointer argument will always be false on exit, because no TLB flush is needed until the manual re-protection operation. Rename it from "is_dirty" to "flush", which more accurately tells the caller what they have to do with it. Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 6 +++--- arch/x86/kvm/x86.c | 6 +++--- include/linux/kvm_host.h | 2 +- virt/kvm/arm/arm.c | 6 +++--- virt/kvm/kvm_main.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e7f5ef6bed0f..e9313a50d81f 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1004,14 +1004,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (is_dirty) { + if (flush) { slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, log->slot); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f51405ac43f..543eb0c11672 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4451,7 +4451,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); @@ -4462,14 +4462,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) if (kvm_x86_ops->flush_log_dirty) kvm_x86_ops->flush_log_dirty(kvm); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); /* * All the TLBs can be flushed 
out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). */ lockdep_assert_held(&kvm->slots_lock); - if (is_dirty) + if (flush) kvm_flush_remote_tlbs(kvm); mutex_unlock(&kvm->slots_lock); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34a8d08bee7a..9267cfb0a38e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -800,7 +800,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty); int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *is_dirty); + struct kvm_dirty_log *log, bool *flush); void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 1271779873d3..4870b0795587 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1319,14 +1319,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (is_dirty) + if (flush) kvm_flush_remote_tlbs(kvm); mutex_unlock(&kvm->slots_lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2fe428e880c1..e499a8a8383d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1204,7 +1204,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); * */ int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *is_dirty) + struct kvm_dirty_log *log, bool *flush) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1231,7 +1231,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, memset(dirty_bitmap_buffer, 0, n); spin_lock(&kvm->mmu_lock); - *is_dirty = false; + *flush = false; for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; @@ -1239,7 +1239,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, if 
(!dirty_bitmap[i]) continue; - *is_dirty = true; + *flush = true; mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; -- Gitee From f6636bd76ef29cbf69bac5d749d8a59f2b23589f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Oct 2018 02:36:47 +0200 Subject: [PATCH 03/12] kvm: introduce manual dirty log reprotect mainline inclusion from mainline-v5.0 commit: 2a31b9db153530df4aa02dac8c32837bf5f47019 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=2a31b9db153530df4aa02dac8c32837bf5f47019 -------------------------------- There are two problems with KVM_GET_DIRTY_LOG. First, and less important, it can take kvm->mmu_lock for an extended period of time. Second, its user can actually see many false positives in some cases. The latter is due to a benign race like this: 1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects them. 2. The guest modifies the pages, causing them to be marked dirty. 3. Userspace actually copies the pages. 4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though they were not written to since (3). This is especially a problem for large guests, where the time between (1) and (3) can be substantial. This patch introduces a new capability which, when enabled, makes KVM_GET_DIRTY_LOG not write-protect the pages it returns. Instead, userspace has to explicitly clear the dirty log bits just before using the content of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a 64-page granularity rather than requiring to sync a full memslot; this way, the mmu_lock is taken for small amounts of time, and only a small amount of time will pass between write protection of pages and the sending of their content. Signed-off-by: Paolo Bonzini [Remove test code because the test code is too different.] 
Signed-off-by: zhengchuan --- Documentation/virtual/kvm/api.txt | 67 +++++++++++++++ arch/mips/kvm/mips.c | 23 ++++++ arch/x86/kvm/x86.c | 27 ++++++ include/linux/kvm_host.h | 5 ++ include/uapi/linux/kvm.h | 15 ++++ virt/kvm/arm/arm.c | 16 ++++ virt/kvm/kvm_main.c | 132 ++++++++++++++++++++++++++---- 7 files changed, 268 insertions(+), 17 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 181b074dc914..0ff6dd775cab 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -311,6 +311,9 @@ the address space for which you want to return the dirty bitmap. They must be less than the value that KVM_CHECK_EXTENSION returns for the KVM_CAP_MULTI_ADDRESS_SPACE capability. +The bits in the dirty bitmap are cleared before the ioctl returns, unless +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled. For more information, +see the description of the capability. 4.9 KVM_SET_MEMORY_ALIAS @@ -3726,6 +3729,46 @@ Returns: 0 on success, -1 on error This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. +4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) + +Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_dirty_log (in) +Returns: 0 on success, -1 on error + +/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +The ioctl clears the dirty status of pages in a memory slot, according to +the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap +field. Bit 0 of the bitmap corresponds to page "first_page" in the +memory slot, and num_pages is the size in bits of the input bitmap. +Both first_page and num_pages must be a multiple of 64. 
For each bit +that is set in the input bitmap, the corresponding page is marked "clean" +in KVM's dirty bitmap, and dirty tracking is re-enabled for that page +(for example via write-protection, or by clearing the dirty bit in +a page table entry). + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +is enabled; for more information, see the description of the capability. +However, it can always be used as long as KVM_CHECK_EXTENSION confirms +that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present. + + 5. The kvm_run structure ------------------------ @@ -4581,6 +4624,30 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, a #GP would be raised when the guest tries to access. Currently, this capability does not enable write permissions of this MSR for the guest. +7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT + +Architectures: all +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, KVM_GET_DIRTY_LOG will not automatically +clear and write-protect all pages that are returned as dirty. +Rather, userspace will have to do this operation separately using +KVM_CLEAR_DIRTY_LOG. + +At the cost of a slightly more complicated operation, this provides better +scalability and responsiveness for two reasons. First, +KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather +than requiring to sync a full memslot; this ensures that KVM does not +take spinlocks for an extended period of time. Second, in some cases a +large amount of time can pass between a call to KVM_GET_DIRTY_LOG and +userspace actually using the data in the page. 
Pages can be modified +during this time, which is inefficient for both the guest and userspace: +the guest will incur a higher penalty due to write protection faults, +while userspace can see false reports of dirty pages. Manual reprotection +helps reducing this time, improving guest performance and reducing the +number of dirty log false positives. + + 8. Other capabilities. ---------------------- diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e9313a50d81f..e3f7606bdbb4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1023,6 +1023,29 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return r; } +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + if (flush) { + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); + + /* Let implementation handle TLB/GVA invalidation */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); + } + + mutex_unlock(&kvm->slots_lock); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { long r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 543eb0c11672..90afa57f7a1e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4476,6 +4476,33 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return r; } +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + /* + * Flush potentially hardware-cached dirty pages to dirty_bitmap. 
+ */ + if (kvm_x86_ops->flush_log_dirty) + kvm_x86_ops->flush_log_dirty(kvm); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + /* + * All the TLBs can be flushed out of mmu lock, see the comments in + * kvm_mmu_slot_remove_write_access(). + */ + lockdep_assert_held(&kvm->slots_lock); + if (flush) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; +} + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9267cfb0a38e..3ec2e7f066f8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -472,6 +472,7 @@ struct kvm { #endif long tlbs_dirty; struct list_head devices; + bool manual_dirty_log_protect; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; @@ -801,6 +802,8 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log, bool *flush); +int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log, bool *flush); void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -809,6 +812,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, + struct kvm_clear_dirty_log *log); int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index eb1bb50a5c56..eb9987b962ed 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -486,6 +486,17 @@ struct kvm_dirty_log { }; }; +/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + /* for KVM_SET_SIGNAL_MASK */ 
struct kvm_signal_mask { __u32 len; @@ -963,6 +974,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #define KVM_CAP_MSR_PLATFORM_INFO 159 #define KVM_CAP_ARM_VM_IPA_SIZE 165 /* returns maximum IPA bits for a VM */ +#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_ARM_CPU_FEATURE 555 @@ -1423,6 +1435,9 @@ struct kvm_enc_region { #define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) #define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) +/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */ +#define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4870b0795587..f12efc694f90 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1333,6 +1333,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return r; } +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + if (flush) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; +} + static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e499a8a8383d..55b117ca37c6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1183,7 +1183,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages - * are dirty write protect them for next write. + * and reenable dirty page tracking for the corresponding pages. 
* @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: flag set if any page is dirty @@ -1226,37 +1226,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); + *flush = false; + if (kvm->manual_dirty_log_protect) { + /* + * Unlike kvm_get_dirty_log, we always return false in *flush, + * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There + * is some code duplication between this function and + * kvm_get_dirty_log, but hopefully all architecture + * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log + * can be eliminated. + */ + dirty_bitmap_buffer = dirty_bitmap; + } else { + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); - memset(dirty_bitmap_buffer, 0, n); + spin_lock(&kvm->mmu_lock); + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - spin_lock(&kvm->mmu_lock); + if (!dirty_bitmap[i]) + continue; + + *flush = true; + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + if (mask) { + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); + } + } + spin_unlock(&kvm->mmu_lock); + } + + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); + +/** + * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap + * and reenable dirty page tracking for the corresponding pages. 
+ * @kvm: pointer to kvm instance + * @log: slot id and address from which to fetch the bitmap of dirty pages + */ +int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log, bool *flush) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int as_id, id, n; + gfn_t offset; + unsigned long i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + + as_id = log->slot >> 16; + id = (u16)log->slot; + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + return -EINVAL; + + if ((log->first_page & 63) || (log->num_pages & 63)) + return -EINVAL; + + slots = __kvm_memslots(kvm, as_id); + memslot = id_to_memslot(slots, id); + + dirty_bitmap = memslot->dirty_bitmap; + if (!dirty_bitmap) + return -ENOENT; + + n = kvm_dirty_bitmap_bytes(memslot); *flush = false; - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) + return -EFAULT; - if (!dirty_bitmap[i]) + spin_lock(&kvm->mmu_lock); + for (offset = log->first_page, + i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; + i++, offset += BITS_PER_LONG) { + unsigned long mask = *dirty_bitmap_buffer++; + atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; + if (!mask) continue; - *flush = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; + mask &= atomic_long_fetch_andnot(mask, p); + /* + * mask contains the bits that really have been cleared. This + * never includes any bits beyond the length of the memslot (if + * the length is not aligned to 64 pages), therefore it is not + * a problem if userspace sets them in log->dirty_bitmap. 
+ */ if (mask) { - offset = i * BITS_PER_LONG; + *flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } - spin_unlock(&kvm->mmu_lock); - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - return -EFAULT; + return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); +EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect); #endif bool kvm_largepages_enabled(void) @@ -3043,6 +3120,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: +#endif return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3072,6 +3152,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { switch (cap->cap) { +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: + if (cap->flags || (cap->args[0] & ~1)) + return -EINVAL; + kvm->manual_dirty_log_protect = cap->args[0]; + return 0; +#endif default: return kvm_vm_ioctl_enable_cap(kvm, cap); } @@ -3119,6 +3206,17 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CLEAR_DIRTY_LOG: { + struct kvm_clear_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof(log))) + goto out; + r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); + break; + } +#endif #ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; -- Gitee From 67ef1ab0be632f15eb84c4e42d44b60e745c21a8 Mon Sep 17 00:00:00 2001 From: Tomas Bortoli Date: Wed, 2 Jan 2019 18:29:37 +0100 Subject: [PATCH 04/12] KVM: validate userspace input in kvm_clear_dirty_log_protect() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.0 commit: 
98938aa8edd66dc95024d7c936a4bc315f6615ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=98938aa8edd66dc95024d7c936a4bc315f6615ff -------------------------------- The function at issue does not fully validate the content of the structure pointed by the log parameter, though its content has just been copied from userspace and lacks validation. Fix that. Moreover, change the type of n to unsigned long as that is the type returned by kvm_dirty_bitmap_bytes(). Signed-off-by: Tomas Bortoli Reported-by: syzbot+028366e52c9ace67deb3@syzkaller.appspotmail.com [Squashed the fix from Paolo. - Radim.] Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 55b117ca37c6..9881baf39976 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1279,9 +1279,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int as_id, id, n; + int as_id, id; gfn_t offset; - unsigned long i; + unsigned long i, n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; @@ -1301,6 +1301,11 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); + + if (log->first_page > memslot->npages || + log->num_pages > memslot->npages - log->first_page) + return -EINVAL; + *flush = false; dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) -- Gitee From 76f6024afa7952517ba5741710679c687b28e1b9 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sat, 2 Feb 2019 17:20:27 +0800 Subject: [PATCH 05/12] Revert "KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()" mainline inclusion from mainline-v5.1 commit: a67794cafbc4594debf53dbe4e2a7708426f492e category: 
bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=a67794cafbc4594debf53dbe4e2a7708426f492e -------------------------------- The value of "dirty_bitmap[i]" is already checked before setting its value to mask. The following check of "mask" is redundant. The check of "mask" was introduced by commit 58d2930f4ee3 ("KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()"), revert it. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9881baf39976..06f47308c87e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1253,11 +1253,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; - if (mask) { - offset = i * BITS_PER_LONG; - kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, - offset, mask); - } + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); } spin_unlock(&kvm->mmu_lock); } -- Gitee From 87bf4975f07291cf453caed1e2550732f90018ed Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 17 Apr 2019 15:28:44 +0200 Subject: [PATCH 06/12] KVM: fix KVM_CLEAR_DIRTY_LOG for memory slots of unaligned size mainline inclusion from mainline-v5.1 commit: 76d58e0f07ec203bbdfcaabd9a9fc10a5a3ed5ea category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=76d58e0f07ec203bbdfcaabd9a9fc10a5a3ed5ea -------------------------------- If a memory slot's size is not a multiple of 64 pages (256K), then the KVM_CLEAR_DIRTY_LOG API is unusable: clearing the final 64 pages either requires the requested page range to go beyond memslot->npages, or requires 
log->num_pages to be unaligned, and kvm_clear_dirty_log_protect requires log->num_pages to be both in range and aligned. To allow this case, allow log->num_pages not to be a multiple of 64 if it ends exactly on the last page of the slot. Reported-by: Peter Xu Fixes: 98938aa8edd6 ("KVM: validate userspace input in kvm_clear_dirty_log_protect()", 2019-01-02) Signed-off-by: Paolo Bonzini [Remove test code because the test code is too different.] Signed-off-by: zhengchuan --- Documentation/virtual/kvm/api.txt | 5 +++-- virt/kvm/kvm_main.c | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0ff6dd775cab..2b753800416d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3752,8 +3752,9 @@ The ioctl clears the dirty status of pages in a memory slot, according to the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap field. Bit 0 of the bitmap corresponds to page "first_page" in the memory slot, and num_pages is the size in bits of the input bitmap. -Both first_page and num_pages must be a multiple of 64. For each bit -that is set in the input bitmap, the corresponding page is marked "clean" +first_page must be a multiple of 64; num_pages must also be a multiple of +64 unless first_page + num_pages is the size of the memory slot. For each +bit that is set in the input bitmap, the corresponding page is marked "clean" in KVM's dirty bitmap, and dirty tracking is re-enabled for that page (for example via write-protection, or by clearing the dirty bit in a page table entry). 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 06f47308c87e..d0bcd84b6c80 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1288,7 +1288,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) return -EINVAL; - if ((log->first_page & 63) || (log->num_pages & 63)) + if (log->first_page & 63) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -1301,8 +1301,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); if (log->first_page > memslot->npages || - log->num_pages > memslot->npages - log->first_page) - return -EINVAL; + log->num_pages > memslot->npages - log->first_page || + (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) + return -EINVAL; *flush = false; dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); -- Gitee From d767613b94377c27e7cace6657f7d6a1a0536c00 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 23 Apr 2019 19:40:30 +0800 Subject: [PATCH 07/12] kvm_main: fix some comments mainline inclusion from mainline-v5.10 commit: b8b002209c061273fd1ef7bb3c3c32301623a282 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=b8b002209c061273fd1ef7bb3c3c32301623a282 -------------------------------- is_dirty has been renamed to flush, but the comment for it is outdated. And the description about @flush parameter for kvm_clear_dirty_log_protect() is missing, add it in this patch as well. 
Signed-off-by: Jiang Biao Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d0bcd84b6c80..04c802a87c99 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1182,11 +1182,11 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** - * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages + * kvm_get_dirty_log_protect - get a snapshot of dirty pages * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log - * @is_dirty: flag set if any page is dirty + * @flush: true if TLB flush is needed by caller * * We need to keep it in mind that VCPU threads can write to the bitmap * concurrently. So, to avoid losing track of dirty pages we keep the @@ -1271,6 +1271,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); * and reenable dirty page tracking for the corresponding pages. 
* @kvm: pointer to kvm instance * @log: slot id and address from which to fetch the bitmap of dirty pages + * @flush: true if TLB flush is needed by caller */ int kvm_clear_dirty_log_protect(struct kvm *kvm, struct kvm_clear_dirty_log *log, bool *flush) -- Gitee From 6022f129d5e5bf2e3dec0d78c0e2733e37756904 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 8 May 2019 17:15:45 +0800 Subject: [PATCH 08/12] KVM: Fix the bitmap range to copy during clear dirty mainline inclusion from mainline-v5.10 commit: 4ddc9204572c33f2eb91fbdb1d99d8078388b67d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=4ddc9204572c33f2eb91fbdb1d99d8078388b67d -------------------------------- kvm_dirty_bitmap_bytes() will return the size of the dirty bitmap of the memslot rather than the size of bitmap passed over from the ioctl. Here for KVM_CLEAR_DIRTY_LOG we should only copy exactly the size of bitmap that covers kvm_clear_dirty_log.num_pages. 
Signed-off-by: Peter Xu Cc: stable@vger.kernel.org Fixes: 2a31b9db153530df4aa02dac8c32837bf5f47019 Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04c802a87c99..26ef8797f45e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1299,7 +1299,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, if (!dirty_bitmap) return -ENOENT; - n = kvm_dirty_bitmap_bytes(memslot); + n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; if (log->first_page > memslot->npages || log->num_pages > memslot->npages - log->first_page || -- Gitee From 95a88f43bf85569320f947edf32bdc65bc1c71f9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 8 May 2019 17:15:46 +0800 Subject: [PATCH 09/12] KVM: Fix kvm_clear_dirty_log_protect off-by-(minus-)one mainline inclusion from mainline-v5.10 commit: 53eac7a8f8cf3d7dc5ecac1946f31442f5eee5f3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=53eac7a8f8cf3d7dc5ecac1946f31442f5eee5f3 -------------------------------- Just imagine the case where num_pages < BITS_PER_LONG, then the loop will be skipped while it shouldn't be.
Signed-off-by: Peter Xu Fixes: 2a31b9db153530df4aa02dac8c32837bf5f47019 Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 26ef8797f45e..fa094671cf3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1312,8 +1312,8 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, return -EFAULT; spin_lock(&kvm->mmu_lock); - for (offset = log->first_page, - i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; + for (offset = log->first_page, i = offset / BITS_PER_LONG, + n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { unsigned long mask = *dirty_bitmap_buffer++; atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; -- Gitee From 2c0c29fa94d59343109854afbc10ec932ace6b0c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 8 May 2019 17:15:47 +0800 Subject: [PATCH 10/12] KVM: Introduce KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 mainline inclusion from mainline-v5.10 commit: d7547c55cbe7471255ca51f14bcd4699f5eaabe5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=d7547c55cbe7471255ca51f14bcd4699f5eaabe5 -------------------------------- The previous KVM_CAP_MANUAL_DIRTY_LOG_PROTECT has some problem which blocks the correct usage from userspace. Obsolete the old one and introduce a new capability bit for it. Suggested-by: Paolo Bonzini Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini [Remove test code because the test code is too different.] 
Signed-off-by: zhengchuan --- Documentation/virtual/kvm/api.txt | 15 ++++++++++----- include/uapi/linux/kvm.h | 5 +++-- virt/kvm/kvm_main.c | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2b753800416d..b9c0aea0e365 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -312,7 +312,7 @@ They must be less than the value that KVM_CHECK_EXTENSION returns for the KVM_CAP_MULTI_ADDRESS_SPACE capability. The bits in the dirty bitmap are cleared before the ioctl returns, unless -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled. For more information, +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, see the description of the capability. 4.9 KVM_SET_MEMORY_ALIAS @@ -3731,7 +3731,7 @@ the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. 4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) -Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 Architectures: x86 Type: vm ioctl Parameters: struct kvm_dirty_log (in) @@ -3764,10 +3764,10 @@ the address space for which you want to return the dirty bitmap. They must be less than the value that KVM_CHECK_EXTENSION returns for the KVM_CAP_MULTI_ADDRESS_SPACE capability. -This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled; for more information, see the description of the capability. However, it can always be used as long as KVM_CHECK_EXTENSION confirms -that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present. +that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present. 5. The kvm_run structure @@ -4625,7 +4625,7 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, a #GP would be raised when the guest tries to access. Currently, this capability does not enable write permissions of this MSR for the guest. 
-7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 Architectures: all Parameters: args[0] whether feature should be enabled or not @@ -4648,6 +4648,11 @@ while userspace can see false reports of dirty pages. Manual reprotection helps reducing this time, improving guest performance and reducing the number of dirty log false positives. +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make +it hard or impossible to use it correctly. The availability of +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. +Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. 8. Other capabilities. ---------------------- diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index eb9987b962ed..dde33f8e9adf 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -974,7 +974,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #define KVM_CAP_MSR_PLATFORM_INFO 159 #define KVM_CAP_ARM_VM_IPA_SIZE 165 /* returns maximum IPA bits for a VM */ -#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 +#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 /* Obsolete */ +#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 168 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_ARM_CPU_FEATURE 555 @@ -1435,7 +1436,7 @@ struct kvm_enc_region { #define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) #define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) -/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */ +/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 */ #define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log) /* Secure Encrypted Virtualization command */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa094671cf3e..a989b1d5fd41 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3126,7 +3126,7 @@ static long
kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: #endif return 1; #ifdef CONFIG_KVM_MMIO @@ -3158,7 +3158,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, { switch (cap->cap) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: if (cap->flags || (cap->args[0] & ~1)) return -EINVAL; kvm->manual_dirty_log_protect = cap->args[0]; -- Gitee From 1135dc02a7bb6d2b862392863c29d064675b5711 Mon Sep 17 00:00:00 2001 From: Jay Zhou Date: Thu, 27 Feb 2020 09:32:27 +0800 Subject: [PATCH 11/12] KVM: x86: enable dirty log gradually in small chunks mainline inclusion from mainline-v5.10 commit: 3c9bd4006bfc2dccda1823db61b3f470ef91cfaa category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=3c9bd4006bfc2dccda1823db61b3f470ef91cfaa -------------------------------- It could take kvm->mmu_lock for an extended period of time when enabling dirty log for the first time. The main cost is to clear all the D-bits of last level SPTEs. This situation can benefit from manual dirty log protect as well, which can reduce the mmu_lock time taken. The sequence is like this: 1. Initialize all the bits of the dirty bitmap to 1 when enabling dirty log for the first time 2. Only write protect the huge pages 3. KVM_GET_DIRTY_LOG returns the dirty bitmap info 4. 
KVM_CLEAR_DIRTY_LOG will clear D-bit for each of the leaf level SPTEs gradually in small chunks Under the Intel(R) Xeon(R) Gold 6152 CPU @ 2.10GHz environment, I did some tests with a 128G Windows VM and counted the time taken by memory_global_dirty_log_start; here are the numbers: VM Size Before After optimization 128G 460ms 10ms Signed-off-by: Jay Zhou Signed-off-by: Paolo Bonzini [Remove test code because the test code is too different.] Signed-off-by: zhengchuan --- Documentation/virtual/kvm/api.txt | 18 +++++++++++++++--- arch/x86/include/asm/kvm_host.h | 6 +++++- arch/x86/kvm/mmu.c | 7 ++++--- arch/x86/kvm/vmx.c | 3 ++- arch/x86/kvm/x86.c | 21 +++++++++++++++++---- include/linux/kvm_host.h | 11 ++++++++++- include/uapi/linux/kvm.h | 3 +++ virt/kvm/kvm_main.c | 24 +++++++++++++++++------- 8 files changed, 73 insertions(+), 20 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b9c0aea0e365..c4770f01cea1 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4630,8 +4630,13 @@ capability does not enable write permissions of this MSR for the guest. Architectures: all Parameters: args[0] whether feature should be enabled or not -With this capability enabled, KVM_GET_DIRTY_LOG will not automatically -clear and write-protect all pages that are returned as dirty. +Valid flags are:: + + #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) + #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) + +With KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE set, KVM_GET_DIRTY_LOG will not +automatically clear and write-protect all pages that are returned as dirty. Rather, userspace will have to do this operation separately using KVM_CLEAR_DIRTY_LOG. @@ -4642,12 +4647,19 @@ than requiring to sync a full memslot; this ensures that KVM does not take spinlocks for an extended period of time.
Second, in some cases a large amount of time can pass between a call to KVM_GET_DIRTY_LOG and userspace actually using the data in the page. Pages can be modified -during this time, which is inefficint for both the guest and userspace: +during this time, which is inefficient for both the guest and userspace: the guest will incur a higher penalty due to write protection faults, while userspace can see false reports of dirty pages. Manual reprotection helps reducing this time, improving guest performance and reducing the number of dirty log false positives. +With KVM_DIRTY_LOG_INITIALLY_SET set, all the bits of the dirty bitmap +will be initialized to 1 when created. This also improves performance because +dirty logging can be enabled gradually in small chunks on the first call +to KVM_CLEAR_DIRTY_LOG. KVM_DIRTY_LOG_INITIALLY_SET depends on +KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on +x86 for now). + KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make it hard or impossible to use it correctly. 
The availability of diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7afb2cb2d42c..3eae4fd19060 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -49,6 +49,9 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS +#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ + KVM_DIRTY_LOG_INITIALLY_SET) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -1205,7 +1208,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot); + struct kvm_memory_slot *memslot, + int start_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff0901e8e606..f0b393dc74f7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5719,13 +5719,14 @@ static bool slot_rmap_write_protect(struct kvm *kvm, } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) + struct kvm_memory_slot *memslot, + int start_level) { bool flush; spin_lock(&kvm->mmu_lock); - flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, - false); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, PT_MAX_HUGEPAGE_LEVEL, false); spin_unlock(&kvm->mmu_lock); /* diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4164016cf105..85948d028ff8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -13918,7 +13918,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) static void vmx_slot_enable_log_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) 
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot); kvm_mmu_slot_largepage_remove_write_access(kvm, slot); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 90afa57f7a1e..aa8674f65dee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9368,7 +9368,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, { /* Still write protect RO slot */ if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL); return; } @@ -9403,10 +9403,23 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * See the comments in fast_page_fault(). */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops->slot_enable_log_dirty) + if (kvm_x86_ops->slot_enable_log_dirty) { kvm_x86_ops->slot_enable_log_dirty(kvm, new); - else - kvm_mmu_slot_remove_write_access(kvm, new); + } else { + int level = + kvm_dirty_log_manual_protect_and_init_set(kvm) ? + PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL; + + /* + * If we're with initial-all-set, we don't need + * to write protect any small page because + * they're reported as dirty already. However + * we still need to write-protect huge pages + * so that the page split can happen lazily on + * the first write to the huge page. 
+ */ + kvm_mmu_slot_remove_write_access(kvm, new, level); + } } else { if (kvm_x86_ops->slot_disable_log_dirty) kvm_x86_ops->slot_disable_log_dirty(kvm, new); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3ec2e7f066f8..0435ec43c432 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -339,6 +339,10 @@ static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *mem return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap); } +#ifndef KVM_DIRTY_LOG_MANUAL_CAPS +#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE +#endif + struct kvm_s390_adapter_int { u64 ind_addr; u64 summary_addr; @@ -472,7 +476,7 @@ struct kvm { #endif long tlbs_dirty; struct list_head devices; - bool manual_dirty_log_protect; + u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; @@ -506,6 +510,11 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) +{ + return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); +} + static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dde33f8e9adf..eb9a522d8662 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1589,4 +1589,7 @@ struct kvm_hyperv_eventfd { #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) +#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) +#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a989b1d5fd41..96fb21ac7224 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -839,7 +839,7 @@ static int kvm_vm_release(struct inode *inode, 
struct file *filp) * Allocation size is twice as large as the actual dirty bitmap size. * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. */ -static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); @@ -1066,8 +1066,11 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - if (kvm_create_dirty_bitmap(&new) < 0) + if (kvm_alloc_dirty_bitmap(&new) < 0) goto out_free; + + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + bitmap_set(new.dirty_bitmap, 0, new.npages); } slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); @@ -3125,14 +3128,15 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: -#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: -#endif return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; #endif +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: + return KVM_DIRTY_LOG_MANUAL_CAPS; +#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; @@ -3158,11 +3162,17 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, { switch (cap->cap) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: - if (cap->flags || (cap->args[0] & ~1)) + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { + u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; + + if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) + allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; + + if (cap->flags || (cap->args[0] & ~allowed_options)) return -EINVAL; kvm->manual_dirty_log_protect = cap->args[0]; return 0; + 
} #endif default: return kvm_vm_ioctl_enable_cap(kvm, cap); -- Gitee From 694627e6c4ce3b20ebeda81e87b020ac4820727a Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 13 Apr 2020 20:20:23 +0800 Subject: [PATCH 12/12] KVM: arm64: Support enabling dirty log gradually in small chunks mainline inclusion from mainline-v5.10 commit: c862626e19efdc26b26481515470b160e8fe52f3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I66COX CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.10.y&id=c862626e19efdc26b26481515470b160e8fe52f3 -------------------------------- There is already support of enabling dirty log gradually in small chunks for x86 in commit 3c9bd4006bfc ("KVM: x86: enable dirty log gradually in small chunks"). This adds support for arm64. x86 still write-protects all huge pages when KVM_DIRTY_LOG_INITIALLY_SET is enabled. However, for arm64, both huge pages and normal pages can be write protected gradually by userspace. Under the Huawei Kunpeng 920 2.6GHz platform, I did some tests on 128G Linux VMs with different page sizes. The memory pressure is 127G in each case. The time taken by memory_global_dirty_log_start in QEMU is listed below: Page Size Before After Optimization 4K 650ms 1.8ms 2M 4ms 1.8ms 1G 2ms 1.8ms Besides the time reduction, the biggest improvement is that we will minimize the performance side effect (because of dissolving huge pages and marking memslots dirty) on guest after enabling dirty log.
Signed-off-by: Keqian Zhu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200413122023.52583-1-zhukeqian1@huawei.com --- Documentation/virtual/kvm/api.txt | 2 +- arch/arm64/include/asm/kvm_host.h | 3 +++ virt/kvm/arm/mmu.c | 12 ++++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c4770f01cea1..f03924c4353a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4658,7 +4658,7 @@ will be initialized to 1 when created. This also improves performance because dirty logging can be enabled gradually in small chunks on the first call to KVM_CLEAR_DIRTY_LOG. KVM_DIRTY_LOG_INITIALLY_SET depends on KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on -x86 for now). +x86 and arm64 for now). KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bf03056e3751..6eeaec218d9f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,6 +53,9 @@ #define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) +#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ + KVM_DIRTY_LOG_INITIALLY_SET) + DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); int __attribute_const__ kvm_target_cpu(void); diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index aec599488847..2c183030f32e 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2304,8 +2304,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * allocated dirty_bitmap[], dirty pages will be be tracked while the * memory slot is write protected. 
*/ - if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) - kvm_mmu_wp_memory_region(kvm, mem->slot); + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + /* + * If we're with initial-all-set, we don't need to write + * protect any pages because they're all reported as dirty. + * Huge pages and normal pages will be write protect gradually. + */ + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { + kvm_mmu_wp_memory_region(kvm, mem->slot); + } + } } int kvm_arch_prepare_memory_region(struct kvm *kvm, -- Gitee