From 530a8ba71b4c3b7fcee323dd997f4bab1be1a6ba Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 14:35:35 -0700
Subject: [PATCH 1/6] KVM: Bound the number of dirty ring entries in a single
 reset at INT_MAX

Cap the number of ring entries that are reset in a single ioctl to
INT_MAX to ensure userspace isn't confused by a wrap into negative
space, and so that, in a truly pathological scenario, KVM doesn't miss
a TLB flush due to the count wrapping to zero.  While the size of the
ring is fixed at 0x10000 entries and KVM (currently) supports at most
4096 vCPUs, userspace is allowed to harvest entries from the ring while
the reset is in-progress, i.e. it's possible for the ring to always
have harvested entries.

Opportunistically return an actual error code from the helper so that
a future fix to handle pending signals can gracefully return -EINTR.
Drop the function comment now that the return code is a standard
0/-errno (and because a future commit will add a proper lockdep
assertion).

Opportunistically drop a similarly stale comment for
kvm_dirty_ring_push().

Cc: Peter Xu
Cc: Yan Zhao
Cc: Maxim Levitsky
Cc: Binbin Wu
Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking")
Reviewed-by: James Houghton
Reviewed-by: Binbin Wu
Reviewed-by: Yan Zhao
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20250516213540.2546077-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_dirty_ring.h | 18 +++++-------------
 virt/kvm/dirty_ring.c          | 10 +++++-----
 virt/kvm/kvm_main.c            |  9 ++++++---
 3 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
index da4d9b5f58f1..eb10d87adf7d 100644
--- a/include/linux/kvm_dirty_ring.h
+++ b/include/linux/kvm_dirty_ring.h
@@ -49,9 +49,10 @@ static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *r
 }
 
 static inline int kvm_dirty_ring_reset(struct kvm *kvm,
-                                       struct kvm_dirty_ring *ring)
+                                       struct kvm_dirty_ring *ring,
+                                       int *nr_entries_reset)
 {
-        return 0;
+        return -ENOENT;
 }
 
 static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu,
@@ -77,17 +78,8 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm);
 u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm);
 int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring,
                          int index, u32 size);
-
-/*
- * called with kvm->slots_lock held, returns the number of
- * processed pages.
- */
-int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
-
-/*
- * returns =0: successfully pushed
- *         <0: unable to push, need to wait
- */
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
+                         int *nr_entries_reset);
 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset);
 bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu);
 
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index d14ffc7513ee..77986f34eff8 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -105,19 +105,19 @@ static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
         return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
 }
 
-int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
+                         int *nr_entries_reset)
 {
         u32 cur_slot, next_slot;
         u64 cur_offset, next_offset;
         unsigned long mask;
-        int count = 0;
         struct kvm_dirty_gfn *entry;
         bool first_round = true;
 
         /* This is only needed to make compilers happy */
         cur_slot = cur_offset = mask = 0;
 
-        while (true) {
+        while (likely((*nr_entries_reset) < INT_MAX)) {
                 entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
 
                 if (!kvm_dirty_gfn_harvested(entry))
@@ -130,7 +130,7 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
         kvm_dirty_gfn_set_invalid(entry);
 
         ring->reset_index++;
-        count++;
+        (*nr_entries_reset)++;
         /*
          * Try to coalesce the reset operations when the guest is
          * scanning pages in the same slot.
@@ -167,7 +167,7 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
 
         trace_kvm_dirty_ring_reset(ring);
 
-        return count;
+        return 0;
 }
 
 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eec82775c5bf..c784b6708708 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4962,15 +4962,18 @@ static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
 {
         unsigned long i;
         struct kvm_vcpu *vcpu;
-        int cleared = 0;
+        int cleared = 0, r;
 
         if (!kvm->dirty_ring_size)
                 return -EINVAL;
 
         mutex_lock(&kvm->slots_lock);
 
-        kvm_for_each_vcpu(i, vcpu, kvm)
-                cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
+        kvm_for_each_vcpu(i, vcpu, kvm) {
+                r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared);
+                if (r)
+                        break;
+        }
 
         mutex_unlock(&kvm->slots_lock);
 
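
For context, the userspace half of the contract this patch hardens looks
roughly like the sketch below.  It is illustrative only, not code from the
series: harvest_and_reset(), "ring", "ring_size" and "fetch_index" are
hypothetical VMM bookkeeping, while struct kvm_dirty_gfn, the
KVM_DIRTY_GFN_F_* flags and KVM_RESET_DIRTY_RINGS are the standard KVM
UAPI from <linux/kvm.h>.

  #include <linux/kvm.h>
  #include <stdint.h>
  #include <sys/ioctl.h>

  static int harvest_and_reset(int vm_fd, struct kvm_dirty_gfn *ring,
                               uint32_t ring_size, uint32_t *fetch_index)
  {
          for (;;) {
                  struct kvm_dirty_gfn *e = &ring[*fetch_index & (ring_size - 1)];

                  /* Acquire pairs with KVM's release when publishing an entry. */
                  if (!(__atomic_load_n(&e->flags, __ATOMIC_ACQUIRE) &
                        KVM_DIRTY_GFN_F_DIRTY))
                          break;

                  /* ... record e->slot and e->offset for migration ... */

                  /* Mark the entry harvested so that KVM may reset it. */
                  __atomic_store_n(&e->flags, KVM_DIRTY_GFN_F_RESET,
                                   __ATOMIC_RELEASE);
                  (*fetch_index)++;
          }

          /* Resets the rings of all vCPUs; returns the number of entries reset. */
          return ioctl(vm_fd, KVM_RESET_DIRTY_RINGS);
  }

Because userspace may run the harvest loop concurrently with the ioctl,
KVM can keep finding freshly harvested entries, which is why the reset
needs an explicit bound.
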
From 49005a2a3d2a2974db486e330dfb7c8bc85df17f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 14:35:36 -0700
Subject: [PATCH 2/6] KVM: Bail from the dirty ring reset flow if a signal is
 pending

Abort a dirty ring reset if the current task has a pending signal, as
the hard limit of INT_MAX entries doesn't ensure KVM will respond to a
signal in a timely fashion.

Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking")
Reviewed-by: James Houghton
Reviewed-by: Binbin Wu
Reviewed-by: Yan Zhao
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20250516213540.2546077-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 virt/kvm/dirty_ring.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 77986f34eff8..e844e869e8c7 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -118,6 +118,9 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
         cur_slot = cur_offset = mask = 0;
 
         while (likely((*nr_entries_reset) < INT_MAX)) {
+                if (signal_pending(current))
+                        return -EINTR;
+
                 entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
 
                 if (!kvm_dirty_gfn_harvested(entry))
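
From userspace's point of view, the error is absorbed by
kvm_vm_ioctl_reset_dirty_pages(), which stops iterating over vCPUs but
still returns the partial count of reset entries, i.e. an interrupted
reset simply looks like a short reset.  A hypothetical VMM helper
(assuming the sketch from patch 1, plus <errno.h>) can therefore just
re-invoke the ioctl until it stops making progress:

  static int reset_until_drained(int vm_fd)
  {
          int total = 0, r;

          do {
                  r = ioctl(vm_fd, KVM_RESET_DIRTY_RINGS);
                  if (r < 0)
                          return -errno;
                  total += r;
          } while (r > 0);

          return total;
  }
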
Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking")
Reviewed-by: James Houghton
Reviewed-by: Binbin Wu
Reviewed-by: Yan Zhao
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20250516213540.2546077-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>

From 1333c35c4eea6533874564899371501ee80b9583 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 14:35:37 -0700
Subject: [PATCH 3/6] KVM: Conditionally reschedule when resetting the dirty
 ring

When resetting a dirty ring, conditionally reschedule on each iteration
after the first.  The recently introduced hard limit mitigates the
issue of an endless reset, but isn't sufficient to completely prevent
RCU stalls, soft lockups, etc., nor is the hard limit intended to guard
against such badness.

Note!  Take care to check for reschedule even in the "continue" paths,
as a pathological scenario (or malicious userspace) could dirty the
same gfn over and over, i.e. always hit the continue path.

  rcu: INFO: rcu_sched self-detected stall on CPU
  rcu:    4-....: (5249 ticks this GP) idle=51e4/1/0x4000000000000000 softirq=309/309 fqs=2563
  rcu:    (t=5250 jiffies g=-319 q=608 ncpus=24)
  CPU: 4 UID: 1000 PID: 1067 Comm: dirty_log_test Tainted: G    L    6.13.0-rc3-17fa7a24ea1e-HEAD-vm #814
  Tainted: [L]=SOFTLOCKUP
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:kvm_arch_mmu_enable_log_dirty_pt_masked+0x26/0x200 [kvm]
  Call Trace:
   kvm_reset_dirty_gfn.part.0+0xb4/0xe0 [kvm]
   kvm_dirty_ring_reset+0x58/0x220 [kvm]
   kvm_vm_ioctl+0x10eb/0x15d0 [kvm]
   __x64_sys_ioctl+0x8b/0xb0
   do_syscall_64+0x5b/0x160
   entry_SYSCALL_64_after_hwframe+0x4b/0x53

  Tainted: [L]=SOFTLOCKUP
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:kvm_arch_mmu_enable_log_dirty_pt_masked+0x17/0x200 [kvm]
  Call Trace:
   kvm_reset_dirty_gfn.part.0+0xb4/0xe0 [kvm]
   kvm_dirty_ring_reset+0x58/0x220 [kvm]
   kvm_vm_ioctl+0x10eb/0x15d0 [kvm]
   __x64_sys_ioctl+0x8b/0xb0
   do_syscall_64+0x5b/0x160
   entry_SYSCALL_64_after_hwframe+0x4b/0x53

Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking")
Reviewed-by: James Houghton
Reviewed-by: Yan Zhao
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20250516213540.2546077-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 virt/kvm/dirty_ring.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index e844e869e8c7..97cca0c02fd1 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -134,6 +134,16 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
 
         ring->reset_index++;
         (*nr_entries_reset)++;
+
+        /*
+         * While the size of each ring is fixed, it's possible for the
+         * ring to be constantly re-dirtied/harvested while the reset
+         * is in-progress (the hard limit exists only to guard against
+         * wrapping the count into negative space).
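+         */
+        if (!first_round)
+                cond_resched();
+
         /*
          * Try to coalesce the reset operations when the guest is
          * scanning pages in the same slot.

Distilled, the reset loop now has three cooperating guards.  The sketch
below (fetch_next(), try_coalesce() and process_batch() are hypothetical
helpers, not KVM code) shows the shape, and in particular why the
reschedule point must sit above the "continue" paths:

  while (likely(work_done < INT_MAX)) {           /* hard bound */
          if (signal_pending(current))
                  return -EINTR;                  /* bail on signals */

          if (!fetch_next(&item))                 /* hypothetical */
                  break;
          work_done++;

          if (!first_round)
                  cond_resched();                 /* reached even when coalescing */

          if (try_coalesce(item))                 /* hypothetical */
                  continue;

          process_batch();                        /* hypothetical */
  }
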
From ee188dea1677a0d9b3a8097e71b803896b8aaed7 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 14:35:38 -0700
Subject: [PATCH 4/6] KVM: Check for empty mask of harvested dirty ring
 entries in caller

When resetting a dirty ring, explicitly check that there is work to be
done before calling kvm_reset_dirty_gfn(), e.g. if no harvested entries
are found and/or on the loop's first iteration, and delete the
extremely misleading comment "This is only needed to make compilers
happy".  KVM absolutely relies on mask to be zero-initialized, i.e. the
comment is an outright lie.  Furthermore, the compiler is right to
complain that KVM is calling a function with uninitialized data, as
there are no guarantees the implementation details of
kvm_reset_dirty_gfn() will be visible to kvm_dirty_ring_reset().

While the flaw could be fixed by simply deleting (or rewording) the
comment, and duplicating the check is unfortunate, checking mask in the
caller will allow for additional cleanups.

Opportunistically drop the zero-initialization of cur_slot and
cur_offset.  If a bug were introduced where either the slot or offset
was consumed before mask is set to a non-zero value, then it is highly
desirable for the compiler (or some other sanitizer) to yell.

Cc: Peter Xu
Cc: Yan Zhao
Cc: Maxim Levitsky
Cc: Binbin Wu
Reviewed-by: Pankaj Gupta
Reviewed-by: James Houghton
Reviewed-by: Binbin Wu
Reviewed-by: Yan Zhao
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20250516213540.2546077-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 virt/kvm/dirty_ring.c | 44 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 97cca0c02fd1..939198ac6695 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -55,9 +55,6 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
         struct kvm_memory_slot *memslot;
         int as_id, id;
 
-        if (!mask)
-                return;
-
         as_id = slot >> 16;
         id = (u16)slot;
 
@@ -108,15 +105,24 @@ static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
 int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
                          int *nr_entries_reset)
 {
+        /*
+         * To minimize mmu_lock contention, batch resets for harvested entries
+         * whose gfns are in the same slot, and are within N frame numbers of
+         * each other, where N is the number of bits in an unsigned long.  For
+         * simplicity, process the current set of entries when the next entry
+         * can't be included in the batch.
+         *
+         * Track the current batch slot, the gfn offset into the slot for the
+         * batch, and the bitmask of gfns that need to be reset (relative to
+         * offset).  Note, the offset may be adjusted backwards, e.g. so that
+         * a sequence of gfns X, X-1, ... X-(N-1) can be batched.
+         */
         u32 cur_slot, next_slot;
         u64 cur_offset, next_offset;
-        unsigned long mask;
+        unsigned long mask = 0;
         struct kvm_dirty_gfn *entry;
         bool first_round = true;
 
-        /* This is only needed to make compilers happy */
-        cur_slot = cur_offset = mask = 0;
-
         while (likely((*nr_entries_reset) < INT_MAX)) {
                 if (signal_pending(current))
                         return -EINTR;
@@ -164,14 +170,34 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
                         continue;
                 }
         }
-        kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+        /*
+         * Reset the slot for all the harvested entries that have been
+         * gathered, but not yet fully processed.
+         */
+        if (mask)
+                kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+        /*
+         * The current slot was reset or this is the first harvested
+         * entry, (re)initialize the metadata.
+         */
         cur_slot = next_slot;
         cur_offset = next_offset;
         mask = 1;
         first_round = false;
 }

-        kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+        /*
+         * Perform a final reset if there are harvested entries that haven't
+         * been processed, which is guaranteed if at least one harvested
+         * entry was found.  The loop only performs a reset when the "next"
+         * entry can't be batched with the "current" entry(s), and that reset
+         * processes the _current_ entry(s); i.e. the last harvested entry,
+         * a.k.a. next, will always be left pending.
+         */
+        if (mask)
+                kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
 
         /*
          * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared
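
The batching scheme described by the new comment can be demonstrated in
isolation.  The following is a standalone userspace illustration, not
kernel code, showing how (cur_offset, mask) absorb a forward visit, a
backwards visit, and a small gap within BITS_PER_LONG:

  #include <stdint.h>
  #include <stdio.h>

  #define BITS_PER_LONG 64

  int main(void)
  {
          const uint64_t offsets[] = { 100, 101, 99, 103 };
          uint64_t cur_offset = offsets[0];
          unsigned long mask = 1;

          for (int i = 1; i < 4; i++) {
                  int64_t delta = (int64_t)(offsets[i] - cur_offset);

                  if (delta >= 0 && delta < BITS_PER_LONG) {
                          mask |= 1ull << delta;          /* forward visit */
                  } else if (delta > -BITS_PER_LONG && delta < 0 &&
                             (mask << -delta >> -delta) == mask) {
                          cur_offset = offsets[i];        /* backwards visit */
                          mask = (mask << -delta) | 1;
                  }
          }

          /* Expect offset=99, mask=0x17: bits 0, 1, 2 and 4 => gfns 99-101, 103. */
          printf("offset=%llu mask=%#lx\n",
                 (unsigned long long)cur_offset, mask);
          return 0;
  }

The backwards-visit guard (mask << -delta >> -delta) == mask is the
overflow check called out in the code: sliding the offset down is legal
only if no already-batched gfn would be shifted out of the mask.
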
From e46ad851150f1dd14b8542b6fb7a51f695a99eb1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 14:35:39 -0700
Subject: [PATCH 5/6] KVM: Use mask of harvested dirty ring entries to
 coalesce dirty ring resets

Use "mask" instead of a dedicated boolean to track whether or not there
is at least one to-be-reset entry for the current slot+offset.  In the
body of the loop, mask is zero only on the first iteration, i.e. !mask
is equivalent to first_round.

Opportunistically combine the adjacent "if (mask)" statements into a
single if-statement.

No functional change intended.

Cc: Peter Xu
Cc: Yan Zhao
Cc: Maxim Levitsky
Reviewed-by: Pankaj Gupta
Reviewed-by: James Houghton
Reviewed-by: Binbin Wu
Reviewed-by: Yan Zhao
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20250516213540.2546077-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 virt/kvm/dirty_ring.c | 62 +++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 939198ac6695..4caa63e610d2 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -121,7 +121,6 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
         u64 cur_offset, next_offset;
         unsigned long mask = 0;
         struct kvm_dirty_gfn *entry;
-        bool first_round = true;
 
         while (likely((*nr_entries_reset) < INT_MAX)) {
                 if (signal_pending(current))
@@ -141,42 +140,42 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
         ring->reset_index++;
         (*nr_entries_reset)++;
 
-        /*
-         * While the size of each ring is fixed, it's possible for the
-         * ring to be constantly re-dirtied/harvested while the reset
-         * is in-progress (the hard limit exists only to guard against
-         * wrapping the count into negative space).
-         */
-        if (!first_round)
+        if (mask) {
+                /*
+                 * While the size of each ring is fixed, it's possible
+                 * for the ring to be constantly re-dirtied/harvested
+                 * while the reset is in-progress (the hard limit exists
+                 * only to guard against the count becoming negative).
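+                 */
                 cond_resched();
 
-        /*
-         * Try to coalesce the reset operations when the guest is
-         * scanning pages in the same slot.
-         */
-        if (!first_round && next_slot == cur_slot) {
-                s64 delta = next_offset - cur_offset;
+                /*
+                 * Try to coalesce the reset operations when the guest
+                 * is scanning pages in the same slot.
+                 */
+                if (next_slot == cur_slot) {
+                        s64 delta = next_offset - cur_offset;
 
-                if (delta >= 0 && delta < BITS_PER_LONG) {
-                        mask |= 1ull << delta;
-                        continue;
+                        if (delta >= 0 && delta < BITS_PER_LONG) {
+                                mask |= 1ull << delta;
+                                continue;
+                        }
+
+                        /* Backwards visit, careful about overflows! */
+                        if (delta > -BITS_PER_LONG && delta < 0 &&
+                            (mask << -delta >> -delta) == mask) {
+                                cur_offset = next_offset;
+                                mask = (mask << -delta) | 1;
+                                continue;
+                        }
                 }
 
-                /* Backwards visit, careful about overflows! */
-                if (delta > -BITS_PER_LONG && delta < 0 &&
-                    (mask << -delta >> -delta) == mask) {
-                        cur_offset = next_offset;
-                        mask = (mask << -delta) | 1;
-                        continue;
-                }
-        }
-
-        /*
-         * Reset the slot for all the harvested entries that have been
-         * gathered, but not yet fully processed.
-         */
-        if (mask)
+                /*
+                 * Reset the slot for all the harvested entries that
+                 * have been gathered, but not yet fully processed.
+                 */
                 kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+        }
 
         /*
          * The current slot was reset or this is the first harvested
@@ -185,7 +184,6 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
         cur_slot = next_slot;
         cur_offset = next_offset;
         mask = 1;
-        first_round = false;
 }

The invariant being exploited, as a sketch (all helpers hypothetical,
not KVM code): mask is zero only until the first harvested entry is
processed, because every trip through the tail of the loop re-primes
the batch with mask = 1:

  unsigned long mask = 0;                 /* !mask <=> first_round */

  while (next_harvested(&entry)) {        /* hypothetical */
          if (mask) {                     /* not the first entry */
                  cond_resched();
                  if (try_coalesce(entry, &mask))
                          continue;
                  kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
          }

          start_new_batch(entry);         /* hypothetical */
          mask = 1;                       /* never left zero again */
  }
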
From 614fb9d1479b1d90721ca70da8b7c55f69fe9ad2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 14:35:40 -0700
Subject: [PATCH 6/6] KVM: Assert that slots_lock is held when resetting
 per-vCPU dirty rings

Assert that slots_lock is held in kvm_dirty_ring_reset() and add a
comment to explain _why_ slots_lock needs to be held for the duration
of the reset.

Link: https://lore.kernel.org/all/aCSns6Q5oTkdXUEe@google.com
Suggested-by: James Houghton
Reviewed-by: Yan Zhao
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20250516213540.2546077-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 virt/kvm/dirty_ring.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 4caa63e610d2..02bc6b00d76c 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -122,6 +122,14 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
         unsigned long mask = 0;
         struct kvm_dirty_gfn *entry;
 
+        /*
+         * Ensure concurrent calls to KVM_RESET_DIRTY_RINGS are serialized,
+         * e.g. so that KVM fully resets all entries processed by a given call
+         * before returning to userspace.  Holding slots_lock also protects
+         * the various memslot accesses.
+         */
+        lockdep_assert_held(&kvm->slots_lock);
+
         while (likely((*nr_entries_reset) < INT_MAX)) {
                 if (signal_pending(current))
                         return -EINTR;
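
The assertion follows the usual lockdep idiom: document the locking rule
at the point where it is relied upon, and let lockdep enforce it.  A
generic sketch (example_requires_slots_lock() is hypothetical):

  static void example_requires_slots_lock(struct kvm *kvm)
  {
          /* Splats immediately if called without slots_lock (lockdep enabled). */
          lockdep_assert_held(&kvm->slots_lock);

          /* ... memslot accesses that depend on slots_lock ... */
  }

With lockdep disabled the assertion compiles away, so it costs nothing
in production builds.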