Merge tag 'kvm-x86-dirty_ring-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM Dirty Ring changes for 6.17

Fix issues with dirty ring harvesting where KVM doesn't bound the processing
of entries in any way, which allows userspace to keep KVM in a tight loop
indefinitely.  Clean up code and comments along the way.
Paolo Bonzini, 2025-07-28 11:05:24 -04:00
commit cc5a1021aa

3 changed files with 87 additions and 47 deletions
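
For context, the rings being reset here are the per-vCPU dirty rings that userspace harvests and then hands back to KVM via KVM_RESET_DIRTY_RINGS. The following is a minimal, hypothetical sketch of that userspace side (the function and variable names are illustrative; it assumes the ring was already mmap()ed from the vCPU fd at page offset KVM_DIRTY_LOG_PAGE_OFFSET and that ring_size is a power of two). It is this harvest-then-reset protocol whose kernel half previously had no bound on the number of entries processed per call:

/*
 * Hypothetical userspace harvest/reset loop; illustration only.  Assumes
 * dirty_ring was mmap()ed from the vCPU fd at page offset
 * KVM_DIRTY_LOG_PAGE_OFFSET and that ring_size (entries) is a power of two.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static struct kvm_dirty_gfn *dirty_ring;  /* mmap()ed per-vCPU ring       */
static uint32_t ring_size;                /* number of entries in a ring  */
static uint32_t next_harvest;             /* userspace's private index    */

static int harvest_and_reset(int vm_fd)
{
        struct kvm_dirty_gfn *gfn;
        int harvested = 0;

        for (;;) {
                gfn = &dirty_ring[next_harvest & (ring_size - 1)];

                /* Stop at the first entry KVM hasn't published yet. */
                if (!(__atomic_load_n(&gfn->flags, __ATOMIC_ACQUIRE) &
                      KVM_DIRTY_GFN_F_DIRTY))
                        break;

                /* ... record gfn->slot and gfn->offset for migration ... */

                /* Mark the entry harvested so KVM may reset and reuse it. */
                __atomic_store_n(&gfn->flags, KVM_DIRTY_GFN_F_RESET,
                                 __ATOMIC_RELEASE);
                next_harvest++;
                harvested++;
        }

        /* Ask KVM to reset all harvested entries across all vCPU rings. */
        if (harvested && ioctl(vm_fd, KVM_RESET_DIRTY_RINGS) < 0)
                return -1;

        return harvested;
}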

include/linux/kvm_dirty_ring.h
@@ -49,9 +49,10 @@ static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *r
 }
 
 static inline int kvm_dirty_ring_reset(struct kvm *kvm,
-                                       struct kvm_dirty_ring *ring)
+                                       struct kvm_dirty_ring *ring,
+                                       int *nr_entries_reset)
 {
-        return 0;
+        return -ENOENT;
 }
 
 static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu,
@@ -77,17 +78,8 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm);
 u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm);
 int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring,
                          int index, u32 size);
-
-/*
- * called with kvm->slots_lock held, returns the number of
- * processed pages.
- */
-int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
-
-/*
- * returns =0: successfully pushed
- * <0: unable to push, need to wait
- */
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
+                         int *nr_entries_reset);
 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset);
 
 bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu);

virt/kvm/dirty_ring.c
@@ -55,9 +55,6 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
         struct kvm_memory_slot *memslot;
         int as_id, id;
 
-        if (!mask)
-                return;
-
         as_id = slot >> 16;
         id = (u16)slot;
 
@@ -105,19 +102,38 @@ static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
         return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
 }
 
-int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
+                         int *nr_entries_reset)
 {
+        /*
+         * To minimize mmu_lock contention, batch resets for harvested entries
+         * whose gfns are in the same slot, and are within N frame numbers of
+         * each other, where N is the number of bits in an unsigned long.  For
+         * simplicity, process the current set of entries when the next entry
+         * can't be included in the batch.
+         *
+         * Track the current batch slot, the gfn offset into the slot for the
+         * batch, and the bitmask of gfns that need to be reset (relative to
+         * offset).  Note, the offset may be adjusted backwards, e.g. so that
+         * a sequence of gfns X, X-1, ... X-N-1 can be batched.
+         */
         u32 cur_slot, next_slot;
         u64 cur_offset, next_offset;
-        unsigned long mask;
-        int count = 0;
+        unsigned long mask = 0;
         struct kvm_dirty_gfn *entry;
-        bool first_round = true;
 
-        /* This is only needed to make compilers happy */
-        cur_slot = cur_offset = mask = 0;
-
-        while (true) {
+        /*
+         * Ensure concurrent calls to KVM_RESET_DIRTY_RINGS are serialized,
+         * e.g. so that KVM fully resets all entries processed by a given call
+         * before returning to userspace.  Holding slots_lock also protects
+         * the various memslot accesses.
+         */
+        lockdep_assert_held(&kvm->slots_lock);
+
+        while (likely((*nr_entries_reset) < INT_MAX)) {
+                if (signal_pending(current))
+                        return -EINTR;
+
                 entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
 
                 if (!kvm_dirty_gfn_harvested(entry))
@@ -130,35 +146,64 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
                 kvm_dirty_gfn_set_invalid(entry);
 
                 ring->reset_index++;
-                count++;
-                /*
-                 * Try to coalesce the reset operations when the guest is
-                 * scanning pages in the same slot.
-                 */
-                if (!first_round && next_slot == cur_slot) {
-                        s64 delta = next_offset - cur_offset;
-
-                        if (delta >= 0 && delta < BITS_PER_LONG) {
-                                mask |= 1ull << delta;
-                                continue;
-                        }
-
-                        /* Backwards visit, careful about overflows! */
-                        if (delta > -BITS_PER_LONG && delta < 0 &&
-                            (mask << -delta >> -delta) == mask) {
-                                cur_offset = next_offset;
-                                mask = (mask << -delta) | 1;
-                                continue;
-                        }
-                }
-                kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+                (*nr_entries_reset)++;
+
+                if (mask) {
+                        /*
+                         * While the size of each ring is fixed, it's possible
+                         * for the ring to be constantly re-dirtied/harvested
+                         * while the reset is in-progress (the hard limit exists
+                         * only to guard against the count becoming negative).
+                         */
+                        cond_resched();
+
+                        /*
+                         * Try to coalesce the reset operations when the guest
+                         * is scanning pages in the same slot.
+                         */
+                        if (next_slot == cur_slot) {
+                                s64 delta = next_offset - cur_offset;
+
+                                if (delta >= 0 && delta < BITS_PER_LONG) {
+                                        mask |= 1ull << delta;
+                                        continue;
+                                }
+
+                                /* Backwards visit, careful about overflows! */
+                                if (delta > -BITS_PER_LONG && delta < 0 &&
+                                    (mask << -delta >> -delta) == mask) {
+                                        cur_offset = next_offset;
+                                        mask = (mask << -delta) | 1;
+                                        continue;
+                                }
+                        }
+
+                        /*
+                         * Reset the slot for all the harvested entries that
+                         * have been gathered, but not yet fully processed.
+                         */
+                        kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+                }
+
+                /*
+                 * The current slot was reset or this is the first harvested
+                 * entry, (re)initialize the metadata.
+                 */
                 cur_slot = next_slot;
                 cur_offset = next_offset;
                 mask = 1;
-                first_round = false;
         }
 
-        kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+        /*
+         * Perform a final reset if there are harvested entries that haven't
+         * been processed, which is guaranteed if at least one harvested was
+         * found.  The loop only performs a reset when the "next" entry can't
+         * be batched with the "current" entry(s), and that reset processes the
+         * _current_ entry(s); i.e. the last harvested entry, a.k.a. next, will
+         * always be left pending.
+         */
+        if (mask)
+                kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
 
         /*
          * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared
@@ -167,7 +212,7 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
 
         trace_kvm_dirty_ring_reset(ring);
 
-        return count;
+        return 0;
 }
 
 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset)
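
The batching rules in kvm_dirty_ring_reset() are easier to see in isolation. The following stand-alone, user-space model (not kernel code) mirrors the same coalescing logic; emit_reset() is a hypothetical stand-in for kvm_reset_dirty_gfn(), and the input stream of (slot, offset) pairs is invented purely for illustration. Feeding it offsets 100, 101, 102, 99 in slot 1 yields a single reset at offset 99 with mask 0xf (the backwards visit rebases the offset), and the switch to slot 2 then flushes that batch:

/* Stand-alone model of the gfn-coalescing logic; illustration only. */
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_LONG 64        /* the model fixes the batch width at 64 bits */

/* Hypothetical stand-in for kvm_reset_dirty_gfn(). */
static void emit_reset(uint32_t slot, uint64_t offset, uint64_t mask)
{
        printf("reset slot=%u offset=%llu mask=%#llx\n",
               (unsigned)slot, (unsigned long long)offset,
               (unsigned long long)mask);
}

int main(void)
{
        /* Made-up stream: a forward run, one backwards step, then a new slot. */
        const struct { uint32_t slot; uint64_t offset; } in[] = {
                { 1, 100 }, { 1, 101 }, { 1, 102 }, { 1, 99 }, { 2, 7 },
        };
        uint32_t cur_slot = 0, next_slot;
        uint64_t cur_offset = 0, next_offset;
        uint64_t mask = 0;

        for (unsigned int i = 0; i < sizeof(in) / sizeof(in[0]); i++) {
                next_slot = in[i].slot;
                next_offset = in[i].offset;

                if (mask) {
                        if (next_slot == cur_slot) {
                                int64_t delta = next_offset - cur_offset;

                                /* Forward visit within the 64-gfn window. */
                                if (delta >= 0 && delta < BITS_PER_LONG) {
                                        mask |= 1ull << delta;
                                        continue;
                                }

                                /* Backwards visit: rebase if no bits fall off. */
                                if (delta > -BITS_PER_LONG && delta < 0 &&
                                    (mask << -delta >> -delta) == mask) {
                                        cur_offset = next_offset;
                                        mask = (mask << -delta) | 1;
                                        continue;
                                }
                        }
                        /* Next entry can't be batched; flush the current batch. */
                        emit_reset(cur_slot, cur_offset, mask);
                }

                /* Start a new batch at the current entry. */
                cur_slot = next_slot;
                cur_offset = next_offset;
                mask = 1;
        }

        if (mask)
                emit_reset(cur_slot, cur_offset, mask);
        return 0;
}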

virt/kvm/kvm_main.c
@@ -4967,15 +4967,18 @@ static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
 {
         unsigned long i;
         struct kvm_vcpu *vcpu;
-        int cleared = 0;
+        int cleared = 0, r;
 
         if (!kvm->dirty_ring_size)
                 return -EINVAL;
 
         mutex_lock(&kvm->slots_lock);
 
-        kvm_for_each_vcpu(i, vcpu, kvm)
-                cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
+        kvm_for_each_vcpu(i, vcpu, kvm) {
+                r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared);
+                if (r)
+                        break;
+        }
 
         mutex_unlock(&kvm->slots_lock);